require "rnahybrid_module.pm"; require "rnafold_module.pm"; require "iupac_module.pm"; use List::MoreUtils qw(first_index); sub run_struct_calc_dir { my ($dir, $mfe_cutoff, $h_constr, $hcs, $hce, $tt_skip, $tail_cut, $session) = @_; my $lookup = "$dir/" . "rh_lookup.csv"; open(CSV, $lookup); my @csv_rows = ; close(CSV); open(CSVR, ">>$dir/genome_tracr_summary.csv"); print CSVR "ORGANISM,CONTIG_ID,RR_CONTIG,RR_ID,CD_SCORE,REPEAT_SEQ,RR_START,RR_END,RR_ORI,CAS9_INFO," . "BLAST_HSP_START,BLAST_HSP_END,BLAST_EVALUE,RC_OF_HSP_EXTRACT,". "HSP_EXTRACT_ID,RH_MFE,HYBRID_SEQS,HYBRID_STRUCT,TAIL_SEQ,TAIL_STRUCT,REPEAT_TRACR,RH_RF_STRUCT,RCF_STRUCT," . "TRACR_RNA_SEQ,TRACR_START,TRACR_END,TRACR_ORI\n"; my $count = 0; for (my $i = 0; $i < scalar(@csv_rows); $i++) { my $lookup_line = @csv_rows[$i]; $lookup_line =~ s/\n//g; my @lookup_toks = split(/\,/, $lookup_line); my $num_lookup_toks = scalar(@lookup_toks); my $tpath = @lookup_toks[$num_lookup_toks - 1]; my $qpath = @lookup_toks[$num_lookup_toks - 2]; my $rc = @lookup_toks[$num_lookup_toks - 3]; my ($ext_start) = ($tpath =~ /.*_GE_(\d+)_\d+_.*/); my $l_tpath = $tpath; $tpath =~ s/\.fa$/_loc\.fa/g; my @rna_struct_out = run_struct_calc($dir, $qpath, $tpath, $ext_start, $rc, $mfe_cutoff, $h_constr, $hcs, $hce, $l_tpath, $tt_skip, $tail_cut); my $str = ''; for (my $j = 0; $j < scalar(@lookup_toks) - 2; $j++) { $str .= @lookup_toks[$j]; $str .= ","; } for (my $j = 0; $j < scalar(@rna_struct_out); $j++) { my $rsline = @rna_struct_out[$j]; my @r_parts = split(/\,/, $rsline); if (@r_parts[5] !~ /[\(\)]/) { next; } print CSVR $str . $rsline . "\n"; #my $contig = @lookup_toks[1]; #my $start = @r_parts[10]; #my $end = @r_parts[11]; #my $contig_fa = "$dir/" . $contig . ".fa"; #open (CTG, $contig_fa); #my @ctg_contents = ; #close(CTG); #my $seq = @ctg_contents[1]; #$seq =~ s/\n//g; #if ($end < $start) { # my $temp = $start; # $start = $end; # $end = $temp; #} #my $ctg_start = $start - 200; #if ($ctg_start < 0) { # $ctg_start = 0; #} #my $ctg_end = $end + 200; #if ($ctg_end > (length($seq) - 1)) { # $ctg_end = length($seq) - 1; #} #$count++; #my $id = "TRACR_" . $count . "_" . $start . "_" . $end; #my $seq_ext = substr($seq, $ctg_start, $ctg_end - $ctg_start + 1); #open(FAFLNK, ">>$dir/". $id . "_flank.fa"); #print FAFLNK ">" . $id . "_flank\n"; #print FAFLNK $seq_ext . "\n"; #close(FAFLNK); #my $flag = crispr_call_rerun($dir, $id, $session); } } close(CSVR); return 1; } sub run_struct_calc { my ($dir, $query, $target, $ext_start, $rc, $mfe_cutoff, $h_constr, $hcs, $hce, $long_target, $tt_skip, $tail_cut) = @_; open(seqs, $target) or die "can't open txt file"; open(reps, $query) or die "can't open txt file"; open(lseqs, $long_target) or die "can't open txt file"; my @seq_contents = ; my $seq_h = @seq_contents[0]; my $seq = @seq_contents[1]; $seq =~ s/\n//g; my @l_seq_contents = ; my $l_seq_h = @l_seq_contents[0]; my $l_seq = @l_seq_contents[1]; $l_seq =~ s/\n//g; if ($rc > 0) { $seq =~ tr/ATCG/TAGC/; $seq = reverse($seq); $l_seq =~ tr/ATCG/TAGC/; $l_seq = reverse($l_seq); } my @seq_contents_rep = ; my $rep_h = @seq_contents_rep[0]; my $rep = @seq_contents_rep[1]; my $tpath = "$dir/" . "temp_target.fa"; my $qpath = "$dir/" . "temp_query.fa"; open(out1, ">>$tpath") or die "can't open fa file"; print out1 @seq_contents[0]; print out1 $seq . "\n"; open(out2, ">>$qpath") or die "can't open fa file"; print out2 @seq_contents_rep[0]; print out2 $rep; my $mq = length($rep) + 5; my $mt = length($seq) + 5; my $outpath = "$dir/" . "temp_out.txt"; my $cmd = "RNAhybrid -t $tpath -q $qpath -m $mt -n $mq -d xi,theta -b 1 -c"; $cmd .= " -e $mfe_cutoff"; if ($h_constr == 1) { $cmd .= " -f $hcs,$hce"; } $cmd .= " > $outpath"; my $flag = system($cmd); my @out_table = parse_rnahybrid_output($outpath); my @outdata = (); for ($i = 0; $i < scalar(@out_table); $i++) { my $hybrid = $out_table[$i][7]; $hybrid =~ s/N//g; $out_table[$i][5] = get_exact_start($l_seq, $hybrid, $out_table[$i][5] + index($l_seq, $seq)); my $tail_seq = get_sl_seq($l_seq, $hybrid, $out_table[$i][5] + index($l_seq, $seq), $tt_skip, $tail_cut); if (length($tail_seq) >= 2) { open(out3, ">>$dir/seqin.fa") or die "can't create fa file"; open(out4, ">>$dir/rcfin.fa") or die "can't create fa file"; print out3 @seq_contents[$ind_header]; print out3 $tail_seq; print out4 $out_table[$i][6] . "&" . $hybrid . $tail_seq; my $cmd2 = "RNAfold -p -d2 --noLP -P andronescu_2007.par < $dir/seqin.fa > ". "$dir/" . "temp_rf_out.txt"; my $flag2 = system($cmd2); my $cmd3 = "RNAcofold -a -d2 --noLP -P andronescu_2007.par < $dir/rcfin.fa > ". "$dir/" . "temp_rcf_out.txt"; my $flag3 = system($cmd3); my @rf_out = parse_rnafold_output("$dir/" . "temp_rf_out.txt"); my @rcf_out = parse_rnacofold_output("$dir/" . "temp_rcf_out.txt"); my $outstr = $out_table[$i][0] . "," . $out_table[$i][3] . "," . $out_table[$i][6] . "&" . $hybrid . "," . $out_table[$i][8] . "&" . $out_table[$i][9] . "," . $tail_seq . "," . @rf_out[0] . ",". $out_table[$i][6] . "&" . $hybrid . $tail_seq . "," . $out_table[$i][8] . "&" . $out_table[$i][9] . @rf_out[0] . "," . @rcf_out[0] . "," . $hybrid . $tail_seq; my $true_pos = $ext_start + $out_table[$i][5] + 1; if ($rc > 0) { #$true_pos = $ext_start + ($out_table[$i][1] - $out_table[$i][5]) + 1; $true_pos = $ext_start + (length($l_seq) - $out_table[$i][5]) + 1; } my $hyb_len = length($hybrid); my $tail_len = length($tail_seq); my $tracr_start = $true_pos; my $tracr_end = $true_pos + $hyb_len + $tail_len; if ($rc > 0) { $tracr_start = $true_pos - $hyb_len - $tail_len; $tracr_end = $true_pos; } $outstr .= ","; $outstr .= "$tracr_start"; $outstr .= ","; $outstr .= "$tracr_end"; $outstr .= ","; $outstr .= "$rc"; print "RNAhybrid finds an anti-repeat: " . $out_table[$i][6] . "&" . $hybrid . " MFE: " . $out_table[$i][3] . "\n"; push(@outdata, $outstr); unlink("$dir/seqin.fa"); unlink("$dir/rcfin.fa"); unlink("$dir/" . "temp_rf_out.txt"); unlink("$dir/" . "temp_rcf_out.txt"); my @dir = glob("*.ps"); for ($j = 0; $j < scalar(@dir); $j++) { if (-e @dir[$j]) { unlink(@dir[$j]); } } close(out3); close(out4); } else { push(@outdata, ",,,,,,,,,,,,"); } } unlink($outpath); unlink($tpath); unlink($qpath); close(seqs); close(reps); close(out1); close(out2); return @outdata; } sub patscan_tail { my($dir, $hybrid, $tail, $tail_pat_file) = @_; my $hyb_last = substr($hybrid, length($hybrid) - 1, 1); my $seq_test = $hyb_last . $tail; #my $seq_test = $tail; open(SFM_IN, ">>$dir/seq_sfm.fa") or die "can't create FA file"; print SFM_IN '>test' . "\n"; print SFM_IN $seq_test; close(SFM_IN); my $cmd = "scan_for_matches $tail_pat_file < $dir/seq_sfm.fa > $dir/sfm_out.txt"; my $flag = system($cmd); open(SFM_OUT, "$dir/sfm_out.txt") or die "can't open TXT file"; my @sfm_content = ; close(SFM_OUT); my $rfm_out_line = ''; my $rfm_first_line = ''; for (my $i = 0; $i < scalar(@sfm_content); $i++) { my $line = @sfm_content[$i]; if ($i == 0) { $rfm_first_line = $line; } if ($line =~ /\:\[\d+\,\d+\]/) { if ($i + 1 < scalar(@sfm_content)) { my $next_line = @sfm_content[$i + 1]; $next_line =~ s/^\s+//; $rfm_out_line = $next_line; last; } else { next; } } else { next; } } unlink("$dir/sfm_out.txt"); unlink("$dir/seq_sfm.fa"); return $rfm_out_line; } sub get_sfm_pat { my ($path) = @_; open(PAT, $path) or die "can't open TXT file"; my @pat_content = ; close(PAT); my $result = ''; for (my $i = 0; $i < scalar(@pat_content); $i++) { my $line = @pat_content[$i]; $line =~ s/\n//g; $result .= ($line . ";"); } return $result; } sub search_tail_pattern { my ($dir, $tail_pat_file) = @_; open(RAWDAT, "$dir/genome_tracr_summary_classified.csv"); my @contents = ; close(RAWDAT); open(DAT, "lookup_pat.dat"); my @pat_contents =; close(DAT); open(GFFR, ">>$dir/genome_tracr_summary_classified_2.csv"); for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; if ($i == 0) { $line .= ","; for (my $j = 0; $j < scalar(@pat_contents); $j++) { $line .= "TAIL_SIG_$j,PAT_" . $j . "_MATCHED," } $line .= "TAIL_SIG_USER,PAT_USER_MATCHED,KPAT_FOUND\n"; print GFFR $line; next; } my @row_toks = split(/\,/, $line); my $hybrid = @row_toks[16]; my $tail = @row_toks[18]; my $count = 0; $line .= ","; for (my $j = 0; $j < scalar(@pat_contents); $j++) { my $datline = @pat_contents[$j]; $datline =~ s/\n//g; my @ptoks = split('#', $datline); my $pp = patscan_tail($dir, $hybrid, $tail, @ptoks[1]); my $bb = 0; if (length($pp) > 0) { $bb = 1; $count++; } else { $pp = ""; } $line .= "$bb,$pp,"; } my $kpatb = 0; if ($count > 0) { $kpatb = 1; } my $pat = patscan_tail($dir, $hybrid, $tail, $tail_pat_file); $pat =~ s/\n//g; my $b = 0; if (length($pat) > 0) { $b = 1; } if ($b == 0) { $pat = ""; } $line .= "$b,$pat,$kpatb"; $line =~ s/\n//g; print GFFR $line . "\n"; } close(GFFR); unlink("$dir/genome_tracr_summary_classified.csv"); my $flag = system("mv $dir/genome_tracr_summary_classified_2.csv $dir/genome_tracr_summary_classified.csv"); return 1; } sub parse_tail_fold { my ($dir) = @_; open(RAWDAT, "$dir/genome_tracr_summary.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_summary_2.csv"); for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; my @toks = split(/\,/, $line); if ($i == 0) { $line .= ",TAIL_FOLD_SUMMARY,NUM_H,MIN_H,MAX_H,FIRST_H," . "NUM_C,MIN_C,MAX_C,FIRST_C,NUM_L,MIN_L,MAX_L,FIRST_L," . "TETRA_T_END,XRAY_FOLD\n"; print GFFR $line; next; } my $tail = @toks[19]; my $tail_seq = @toks[18]; my $tail_fold_str = parse_dbstr($tail); my $b_tetra = 0; my @tail_fold_str_toks = split(/\,/, $tail_fold_str); my $nh = @tail_fold_str_toks[1]; my $nc = @tail_fold_str_toks[5]; my $minh = @tail_fold_str_toks[2]; my $maxh = @tail_fold_str_toks[3]; $tail_seq =~ s/\n//g; if ($tail_seq =~ /[ATCGN]+TTTT$/g) { $tail_fold_str .= ",1"; $b_tetra = 1; } else { $tail_fold_str .= ",0"; } if (($minh >= 8) && ($maxh <= 40) && ($nh == 2 || $nh == 3) && ($nc == 0) && ($b_tetra > 0)) { $tail_fold_str .= ",1\n"; } else { $tail_fold_str .= ",0\n"; } $line .= ","; $line .= $tail_fold_str; print GFFR $line; } unlink("$dir/genome_tracr_summary.csv"); my $flag = system("mv $dir/genome_tracr_summary_2.csv $dir/genome_tracr_summary.csv"); return 1; } sub generate_tracr_gff { my ($dir, $p_cutoff) = @_; open(RAWDAT, "$dir/genome_tracr_summary_classified.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_gff.gff"); my @gfflines = (); my @pps = (); for(my $i = 0; $i < scalar(@contents); $i++) { if ($i == 0) { next; } my $line = @contents[$i]; $line =~ s/\n//g; my @row_toks = split(/\,/, $line); my $seqname = @row_toks[1]; my $source = "TracrApp"; my $feature = "transcript"; my $start = @row_toks[24]; my $end = @row_toks[25]; my $score = @row_toks[54] * 100.0; my $strand = "+"; if (@row_toks[26] == 1) { $strand = "-"; } my $gene_gff_path = "$dir/" . $seqname . "_gene.gff"; my $cmd = "grep $pid $gene_gff_path"; my $frame = "."; my $id = "TRACR_" . $i . "_" . $start . "_" . $end; my $name = "TRACR_RNA_" . $i; my $note = "CDREF " . @row_toks[3] . " HYBRID " . @row_toks[16] . " MFE " . @row_toks[15]; my $attr = "ID=" . $id . ";Name=" . $name . ";Note=" . $note; my $gff_line = "$seqname\t$source\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$attr"; push(@gfflines, $gff_line); push(@pps, @row_toks[54]); my $line_next = "####,####\n"; if ($i < scalar(@contents) - 1) { $line_next = @contents[$i + 1]; } my @row_toks_next = split(/\,/, $line_next); my $seqname_next = @row_toks_next[1]; if ($seqname_next ne $seqname) { my $cmd = 'grep source ' . $dir . '/' . $seqname . '.gff'; my @grep_result = `$cmd`; my $line0 = @grep_result[0]; $line0 =~ s/\n//g; my $org_name = 'Unknown species'; if ($line0 =~ /source.*\;organism=.*/) { my ($org_name_) = ($line0 =~ /source.*\;organism=(.*)\s*/); $org_name = $org_name_; $org_name =~ s/\,//g; } my @gtoks = split(/[\t]/, $line0); my $g_gff_line = ""; for (my $j = 0; $j < 8; $j++) { $g_gff_line .= @gtoks[$j]; $g_gff_line .= "\t"; } my $g_attr = "ID=" . $seqname . ";Name=Contig_" . $seqname . ";Organism=" . $org_name; $g_gff_line .= $g_attr; print GFFR $g_gff_line . "\n"; for (my $j = 0; $j < scalar(@gfflines); $j++) { if (@pps[$j] < $p_cutoff) { next; } my $gline = @gfflines[$j]; $gline .= ";Parent="; $gline .= $seqname; print GFFR $gline . "\n"; } open(RR_GFF, $dir . '/' . $seqname . '_repeat.gff'); my @rr_contents = ; close(RR_GFF); for (my $j = 0; $j < scalar(@rr_contents); $j++) { my $rr_line = @rr_contents[$j]; $rr_line =~ s/\n//g; if ($rr_line =~ /repeat_region/) { $rr_line .= ";Parent="; $rr_line .= $seqname; print GFFR $rr_line . "\n"; } else { print GFFR $rr_line . "\n"; } } @gfflines = (); @pps = (); } } return 1; } sub generate_tracr_txt { my ($dir, $p_cutoff, $tail_pat_file, $session) = @_; open(RAWDAT, "$dir/genome_tracr_summary_classified.csv"); my @contents = ; close(RAWDAT); open(DAT, "lookup_pat.dat"); my @pat_contents =; close(DAT); my ($gcf) = ($dir =~ /.*(GCF_\d+_\d+)_.*/); $gcf =~ s/_/\./g; open(TXTOUT, ">>$dir/genome_tracr_txt.txt"); open(MFA, ">>$dir/genome_tracr_seqs.fa"); for(my $i = 0; $i < scalar(@contents); $i++) { if ($i == 0) { next; } my $line = @contents[$i]; $line =~ s/\n//g; my @row_toks = split(/\,/, $line); my $organism = @row_toks[0]; my $contig = @row_toks[1]; my $start = @row_toks[24]; my $end = @row_toks[25]; my $prob = @row_toks[54]; my $mfe = @row_toks[15]; my $id = "TRACR_" . $i . "_" . $start . "_" . $end; my $hybrid = @row_toks[16]; my $hybrid_struct = @row_toks[17]; my $tail = @row_toks[18]; my $tail_struct = @row_toks[19]; my $rr_id = @row_toks[3]; my $rr_rc = "-"; if ($rr_id =~ /\d+_RC/) { $rr_rc = "+"; } my $rc = "+"; if (@row_toks[26] == 0) { $rc = "-"; } my $rep = @row_toks[5]; my $rcstr = $rr_rc . "/" . $rc; my @pats = (); for (my $j = 0; $j < scalar(@pat_contents); $j++) { my $datline = @pat_contents[$j]; $datline =~ s/\n//g; my @ptoks = split('#', $datline); my $found = @row_toks[(2 * $j) + 56]; if ($found == 1) { my $match_pat = get_sfm_pat(@ptoks[1]); my $desc = @ptoks[2]; my $str = $desc .": " . $match_pat; push(@pats, $str); } } my $sp_pat = @row_toks[scalar(@row_toks) - 3]; my $sp_pat_found = "No"; my $sp_pat_match = ""; if ($sp_pat == 1) { my $sp_pat_match = get_sfm_pat($tail_pat_file); my $str = "User pattern" .": " . $sp_pat_match; push(@pats, $str); } if (scalar(@pats) > 0) { $sp_pat_found = "Yes"; for (my $j = 0; $j < scalar(@pats); $j++) { $sp_pat_match .= @pats[$j]; $sp_pat_match .= "\n"; } } else { $sp_pat_match = "NA\n"; } if (@row_toks[54] < $p_cutoff) { next; } my $b_xray = @row_toks[53]; my $xray = "No"; if ($b_xray == 1) { $xray = "Yes"; } my $strfold = @row_toks[39]; my $overlap_str = "No"; my $overlap = @row_toks[71]; my $overlap_gene = @row_toks[72]; if ($overlap > 0) { $overlap_str = "Yes ($overlap_gene)"; } print TXTOUT " TracrRNA prediction $i, ID: $id, Predicted by TracrRNAFinder 0.00\n" . "====================================================================================================\n" . "Organism: $organism\n" . "Contig ID: $contig\n". "Position: [$start, $end]\n" . "Repeat sequence (predicted by CRISPRDetect 2.1): $rep\n" . "Hybrid free energy: $mfe\n" . "True-positive probability (hybrid SVM): $prob\n" . "Reverse complement used (repeat/anti-repeat): $rcstr\n" . "Known conserved motifs in the tail (by patscan): $sp_pat_found\n" . "Has 2-3 8-40 nt hairpins in the tail: $xray\n" . "Tail folding: $strfold\n" . "Overlapping with an annotated gene: $overlap_str\n". "====================================================================================================\n" . "\n" . "Hybrid structure (repeat & anti-repeat): \n". "$hybrid\n" . "$hybrid_struct\n" . "\n" . "Tail structure: \n" . "$tail\n" . "$tail_struct\n" . "\n" . "Known conserved motif matched: \n" . "$sp_pat_match" . "\n" . "//\n\n"; my @strs = split("&", $hybrid); my $ar = @strs[1]; print MFA ">$id\n"; print MFA $ar . $tail . "\n"; } close(TXTOUT); close(MFA); return 1; } sub add_Cas9_data { my ($dir) = @_; open(RAWDAT, "$dir/genome_tracr_summary.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_summary_2.csv"); for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; if ($i == 0) { $line .= ",N_CONTIG,CONTIG_LEN,CAS9_START,CAS9_END,CAS9_ID,CAS9_ORI,CAS9_PRED_TYPE\n"; print GFFR $line; next; } my @row_toks = split(/\,/, $line); my $contig = @row_toks[1]; my $cmd = "grep $contig $dir/cas9_genome_summary.csv"; my @results = `$cmd`; if (scalar(@results) == 0) { $line .= ",,,,,,,\n"; print GFFR $line; next; } my $cas9_line = @results[0]; $cas9_line =~ s/\n//g; my @toks = split(/\,/, $cas9_line); my $str = ""; for (my $j = 1; $j < (scalar(@toks) - 1); $j++) { $str .= ","; $str .= @toks[$j]; } $str .= "\n"; $str =~ s/NA//g; $line .= $str; print GFFR $line; } close(GFFR); unlink("$dir/genome_tracr_summary.csv"); my $flag = system("mv $dir/genome_tracr_summary_2.csv $dir/genome_tracr_summary.csv"); return 1; } sub add_cas_classification { my ($dir) = @_; open(RAWDAT, "$dir/genome_tracr_summary_classified.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_summary_classified_2.csv"); my $n = 0; for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; my @row_toks = split(/\,/, $line); if ($i == 0) { $line .= ",CAS_CLASS\n"; print GFFR $line; $n = scalar(@row_toks); next; } my @row_toks = split(/\,/, $line); my $contig = @row_toks[1]; my $cmd = "grep $contig $dir/cas9_genome_summary.csv"; my @results = `$cmd`; if (scalar(@results) == 0) { $line .= ",unknown\n"; print GFFR $line; next; } my $cas9_line = @results[0]; $cas9_line =~ s/\n//g; my @toks = split(/\,/, $cas9_line); my $n = scalar(@toks); $line .= "," . @toks[$n - 1] . "\n"; print GFFR $line; } close(GFFR); unlink("$dir/genome_tracr_summary_classified.csv"); my $flag = system("mv $dir/genome_tracr_summary_classified_2.csv $dir/genome_tracr_summary_classified.csv"); return 1; } sub add_distance_data { my ($dir) = @_; open(RAWDAT, "$dir/genome_tracr_summary.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_summary_2.csv"); my $n = 0; for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; my @row_toks = split(/\,/, $line); if ($i == 0) { $line .= ",DIST_TRACR_CAS,DIST_TRACR_RR,DIST_RR_CAS\n"; print GFFR $line; $n = scalar(@row_toks); next; } my $contig_tracr = @row_toks[1]; my $contig_rr = @row_toks[2]; my $n_contig = @row_toks[27]; my $contig_size = @row_toks[28]; my $cas_start = @row_toks[29]; my $cas_end = @row_toks[30]; my $tracr_start = @row_toks[24]; my $tracr_end = @row_toks[25]; my $rr_start = @row_toks[6]; my $rr_end = @row_toks[7]; if ( $cas_start eq '') { $cas_start = 9**9**9; } if ( $cas_end eq '') { $cas_end = 9**9**9; } if ( $tracr_start eq '') { $tracr_start = 9**9**9; } if ( $tracr_end eq '') { $tracr_end = 9**9**9; } if ( $rr_start eq '') { $rr_start = 9**9**9; } if ( $rr_end eq '') { $rr_end = 9**9**9; } my $str = getDistances($cas_start, $cas_end, $tracr_start, $tracr_end, $rr_start, $rr_end, $contig_tracr, $contig_rr, $n_contig, $contig_size); $line .= ","; $line .= $str; $line .= "\n"; print GFFR $line; } close(GFFR); unlink("$dir/genome_tracr_summary.csv"); my $flag = system("mv $dir/genome_tracr_summary_2.csv $dir/genome_tracr_summary.csv"); return 1; } sub add_gene_overlapping_flag { my ($dir) = @_; open(RAWDAT, "$dir/genome_tracr_summary_classified.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_summary_classified_2.csv"); my $n = 0; for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; my @row_toks = split(/\,/, $line); if ($i == 0) { $line .= ",OVERLAPPING,OL_GENE\n"; print GFFR $line; $n = scalar(@row_toks); next; } my $contig_tracr = @row_toks[1]; my $tracr_start = @row_toks[24]; my $tracr_end = @row_toks[25]; my $gff_path = $dir . "/" . $contig_tracr . "_gene.gff"; my @rs = find_overlapping_gene($gff_path, $tracr_start, $tracr_end); $line .= "," . @rs[0] . "," . @rs[1] . "\n"; print GFFR $line; } close(GFFR); unlink("$dir/genome_tracr_summary_classified.csv"); my $flag = system("mv $dir/genome_tracr_summary_classified_2.csv $dir/genome_tracr_summary_classified.csv"); return 1; } sub add_additional_data { my ($dir, $rep_start, $junct) = @_; open(RAWDAT, "$dir/genome_tracr_summary.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_summary_2.csv"); my $n = 0; for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; my @row_toks = split(/\,/, $line); if ($i == 0) { $line .= ",DIR_SCORE,TS_SCORE\n"; print GFFR $line; $n = scalar(@row_toks); next; } my $rep = @row_toks[5]; my $hyb = @row_toks[16]; my $rr_id = @row_toks[3]; my $tail = @row_toks[18]; #my $dist = @row_toks[66]; my $rc = 0; if ($rr_id =~ /\d+_RC/) { $rc = 1; } my $b = check_repeat($rep_start, $hyb); #direction_rank($rep_start, $rep, $hyb, $rc); my $tb = check_junction($junct, $tail); #my $dsb = 0; #if ($dist <= 3000) { # $dsb = 1; #} $line .= ","; $line .= $b . "," . $tb; $line .= "\n"; print GFFR $line; } close(GFFR); unlink("$dir/genome_tracr_summary.csv"); my $flag = system("mv $dir/genome_tracr_summary_2.csv $dir/genome_tracr_summary.csv"); return 1; } sub crispr_call_rerun{ my ($dir, $id, $session) = @_; if (! (-d "CRISPRDetect_2.3/work" . $session)) { my $flag = system("mkdir CRISPRDetect_2.3/work" . $session) } my $cmd1 = 'cp ' .$dir . '/' . $id . '_flank.fa CRISPRDetect_2.3/work' . $session; my $cmd2 = "CRISPRDetect_2.3"; my $cmd3 = 'perl CRISPRDetect.pl -f work' . $session . '/' . $id . '_flank.fa -o ' . $id . '_repeat_redo.txt' . ' -array_quality_score_cutoff 0.0' . ' -minimum_word_repeatation 2 -minimum_no_of_repeats 2 -word_length 7' . ' -ea_allowed_percent_similarity 50 -sa_allowed_percent_similarity 50' . ' -minimum_repeat_length 15 > /dev/null'; my $cmd4 = ".."; my $cmd5 = 'rm -f CRISPRDetect_2.3/work' . $session . '/' . $id . '_flank.fa'; my $cmd6 = 'mv CRISPRDetect_2.3/' . $id . '_repeat_redo.txt ' . $dir . '/' . $id . '_repeat_redo.txt'; my $cmd7 = 'mv CRISPRDetect_2.3/' . $id . '_repeat_redo.txt.fp ' . $dir . '/' . $id . '_repeat_redo.fp'; my $cmd8 = 'mv CRISPRDetect_2.3/' . $id . '_repeat_redo.txt.gff ' . $dir . '/' . $id . '_repeat_redo.gff'; my $cmd9 = 'rm -rf CRISPRDetect_2.3/work' . $session; my $flag = system($cmd1); my $flag = chdir($cmd2); my $flag = system($cmd3); my $flag = chdir($cmd4); my $flag = system($cmd5); my $flag = system($cmd6); my $flag = system($cmd7); my $flag = system($cmd8); my $flag = system($cmd9); #my $output_file_size = -s $dir . '/' . $id . '_repeat.txt'; #if(! (defined $output_file_size and $output_file_size > 5)) { # unlink($dir . '/' . $id . '_repeat.txt'); # unlink($dir . '/' . $id . '_repeat.gff'); # unlink($dir . '/' . $id . '_repeat.fp'); #} return $flag; } sub disc_fcn { my @nums = @_; if (scalar(@nums) <= 1) { return 0; } if (scalar(@nums) == 2) { return 1; } my $c = 0; for (my $i = 2; $i < scalar(@nums); $i++) { my $a2 = @nums[$i]; my $a1 = @nums[$i - 1]; my $a0 = @nums[$i - 2]; if ($a1 > $a2 && $a1 > $a0) { $c++; } } return $c; } sub max_min { my @nums = @_; my $maxsofar = 0; my $minsofar = 0; if (scalar(@nums) > 0) { $maxsofar = @nums[0]; $minsofar = @nums[0]; } for (my $i = 0; $i < scalar(@nums); $i++) { if (@nums[$i] >= $maxsofar) { $maxsofar = @nums[$i]; } if (@nums[$i] <= $minsofar) { $minsofar = @nums[$i]; } } my @out = ($maxsofar, $minsofar); return @out; } sub orderCSV { my ($dir) = @_; open(RAWDAT, "$dir/genome_tracr_summary_classified.csv"); my @contents = ; close(RAWDAT); my $head = @contents[0]; $head =~ s/\n//g; my @headtoks = split(/\,/, $head); #my $ind_xray = 54; #my $ind_kpat = 44; #my $ind_q = 70; open (TEMP, ">>$dir/temp.csv"); for (my $j = 1; $j < scalar(@contents); $j++) { my $line = @contents[$j]; $line =~ s/\n//g; print TEMP $line . "\n"; } close(TEMP); open(SDAT, ">>$dir/genome_tracr_summary_classified_2.csv"); print SDAT $head . "\n"; my $flag = system("cat $dir/temp.csv \| sort -t $\',\' -u -k17,17" . " \| sort -t $\',\' -k72n,72 -k38nr,38 -k54nr,54 " . "-k39nr,39 -k35n,35 >> " . "$dir/genome_tracr_summary_classified_2.csv"); unlink("$dir/temp.csv"); unlink("$dir/genome_tracr_summary_classified.csv"); $flag = system("mv $dir/genome_tracr_summary_classified_2.csv $dir/genome_tracr_summary_classified.csv"); return 1; } sub filterCSV { my ($dir, $rep_start, $tail_start, $min_mismatch, $max_mismatch, $helix_size) = @_; open(RAWDAT, "$dir/genome_tracr_summary.csv"); my @contents = ; close(RAWDAT); open(GFFR, ">>$dir/genome_tracr_summary_2.csv"); for(my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; if ($i == 0) { print GFFR $line . "\n"; next; } my @row_toks = split(/\,/, $line); my $rep = @row_toks[5]; my $hyb = @row_toks[16]; my $hyb_struct = @row_toks[17]; my $tail = @row_toks[18]; my $rr_id = @row_toks[3]; my $rc = 0; if (index($rr_id, "_RC") >= 0) { $rc = 1; } my $b = 1; if ((length($rep_start) > 0) && (length($tail_start) > 0)) { $b *= direction_rank_2($rep_start, $rep, $rc, $tail_start, $tail); #$b *= direction_rank($rep_start, $rep, $hyb, $rc, $tail_start, $tail); #$b *= direction_rank_hyb($rep_start, $rep, $rc); } my $ll = max_hybrid_helix_length($hyb_struct); my $nm = num_mismatches_in_hybrid($hyb_struct); my $hyb_start = index($hyb_struct, "("); if ($hyb_start <= 2) { $b *= 1; } else { $b *= 0; } if (($min_mismatch > $max_mismatch) && ($max_mismatch >= 0) && ($min_mismatch >= 0)) { my $temp = $max_mismatch; $max_mismatch = $min_mismatch; $min_mismatch = $temp; } if (($min_mismatch < 0) && ($max_mismatch >= 0)) { if ($nm <= $max_mismatch) { $b *= 1; } else { $b *= 0; } } if (($min_mismatch >= 0) && ($max_mismatch < 0)) { if ($nm >= $min_mismatch) { $b *= 1; } else { $b *= 0; } } if (($max_mismatch >= 0) && ($min_mismatch >= 0)) { if (($nm <= $max_mismatch) && ($nm >= $min_mismatch)) { $b *= 1; } else { $b *= 0; } } if ($helix_size > 0) { if ($ll >= $helix_size) { $b *= 1; } else { $b *= 0; } } if ($b > 0) { print GFFR $line . "\n"; } } close(GFFR); unlink("$dir/genome_tracr_summary.csv"); my $flag = system("mv $dir/genome_tracr_summary_2.csv $dir/genome_tracr_summary.csv"); return 1; } sub getDistances { my ($cas_start, $cas_end, $tracr_start, $tracr_end, $rr_start, $rr_end, $contig_tracr, $contig_rr, $n_contig, $contig_size) = @_; my @coords = ($cas_start, $cas_end, $tracr_start, $tracr_end, $rr_start, $rr_end); my @starts = (@coords[0], @coords[2], @coords[4]); my @inds = ("Cas9", "Tracr", "RR"); my @poss = (0,2,4); for (my $i = 0; $i < scalar(@starts); $i++) { my $min = @starts[$i]; for (my $j = $i + 1; $j < scalar(@starts); $j++) { if (@starts[$j] < $min) { $min = @starts[$j]; my $temp = @starts[$i]; @starts[$i] = @starts[$j]; @starts[$j] = $temp; $temp = @inds[$i]; @inds[$i] = @inds[$j]; @inds[$j] = $temp; $temp = @poss[$i]; @poss[$i] = @poss[$j]; @poss[$j] = $temp; } } } @coords = (@coords[@poss[0]], @coords[@poss[0] + 1], @coords[@poss[1]], @coords[@poss[1] + 1], @coords[@poss[2]], @coords[@poss[2] + 1]); my $ind0 = first_index { $_ eq "Tracr" } @inds; my $ind1 = first_index { $_ eq "Cas9" } @inds; my $dtc = abs(@coords[(2 * $ind0) + 1] - @coords[(2 * $ind1)]); if ($ind1 < $ind0) { $dtc = abs(@coords[(2 * $ind1) + 1] - @coords[(2 * $ind0)]); } $ind0 = first_index { $_ eq "Tracr" } @inds; $ind1 = first_index { $_ eq "RR" } @inds; my $dtr = abs(@coords[(2 * $ind0) + 1] - @coords[(2 * $ind1)]); if ($ind1 < $ind0) { $dtr = abs(@coords[(2 * $ind1) + 1] - @coords[(2 * $ind0)]); } $ind0 = first_index { $_ eq "Cas9" } @inds; $ind1 = first_index { $_ eq "RR" } @inds; my $dcr = abs(@coords[(2 * $ind0) + 1] - @coords[(2 * $ind1)]); if ($ind1 < $ind0) { $dcr = abs(@coords[(2 * $ind1) + 1] - @coords[(2 * $ind0)]); } if ($n_contig == 1) { if (abs($contig_size - $dtc) < $dtc) { $dtc = abs($contig_size - $dtc); } if (abs($contig_size - $dtr) < $dtr) { $dtr = abs($contig_size - $dtr); } if (abs($contig_size - $dcr) < $dcr) { $dcr = abs($contig_size - $dcr); } } if ($contig_tracr ne $contig_rr) { $dtr = "inf"; $dcr = "inf"; } $str = $dtc . "," . $dtr . "," . $dcr; $str =~ s/inf/10000000/g; return $str; } sub direction_rank { my ($iupac, $rep, $hyb, $rc, $tail_start, $tail) = @_; my $b = 0; if ((iupac_match_seq_start($iupac, $hyb) == 1) && (iupac_match_seq_start($iupac, $rep) == 0)) { $b = 1; } elsif ((iupac_match_seq_start($iupac, $hyb) == 1) && (iupac_match_seq_start($iupac, $rep) == 1) && ($rc == 0)) { $b = 1; } elsif ((iupac_match_seq_start($iupac, $hyb) == 1) && (iupac_match_seq_start($iupac, $rep) == 1) && ($rc == 1)) { if ((iupac_match_seq_start($tail_start, $tail) == 1)) { $b = 1; } else { $b = 0; } } elsif ((iupac_match_seq_start($iupac, $hyb) == 0) && (iupac_match_seq_start($iupac, $rep) == 0) && ($rc == 0)) { $b = 1; } elsif ((iupac_match_seq_start($iupac, $hyb) == 0) && (iupac_match_seq_start($iupac, $rep) == 0) && ($rc == 1)) { if ((iupac_match_seq_start($tail_start, $tail) == 1)) { $b = 1; } else { $b = 0; } } else { $b = 0; } return $b; } sub direction_rank_2 { my ($iupac, $rep, $rc, $tail_start, $tail) = @_; my $b = 0; my $rcrep = $rep; $rcrep =~ tr/ATCG/TAGC/; $rcrep = reverse($rcrep); if ((iupac_match_seq_start($iupac, $rep) == 0) && (iupac_match_seq_start($iupac, $rcrep) == 1) && ($rc == 1)) { $b = 1; } elsif ((iupac_match_seq_start($iupac, $rep) == 0) && (iupac_match_seq_start($iupac, $rcrep) == 1) && ($rc == 0)) { $b = 0; } elsif ((iupac_match_seq_start($iupac, $rep) == 1) && (iupac_match_seq_start($iupac, $rcrep) == 0) && ($rc == 1)) { $b = 0; } elsif ((iupac_match_seq_start($iupac, $rep) == 1) && (iupac_match_seq_start($iupac, $rcrep) == 0) && ($rc == 0)) { $b = 1; } else { if ($rc == 0) { $b = 1; } else { if (iupac_match_seq_start($tail_start, $tail) == 1) { $b = 1; } else { $b = 0; } } } return $b; } sub direction_rank_hyb { my ($iupac, $rep, $rc) = @_; my $b = 0; my $rcrep = $rep; $rcrep =~ tr/ATCG/TAGC/; $rcrep = reverse($rcrep); if ((iupac_match_seq_start($iupac, $rep) == 0) && (iupac_match_seq_start($iupac, $rcrep) == 1) && ($rc == 1)) { $b = 1; } elsif ((iupac_match_seq_start($iupac, $rep) == 0) && (iupac_match_seq_start($iupac, $rcrep) == 1) && ($rc == 0)) { $b = 0; } elsif ((iupac_match_seq_start($iupac, $rep) == 1) && (iupac_match_seq_start($iupac, $rcrep) == 0) && ($rc == 1)) { $b = 0; } elsif ((iupac_match_seq_start($iupac, $rep) == 1) && (iupac_match_seq_start($iupac, $rcrep) == 0) && ($rc == 0)) { $b = 1; } else { if ($rc == 0) { $b = 1; } else { $b = 0; } } return $b; } sub check_repeat { my ($iupac, $seq) = @_; my $b = iupac_match_seq_start($iupac, $seq); return $b; } sub check_junction { my ($iupac, $seq) = @_; my $b = iupac_match_seq_start($iupac, $seq); return $b; } sub find_overlapping_gene { my ($gffpath, $start, $end) = @_; open(GFF, $gffpath); my @contents = ; close(GFF); my $indmax = scalar(@contents) - 1; my @indxy_0 = (1, $indmax); my @inds = b_search(\@contents, \@indxy_0, $start); my $n_overlap = 0; my $gene_info = ""; for (my $i = 0; $i < scalar(@inds); $i++) { my $gffline = @contents[@inds[$i]]; my @gxy = get_gene_span($gffline); $n_overlap = overlap_nt($start, $end, @gxy[0], @gxy[1]); $gene_info = get_gene_info($gffline); if ($n_overlap > 0) { last; } } if ($n_overlap == 0) { $gene_info = ""; } return ($n_overlap, $gene_info); } sub b_search { my ($gffcontents, $indxy, $start) = @_; my @contents = @{$gffcontents}; my @ixy = @{$indxy}; if (abs(@ixy[1] - @ixy[0]) == 1) { return @ixy; } my $mid = int((@ixy[1] + @ixy[0]) / 2); my @gxy0 = get_gene_span(@contents[@ixy[0]]); my @gxy1 = get_gene_span(@contents[$mid]); my @gxy2 = get_gene_span(@contents[@ixy[1]]); if (($start < @gxy1[0]) && ($start >= @gxy0[0])) { my @new_ixy = (@ixy[0], $mid); return b_search($gffcontents, \@new_ixy, $start); } elsif (($start <= @gxy2[0]) && ($start > @gxy1[0])) { my @new_ixy = ($mid, @ixy[1]); return b_search($gffcontents, \@new_ixy, $start); } else { my @new_ixy = ($mid, $mid); return @new_ixy; } } sub get_gene_span { my ($line) = @_; $line =~ s/\n//g; my @toks = split(/[\t]/, $line); my $start = @toks[3]; my $end = @toks[4]; if ($end < $start) { my $temp = $start; $start = $end; $end = $temp; } return ($start, $end); } sub get_gene_info { my ($line) = @_; $line =~ s/\n//g; my @toks = split(/[\t]/, $line); my $str = @toks[8]; my @strtoks = split(/\;/, $str); return @strtoks[0]; } sub overlap_nt { my ($start_1, $end_1, $start_2, $end_2) = @_; if ($start_1 > $end_1) { my $temp = $start_1; $start_1 = $end_1; $end_1 = $temp; } if ($start_2 > $end_2) { my $temp = $start_2; $start_2 = $end_2; $end_2 = $temp; } my $b1 = (($start_1 >= $start_2) && ($start_1 <= $end_2) && ($end_2 >= $start_1) && ($end_2 <= $end_1)); my $b2 = (($start_2 >= $start_1) && ($start_2 <= $end_1) && ($end_1 >= $start_2) && ($end_1 <= $end_2)); my $b3 = ($start_1 >= $start_2) && ($end_1 <= $end_2); my $b4 = ($start_1 <= $start_2) && ($end_1 >= $end_2); my $out = 0; if ($b1 > 0) { $out = abs($start_1 - $end_2); } elsif ($b2 > 0) { $out = abs($start_2 - $end_1); } elsif ($b3 > 0) { $out = abs($start_1 - $end_1); } elsif ($b4 > 0) { $out = abs($start_2 - $end_2); } else { $out = 0; } return $out; } return 1; #sub direction_rank { # my ($rep, $hyb, $rc) = @_; # # my $b = 0; # # if ((check_GYY($hyb) == 1) && (check_GYY($rep) == 0)) { # $b = 1; # } elsif ((check_GYY($hyb) == 1) && (check_GYY($rep) == 1) && ($rc == 0)) { # $b = 1; # } elsif ((check_GYY($hyb) == 0) && (check_GYY($rep) == 0)) { # $b = 0; # } else { # $b = 0; # } # # return $b; #} #sub check_GYY { # my ($hyb) = @_; # my @chars = split('', $hyb); # # my $c1 = @chars[0]; # my $c2 = @chars[1]; # my $c3 = @chars[2]; # # my $b1 = ($c1 eq "G"); # my $b2 = (($c2 eq "T") || ($c2 eq "C")); # my $b3 = (($c3 eq "C") || ($c3 eq "T")); # # my $b = 0; # # if ($b1 && $b2 && $b3) { # $b = 1; # } # # return $b; #} #sub check_APP { # my ($seq) = @_; # my @chars = split('', $seq); # # my $c1 = @chars[0]; # my $c2 = @chars[1]; # my $c3 = @chars[2]; # # my $b1 = ($c1 eq "A"); # my $b2 = (($c2 eq "A") || ($c2 eq "G")); # my $b3 = (($c3 eq "G") || ($c3 eq "A")); # # my $b = 0; # # if ($b1 && $b2) { # $b = 1; # } # # return $b; #} #sub parse_dbstr { # my ($str) = @_; # # my @chars = split('', $str); # # my $br_count = 0; # my $d_count = 0; # my $hc_count = 0; # my @br_count_list = (); # my @str_list = (); # my @h_list = (); # my @c_list = (); # my @l_list = (); # for (my $i = 0; $i < scalar(@chars); $i++) { # my $ch = @chars[$i]; # if ($ch =~ /\./) { # if ($i == 0 || $br_count == 0) { # $d_count++; # } # } # # if ($ch =~ /\(/) { # #$br_count = $br_count + 1; # push(@br_count_list, $br_count++); # if ($d_count > 0) { # push(@str_list, 'L' . $d_count); # push(@l_list, $d_count); # $d_count = 0 # } # } # # if ($ch =~ /\)/) { # #$br_count = $br_count - 1; # push(@br_count_list, $br_count--); # } # # if ($br_count == 0) { # if (scalar(@br_count_list) > 0) { # $hc_count++; # my $c = disc_fcn(@br_count_list); # if ($c == 1) { # push(@str_list, 'H' . $hc_count); # push(@h_list, $hc_count); # } else { # push(@str_list, 'C' . $hc_count); # push(@c_list, $hc_count); # } # @br_count_list = (); # $hc_count = 0; # } # } else { # $hc_count++; # } # # if ($i == scalar(@chars) - 1) { # if ($d_count > 0) { # push(@str_list, 'L' . $d_count); # push(@l_list, $d_count); # $d_count = 0 # } # } # } # # my $out = ''; # for (my $i = 0; $i < scalar(@str_list); $i++) { # if ($i == scalar(@str_list) - 1) { # $out .= @str_list[$i]; # } else { # $out .= (@str_list[$i] . '-'); # } # } # # my @max_min_h = max_min(@h_list); # my @max_min_c = max_min(@c_list); # my @max_min_l = max_min(@l_list); # # my $nh = scalar(@h_list); # my $nc = scalar(@c_list); # my $nl = scalar(@l_list); # # my $first_h = 0; # my $first_c = 0; # my $first_l = 0; # # if ($nh > 0) { # $first_h = @h_list[0]; # } # # if ($nc > 0) { # $first_c = @c_list[0]; # } # # if ($nl > 0) { # $first_l = @l_list[0]; # } # # $out .= "\," . $nh . "\," . @max_min_h[1] . "\," . @max_min_h[0] . "\," . $first_h . # "\," . $nc . "\," . @max_min_c[1] . "\," . @max_min_c[0] . "\," . $first_c . # "\," . $nl . "\," . @max_min_l[1] . "\," . @max_min_l[0] . "\," . $first_l; # # return $out; #}