use Bio::Tools::GFF; use Bio::SeqIO; sub remove_repeat_elem { my %seen; grep(!$seen{$_}++, @_); } sub gbk_2_gff { my ($seqfile, $session) = @_; my $seqio = new Bio::SeqIO(-format => 'genbank', -file => $seqfile); my $count = 0; while( my $seq = $seqio->next_seq ) { $count++; my $fname = sprintf("work_01$session/%s.gff", $seq->display_id || "seq-$count"); my $gffout = new Bio::Tools::GFF(-file => ">$fname" , -gff_version => 3); my $fname_2 = sprintf("work_01$session/%s_gene.gff", $seq->display_id || "seq-$count"); my $gffout_2 = new Bio::Tools::GFF(-file => ">$fname_2" , -gff_version => 3); my $fname_fa = sprintf("work_01$session/%s.fa", $seq->display_id || "seq-$count"); my $fa_head = sprintf(">%s", $seq->display_id || "seq-$count"); open(fa_out, '>>'.$fname_fa) or die "can't create fa file"; print fa_out $fa_head . "\n"; print fa_out $seq->seq . "\n"; close(fa_out); foreach my $feature ( $seq->top_SeqFeatures() ) { $gffout->write_feature($feature); my $tag = $feature->primary_tag(); if ($tag eq "gene") { $gffout_2->write_feature($feature); } } } return $count; } sub split_gbff_file { my($input_gbff_file, $session) = @_; my $gbk_names; my $count = split_gbff_file_to_individual_gbk_files($input_gbff_file, $gbk_names, $session); return $count; } sub split_gbff_file_to_individual_gbk_files { my($input_gbff_file,$arr_gbk_files,$session) = @_; my @arr_line=`less $input_gbff_file >&1`; my $last_acc=""; my $count = 0; for(my $i = 0; $i <= $#arr_line; $i++) { if($arr_line[$i]=~/^LOCUS /){ $count++; my @arr_t1=split('\s+',$arr_line[$i]); $last_acc = $arr_t1[1]; my $j = $i; my $gbk_file = "work_01$session/" . $last_acc . ".gbff"; open(WR,">$gbk_file"); while($arr_line[$j]!~/^\/\//){ print WR "$arr_line[$j]"; $j++; if($j >= $#arr_line){ last; } } print WR "$arr_line[$j]"; close(WR); push(@{$arr_gbk_files},$gbk_file); $i = $j; } } return $count; } sub process_gbff_psiblast { my ($seqfile, $session) = @_; my $count = 0; if (! (-d "work_01$session")) { my $flag = system("mkdir work_01$session"); } print "Split GBFF file by contigs...\n"; $count = split_gbff_file($seqfile, $session); my $n_contigs = $count; if ($count == 0) { return 0; } $count = 0; my @gbff_paths = glob("work_01$session/*.gbff"); print "Convert GBFF files into GFF files...\n"; for (my $i = 0; $i < scalar(@gbff_paths); $i++) { $count += gbk_2_gff(@gbff_paths[$i], $session); } if ($count == 0) { return 0; } if (-d "psiblast_files$session") { my $flag = system("rm -rf psiblast_files$session"); } print "Set up for PSI-BLAST...\n"; my $flag = system("mkdir psiblast_files$session"); my @file_paths = glob("work_01$session/*.*"); for (my $i = 0; $i < scalar(@file_paths); $i++) { my $line = @file_paths[$i]; my ($dir, $fname_ext) = ($line =~ m|^(.*[/\\])([^/\\]+?)$|); my $cmd = 'mv ' . $line . ' psiblast_files' . $session; my $flag = system($cmd); } my @gff_file_paths = glob("psiblast_files$session/*.gff"); print "Extract amino acid sequences...\n"; open(FAAW, ">>psiblast_files$session/proteins.faa") or die "can't create faa file"; for (my $i = 0; $i < scalar(@gff_file_paths); $i++) { my $path = @gff_file_paths[$i]; my ($dir, $fname_ext) = ($path =~ m|^(.*[/\\])([^/\\]+?)$|); my $contig_id = $fname_ext; $contig_id =~ s/\./_/g; $contig_id =~ s/_gff//g; open(GFFCON, $path) or die "can't open gff file"; while () { my $line = $_; if ($line =~ /CDS\t\d+\t\d+.*[\+\-].*codon_start=.*\;product=.*\;protein_id=\w{2,3}_*\d+\.\d+\;.*\;translation=.*/) { my ($start, $end, $strand, $type, $pid, $pseq) = ($line =~ /CDS\t(\d+)\t(\d+).*([\+\-]).*codon_start=.*\;product=(.*)\;protein_id=(\w{2,3}_*\d+\.\d+)\;.*\;translation=(.*)/); $pseq =~ s/\n//g; my $ori = 1; if ($strand =~ /\-/) { $ori = 0; } print FAAW ">" . $contig_id . "," . $pid . "," . $start . "," . $end . "," . $ori . "," . $type . "\n"; print FAAW $pseq . "\n"; } } close(GFFCON); } close(FAAW); print "Create BLAST database (type = prot)...\n"; my $flag = system('makeblastdb -in psiblast_files' . $session . '/proteins.faa -input_type fasta ' . '-dbtype prot -out psiblast_files' . $session . '/proteins -parse_seqids'); print "Run PSI-BLAST...\n"; my @sr_names = ('mkCas0193', 'cd09643', 'COG3513', 'cd09704', 'mkCas0192'); my @sr_type = ('cas9_mkCas0193', 'cas9_cd09643', 'cas9_COG3513', 'cas9_cd09704', 'cpf1'); for (my $i = 0; $i < scalar(@sr_names); $i++) { $flag = system('psiblast -in_msa SRS/' . @sr_names[$i] . '.sr -num_threads 4 -num_iterations 2 ' . '-db psiblast_files' . $session . '/proteins -out psiblast_files' . $session . '/Cas9_' . @sr_names[$i] . '.txt ' . '-evalue 0.0000001 -outfmt 6 -num_alignments 1 -comp_based_stats 1'); } $flag = system("rm -rf work_01$session"); open(PSICOLLECT, ">>psiblast_files$session/psiblast_result.txt"); my @rs_collect = (); my @pid_collect = (); for (my $i = 0; $i < scalar(@sr_names); $i++) { open(PSIOUTn, 'psiblast_files' . $session . '/Cas9_' . @sr_names[$i] . '.txt') or die "can't open TXT file"; my @psi_contents_n = ; close(PSIOUTn); my $cas_type = @sr_type[$i]; for (my $j = 0; $j < scalar(@psi_contents_n); $j++) { my $line = @psi_contents_n[$j]; $line =~ s/\n//g; $line .= "\t$cas_type"; if ($line =~/\w+\s+[\w\d_]+\,.*\,/) { my ($pid) = ($line =~ /\w+\s+[\w\d_]+\,(\w{2}_\d+\.\d+)\,\d+\,\d+\,\d+\,.*\s+/); push(@rs_collect, $line); push(@pid_collect, $pid); } } } #@pid_collect = remove_repeat_elem(@pid_collect); for (my $j = 0; $j < scalar(@rs_collect); $j++) { my $line = @rs_collect[$j]; print PSICOLLECT $line . "\n"; } #A: for (my $i = 0; $i < scalar(@pid_collect); $i++) { # B: for (my $j = 0; $j < scalar(@rs_collect); $j++) { # if (index(@rs_collect[$j], @pid_collect[$i]) >= 0) { # my $line = @rs_collect[$j]; # print PSICOLLECT $line . "\n"; # last B; # } # } #} close(PSICOLLECT); print "Extract Cas9 matches from PSI-BLAST output...\n"; open(PSIOUT, "psiblast_files$session/psiblast_result.txt") or die "can't open TXT file"; my @psi_contents = ; close(PSIOUT); my @ids = (); for (my $i = 0; $i < scalar(@psi_contents); $i++) { my $line = @psi_contents[$i]; if ($line =~/\w+\s+[\w\d_]+\,.*\,/) { my ($id) = ($line =~/\w+\s+([\w\d_]+)\,.*\,/); push(@ids, $id); } else { next; } } @ids = remove_repeat_elem(@ids); if (scalar(@ids) == 0) { return 0; } return $n_contigs; } sub run_hmmsearch { my ($session) = @_; print "Run HMMSEARCH...\n"; my @cas_genes = ("c2c1", "cas9", "CasX", "CasY", "Cas9_archaeal"); for (my $i = 0; $i < scalar(@cas_genes); $i++) { my $cas = @cas_genes[$i]; my $cas_hmm = "HMM/$cas.hmm"; my $faapath = "psiblast_files$session/proteins.faa"; my $outfile = "psiblast_files$session/Cas9_" . $cas . "_hmm.txt"; my $tracefile = "psiblast_files$session/Cas9_" . $cas . "_hmm_tr.txt"; my $flag = system("hmmsearch --tblout $outfile --noali -E 1e-6 $cas_hmm $faapath > $tracefile"); } open(HMMCOLLECT, ">>psiblast_files$session/hmm_result.txt"); my @rs_collect = (); my @pid_collect = (); for (my $i = 0; $i < scalar(@cas_genes); $i++) { open(HMMOUTn, 'psiblast_files' . $session . '/Cas9_' . @cas_genes[$i] . '_hmm.txt') or die "can't open TXT file"; my @hmm_contents_n = ; close(HMMOUTn); for (my $j = 0; $j < scalar(@hmm_contents_n); $j++) { my $line = @hmm_contents_n[$j]; $line =~ s/\n//g; if ($line =~ /^#.*/) { next; } my @toks = split(/\,/, $line); my $pid = @toks[1]; $line =~ s/\s+\-\s+/ /g; $line =~ s/\s+/\t/g; push(@rs_collect, $line); push(@pid_collect, $pid); } } #@pid_collect = remove_repeat_elem(@pid_collect); for (my $j = 0; $j < scalar(@rs_collect); $j++) { my $line = @rs_collect[$j]; print HMMCOLLECT $line . "\n"; } #A: for (my $i = 0; $i < scalar(@pid_collect); $i++) { # B: for (my $j = 0; $j < scalar(@rs_collect); $j++) { # if (index(@rs_collect[$j], @pid_collect[$i]) >= 0) { # my $line = @rs_collect[$j]; # print HMMCOLLECT $line . "\n"; # last B; # } # } #} close(HMMCOLLECT); print "Extract Cas9 matches from HMMSEARCH output...\n"; open(HMMOUT, "psiblast_files$session/hmm_result.txt") or die "can't open TXT file"; my @hmm_contents = ; close(HMMOUT); my @ids = (); for (my $i = 0; $i < scalar(@hmm_contents); $i++) { my $line = @hmm_contents[$i]; $line =~ s/\n//g; my @toks = split(/\,/, $line); my $pid = @toks[1]; push(@ids, $pid); } @ids = remove_repeat_elem(@ids); if (scalar(@ids) == 0) { return 0; } return 1; } sub handle_crispr_repeats_data_txt { my ($dir, $id, $revcom) = @_; my @ids = (); my @repeats = (); my @scores = (); my @reg_starts = (); my @reg_ends = (); my @reg_oris = (); my @cas_lines = (); if (! -e $dir . '/' . $id . '_repeat.txt') { return 0; } open(TXT, $dir . '/' . $id . '_repeat.txt') or die "cannot open txt file."; my @txt_contents = ; my $n = scalar(@txt_contents); if (scalar(@txt_contents) == 0) { close(TXT); return 0; } for (my $i = 0; $i < scalar(@txt_contents); $i++) { my $line = @txt_contents[$i]; if ($line =~ /Array\s\d+\s+\d+-\d+\s*/) { my $id_line = $line; my ($arr_ind, $r_start, $r_end) = ($id_line =~ /Array\s(\d+)\s+(\d+)-(\d+)\s*/); my $id_code = 'CRISPR' . $arr_ind . '_' . $r_start . '_' . $r_end; push(@ids, $id_code); if ($r_start < $r_end) { push(@reg_starts, $r_start); push(@reg_ends, $r_end); } else { push(@reg_starts, $r_end); push(@reg_ends, $r_start); } my $j = $i; my $rep_line_found = 0; my $score_line_found = 0; my $ori_line_found = 0; my $idf_cas_line_found = 0; while(@txt_contents[$j] !~ /^\/\//) { if (@txt_contents[$j] =~ />.*Array_Orientation\:\s+/) { $ori_line_found = 1; my $ro = @txt_contents[$j]; my ($ori) = ($ro =~ />.*Array_Orientation\:\s+(\w+)/); if ($ori =~ /Forward/) { push(@reg_oris, 1); } else { push(@reg_oris, 0); } } if (@txt_contents[$j] =~ /#\sPrimary\srepeat\s\:\s+/) { $rep_line_found = 1; my $ro = @txt_contents[$j]; $ro =~ s/\n//g; if ($ro =~ /\:\s+[ATCG]+$/) { my ($rep) = ($ro =~ /\:\s+([ATCG]+$)/); push(@repeats, $rep); } else { push(@repeats, ''); } } if (@txt_contents[$j] =~ /#\sQuestionable\sarray\s\:\s/) { $score_line_found = 1; my $ro = @txt_contents[$j]; $ro =~ s/\n//g; if ($ro =~ /Score\:\s*\d+\.\d+/) { my ($score) = ($ro =~ /Score\:\s*(\d+\.\d+)/); push(@scores, $score); } else { push(@scores, 0.0); } } if (@txt_contents[$j] =~ /#\sIdentified\sCas\sgenes\:\s/) { $idf_cas_line_found = 1; my $gln = @txt_contents[$j]; $gln =~ s/\n//g; $gln =~ s/\,//g; push(@cas_lines, $gln); } $j++; if ($j >= scalar(@txt_contents)) { last; } } $i = $j; if ($rep_line_found == 0) { push(@repeats, ''); } if ($score_line_found == 0) { push(@scores, 0.0); } if ($ori_line_found == 0) { push(@reg_oris, 1); } if ($idf_cas_line_found == 0) { push(@cas_lines, ''); } } } close(TXT); if (scalar(@ids) == 0) { return 0; } for (my $i = 0; $i < scalar(@ids); $i++) { my $ii = $i + 1; my $tag = $id . '_' . $ii . '_RE_' . @ids[$i]; open(FAO2, '>>' . $dir . '/' . $tag . ".fa") or die "cannot open gff file."; print FAO2 ">" . $id . '_' . $ii . '_' . @ids[$i] . "\n"; print FAO2 @repeats[$i]; close(FAO2); if ($revcom == 0) { next; } $tag = $id . '_' . $ii . '_RE_' . @ids[$i] . "_RC"; open(FAO3, '>>' . $dir . '/' . $tag . ".fa") or die "cannot open gff file."; print FAO3 ">" . $id . '_' . $ii . '_' . @ids[$i] . "\n"; my $rs = @repeats[$i]; $rs =~ tr/ATCG/TAGC/; $rs = reverse($rs); print FAO3 $rs; close(FAO3); } open(CSVRS, '>>' . $dir . '/' . $id . "_repeat_summary.csv") or die "cannot open csv file."; for (my $i = 0; $i < scalar(@ids); $i++) { my $ii = $i + 1; my $tag = $id . '_' . $ii . '_RE_' . @ids[$i]; my $path = $dir . '/' . $tag . ".fa"; print CSVRS @ids[$i] . "\," . @scores[$i] . "\," . @repeats[$i] . "\," . @reg_starts[$i] . "\," . @reg_ends[$i] . "\," . @reg_oris[$i] . "\," . @cas_lines[$i] . "\," . $path . "\n"; if ($revcom == 0) { next; } $tag = $id . '_' . $ii . '_RE_' . @ids[$i] . "_RC"; $path = $dir . '/' . $tag . ".fa"; print CSVRS @ids[$i] . "_RC" . "\," . @scores[$i] . "\," . @repeats[$i] . "\," . @reg_starts[$i] . "\," . @reg_ends[$i] . "\," . @reg_oris[$i] . "\," . @cas_lines[$i] . "\," . $path . "\n"; } close(CSVRS); return 1; } sub get_organism { my ($dir, $id) = @_; my $cmd = 'grep source ' . $dir . '/' . $id . '.gff'; my @grep_result = `$cmd`; my $line0 = @grep_result[0]; my $org_name = 'Unknown species'; if ($line0 =~ /source.*\;organism=.*/) { my ($org_name_) = ($line0 =~ /source.*\;organism=(.*)\s*/); $org_name = $org_name_; $org_name =~ s/\,//g; } return $org_name; } sub crispr_call{ my ($dir, $id, $cd_cutoff, $session) = @_; if (! (-d "CRISPRDetect_2.3/work" . $session)) { my $flag = system("mkdir CRISPRDetect_2.3/work" . $session) } my $cmd1 = 'cp ' .$dir . '/' . $id . '.gbff CRISPRDetect_2.3/work' . $session; my $cmd2 = "CRISPRDetect_2.3"; my $cmd3 = 'perl CRISPRDetect.pl -g work' . $session . '/' . $id . '.gbff -o ' . $id . '_repeat.txt' . ' -array_quality_score_cutoff ' . $cd_cutoff . ' > /dev/null'; my $cmd4 = ".."; my $cmd5 = 'rm -f CRISPRDetect_2.3/work' . $session . '/' . $id . '.gbff'; my $cmd6 = 'mv CRISPRDetect_2.3/' . $id . '_repeat.txt ' . $dir . '/' . $id . '_repeat.txt'; my $cmd7 = 'mv CRISPRDetect_2.3/' . $id . '_repeat.txt.fp ' . $dir . '/' . $id . '_repeat.fp'; my $cmd8 = 'mv CRISPRDetect_2.3/' . $id . '_repeat.txt.gff ' . $dir . '/' . $id . '_repeat.gff'; my $cmd9 = 'rm -rf CRISPRDetect_2.3/work' . $session; my $flag = system($cmd1); my $flag = chdir($cmd2); my $flag = system($cmd3); my $flag = chdir($cmd4); my $flag = system($cmd5); my $flag = system($cmd6); my $flag = system($cmd7); my $flag = system($cmd8); my $flag = system($cmd9); my $output_file_size = -s $dir . '/' . $id . '_repeat.txt'; if(! (defined $output_file_size and $output_file_size > 5)) { unlink($dir . '/' . $id . '_repeat.txt'); unlink($dir . '/' . $id . '_repeat.gff'); unlink($dir . '/' . $id . '_repeat.fp'); } return $flag; } sub blast_call { my ($dir_name, $e_cutoff, $around_cas9, $range, $rr_unmask_start, $rr_unmask_end, $mask_all_genes) = @_; open(LOOKUP_R, "$dir_name/blast_lookup.csv"); my @contents = ; close(LOOKUP_R); my $flag = 0; my $count = 0; open(LOOKUP_RH, ">>$dir_name/rh_lookup.csv"); for (my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; $line =~ s/\n//g; my @toks = split(/\,/, $line); my $q_path = @toks[10]; my $rr_id = @toks[3]; my $rr_score = @toks[4]; my $fname_out = @toks[12]; my $contig = @toks[1]; my $contig_seq_path = @toks[11]; my @coords = get_cas9_range_psiblast($dir_name, $contig); if (scalar(@coords) == 0) { @coords = get_cas9_range_hmm($dir_name, $contig); } else { if (scalar(@{$coords[0]} == 0)) { @coords = get_cas9_range_hmm($dir_name, $contig); } } if ($around_cas9 == 1) { if (scalar(@coords) == 0) { next; } else { if (scalar(@{$coords[0]} == 0)) { next; } } } my $rptsum_path = "$dir_name/" . $contig . "_repeat_summary.csv"; my @rptsum_file_contents = (); if (-e $rptsum_path) { open (RPTSUM, $rptsum_path); @rptsum_file_contents = ; close(RPTSUM); } my $s_path_cpy = "$dir_name/$contig" . "_cp.fa"; my $s_path_temp = "$dir_name/$contig" . "_mt.fa"; my $s_path = "$dir_name/$contig" . "_m.fa"; my @pos_x = (); my @pos_y = (); $flag = fa_cpy($contig_seq_path, $s_path_cpy); #my $path0 = $s_path_cpy; for (my $j = 0; $j < scalar(@rptsum_file_contents); $j++) { my $line = @rptsum_file_contents[$j]; $line =~ s/\n//g; my @toks = split(/\,/, $line); my $start = @toks[3]; my $end = @toks[4]; my $score = @toks[1]; if ($score < 1.5) { next; } if ($end < $start) { my $temp = $start; $start = $end; $end = $temp; } my $new_start = $start + $rr_unmask_start; if ($new_start < 0) { $new_start = 0; } my $new_end = $end - $rr_unmask_end; if ($new_end < length($seq) - 1) { $new_end = length($seq) - 1; } push(@pos_x, $new_start); push(@pos_y, $new_end); } my $gene_gff_path = "$dir_name/" . $contig . "_gene.gff"; my @gene_gff_contents = (); if (-e $gene_gff_path) { open (GGFF, $gene_gff_path); @gene_gff_contents = ; close(GGFF); } if ($mask_all_genes == 1) { for (my $i = 0; $i < scalar(@gene_gff_contents); $i++) { if ($i == 0) { next; } my $line = @gene_gff_contents[$i]; $line =~ s/\n//g; my @toks = split(/[\t]/, $line); my $start = @toks[3]; my $end = @toks[4]; if ($end < $start) { my $temp = $start; $start = $end; $end = $temp; } push(@pos_x, $start); push(@pos_y, $end); } } my $path0 = $s_path_cpy; $flag = create_mask_include_from_coords($path0, $s_path_temp, \@pos_x, \@pos_y); unlink($path0); $flag = fa_cpy($s_path_temp, $path0); unlink($s_path_temp); $flag = system("mv $path0 $s_path_temp"); if ($around_cas9 == 1) { my $cas9_start = $coords[0][0] - 1; my $cas9_end = $coords[1][0] - 1; my $ms = $cas9_start - $range; my $me = $cas9_end + $range; $flag = create_mask_exclude($s_path_temp, $s_path, $ms, $me, 500); unlink($s_path_temp); } else { $flag = system("mv $s_path_temp $s_path"); } if ($rr_score < 3.0) { next; } print "Running BLASTN: $rr_id vs. $contig\n"; $flag = system("blastn -query $q_path -subject $s_path -word_size 7 " . " -lcase_masking -out $fname_out -gapopen 2 " . " -gapextend 1 -penalty -1 -task blastn-short -outfmt 6 -evalue $e_cutoff"); unlink($s_path); my $str = ''; for (my $j = 0; $j < scalar(@toks) - 3; $j++) { $str .= @toks[$j]; $str .= ","; } my $bpath = @toks[12]; $flag = system("sort $bpath \| uniq > $dir_name/temp.txt"); unlink($bpath); $flag = system("mv $dir_name/temp.txt $bpath"); open(BLAST, $bpath); my @contents = ; close(BLAST); if (scalar(@contents) == 0) { next; } for (my $j = 0; $j < scalar(@contents); $j++) { $count++; my $line = @contents[$j]; $line =~ s/\n//g; my @toks = split(/[\t]/, $line); my $start = @toks[8]; my $end = @toks[9]; my $evalue = @toks[10]; if ($end < $start) { my $temp = $start; $start = $end; $end = $temp; } my $exstart = $start - 120; my $exend = $end + 120; if ($exstart < 0) { $exstart = 0; } my $exstart_2 = $start - 30; my $exend_2 = $end + 30; if ($exstart < 2) { $exstart = 0; } open(FAC, $contig_seq_path); my @fa = ; close(FAC); my $seq = @fa[1]; if ($exend > length($seq) - 1) { $exend = length($seq) - 1; } my $head = $contig . "_" . $count . "_GE_" . $exstart . "_" . $start . "_" . $end . "_" . $exend; my $exname = $head . ".fa"; my $exname2 = $head . "_loc.fa"; open(FAEXT, ">>$dir_name/$exname"); print FAEXT ">$head\n"; $seq =~ s/\n//g; $seqext = substr($seq, $exstart, $exend - $exstart + 1); print FAEXT $seqext . "\n"; close(FAEXT); open(FAEXT, ">>$dir_name/$exname2"); print FAEXT ">$head\n"; $seq =~ s/\n//g; $seqext = substr($seq, $exstart_2, $exend_2 - $exstart_2 + 1); print FAEXT $seqext . "\n"; close(FAEXT); my $str2 = $str; $str2 .= "$start,$end,$evalue,0,$q_path,$dir_name/$exname"; print LOOKUP_RH $str2 . "\n"; $str2 = $str; $str2 .= "$start,$end,$evalue,1,$q_path,$dir_name/$exname"; print LOOKUP_RH $str2 . "\n"; } } close(LOOKUP_RH); return $flag; } sub fa_cpy { my ($file_in, $file_out) = @_; open(FA, $file_in); my @fa = ; close(FA); my $head = @fa[0]; my $seq = @fa[1]; $head =~ s/\n//g; $seq =~ s/\n//g; open(FAO, ">>$file_out"); print FAO $head . "\n"; print FAO $seq . "\n"; close(FAO); return 1; } sub create_mask_include_from_coords { my ($file_in, $file_out, $start_coords, $end_coords) = @_; open(FA, $file_in); my @fa = ; close(FA); my $head = @fa[0]; my $seq = @fa[1]; $head =~ s/\n//g; $seq =~ s/\n//g; my @x_coords = @{$start_coords}; my @y_coords = @{$end_coords}; my $mseq = $seq; for (my $i = 0; $i < scalar(@x_coords); $i++) { my $start = @x_coords[$i]; my $end = @y_coords[$i]; my $subseq = substr($mseq, $start, $end - $start + 1); $subseq = lc($subseq); my $temp = substr($mseq, $start, $end - $start + 1, $subseq); } open(FAO, ">>$file_out"); print FAO $head . "\n"; print FAO $mseq . "\n"; close(FAO); return 1; } sub create_mask_include { my ($file_in, $file_out, $start, $end) = @_; open(FA, $file_in); my @fa = ; close(FA); my $head = @fa[0]; my $seq = @fa[1]; $head =~ s/\n//g; $seq =~ s/\n//g; my $subseq = substr($seq, $start, $end - $start + 1); $subseq = lc($subseq); my $mseq = substr($seq, $start, $end - $start + 1, $subseq); open(FAO, ">>$file_out"); print FAO $head . "\n"; print FAO $seq . "\n"; close(FAO); return 1; } sub create_mask_exclude_from_coords { my ($file_in, $file_out, $start_coords, $end_coords, $ignore) = @_; open(FA, $file_in); my @fa = ; close(FA); my $head = @fa[0]; my $seq = @fa[1]; $head =~ s/\n//g; $seq =~ s/\n//g; my @x_coords = @{$start_coords}; my @y_coords = @{$end_coords}; my $mseq = $seq; for (my $i = 0; $i < scalar(@x_coords); $i++) { my $start = @x_coords[$i]; my $end = @y_coords[$i]; if ($start >= $ignore) { my $subseq = substr($mseq, 0, $start + 1); $subseq = lc($subseq); my $temp = substr($mseq, 0, $start + 1, $subseq); } if ((length($mseq) - $end) >= $ignore) { my $subseq = substr($mseq, $end, length($mseq) - $end); $subseq = lc($subseq); my $temp = substr($mseq, $end, length($mseq) - $end, $subseq); } } open(FAO, ">>$file_out"); print FAO $head . "\n"; print FAO $mseq . "\n"; close(FAO); return 1; } sub create_mask_exclude { my ($file_in, $file_out, $start, $end, $ignore) = @_; open(FA, $file_in); my @fa = ; close(FA); my $head = @fa[0]; my $seq = @fa[1]; $head =~ s/\n//g; $seq =~ s/\n//g; if ($start >= $ignore) { my $subseq = substr($seq, 0, $start + 1); $subseq = lc($subseq); my $mseq = substr($seq, 0, $start + 1, $subseq); } if ((length($seq) - $end) >= $ignore) { my $subseq = substr($seq, $end, length($seq) - $end); $subseq = lc($subseq); my $mseq = substr($seq, $end, length($seq) - $end, $subseq); } open(FAO, ">>$file_out"); print FAO $head . "\n"; print FAO $seq . "\n"; close(FAO); return 1; } sub get_cas9_range_psiblast { my ($dir, $id) = @_; my $path = $dir . '/' . 'psiblast_result.txt'; my @coords = (); if (! -e $path) { return @coords; } my $flag = system("cat $path \| sort -t \$\'\\t\' -k11n,11 \| sort -u -t \$\'\\t\' -k2,2 > $dir/temp_psi.txt"); unlink($path); $flag = system("mv $dir/temp_psi.txt $path"); open(PSILOOKUP, $path) or die "can't open TXT file"; my @contents = ; close(PSILOOKUP); my @xinds = (); my @yinds = (); my @ids = (); my @strands = (); my @types = (); my @cas_types = (); my $chk = 0; for (my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; if (($line !~ /\w+\s+\w+\,\w{2}_\d+\.\d+\,\d+\,\d+\,\d+\,.*\s+/)) { next; } my @toks = split(/[\t]/, $line); my ($gid, $cid, $start, $end, $strand, $type) = (@toks[1] =~ /(\w+)\,(\w{2}_\d+\.\d+)\,(\d+)\,(\d+)\,(\d+)\,(.*)/); my $n = scalar(@toks); my $cas_type = @toks[$n - 1]; if ($gid eq $id) { $chk++; push(@xinds, $start); push(@yinds, $end); push(@ids, $cid); push(@strands, $strand); push(@types, $type); push(@cas_types, $cas_type); } } if ($chk > 0) { push(@coords, [@xinds]); push(@coords, [@yinds]); push(@coords, [@ids]); push(@coords, [@strands]); push(@coords, [@types]); push(@coords, [@cas_types]); } return @coords; } sub get_cas9_range_hmm { my ($dir, $id) = @_; my $path = $dir . '/' . 'hmm_result.txt'; my @coords = (); if (! -e $path) { return @coords; } my $flag = system("cat $path \| sort -t \$\'\\t\' -k4n,4 \| sort -u -t \$\'\\t\' -k1,1 > $dir/temp_hmm.txt"); unlink($path); $flag = system("mv $dir/temp_hmm.txt $path"); open(PSILOOKUP, $path) or die "can't open TXT file"; my @contents = ; close(PSILOOKUP); my @xinds = (); my @yinds = (); my @ids = (); my @strands = (); my @types = (); my @cas_types = (); my $chk = 0; for (my $i = 0; $i < scalar(@contents); $i++) { my $line = @contents[$i]; if (($line !~ /\w+\,\w{2}_\d+\.\d+\,\d+\,\d+\,\d+\,.*\s+/)) { next; } my @toks = split(/[\t]/, $line); my ($gid, $cid, $start, $end, $strand, $type) = (@toks[0] =~ /(\w+)\,(\w{2}_\d+\.\d+)\,(\d+)\,(\d+)\,(\d+)\,(.*)/); my $cas_type = @toks[1]; if ($cas_type eq "cas9") { $cas_type = "cas9_unspecified"; } if ($gid eq $id) { $chk++; push(@xinds, $start); push(@yinds, $end); push(@ids, $cid); push(@strands, $strand); push(@types, $type); push(@cas_types, $cas_type); } } if ($chk > 0) { push(@coords, [@xinds]); push(@coords, [@yinds]); push(@coords, [@ids]); push(@coords, [@strands]); push(@coords, [@types]); push(@coords, [@cas_types]); } return @coords; } sub get_unique_id { my $letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ"; my @arr_letters=split('',$letters); my $unique_id = time(); for(my $i = 0; $i < int(rand(50)); $i++){ $unique_id .= $arr_letters[int(rand($#arr_letters))]; } return $unique_id; } return 1;