use Bio::Tools::GFF;
use Bio::SeqIO;

sub remove_repeat_elem {
    my %seen;
    grep(!$seen{$_}++, @_);
}

sub gbk_2_gff {
	my ($seqfile, $session) = @_;
	
	my $seqio = new Bio::SeqIO(-format => 'genbank',
			   -file   => $seqfile);
	my $count = 0;
	
	while( my $seq = $seqio->next_seq ) {
		$count++;
		my $fname = sprintf("work_01$session/%s.gff", $seq->display_id || "seq-$count");
		my $gffout = new Bio::Tools::GFF(-file => ">$fname" ,
				     -gff_version => 3);
				     
		my $fname_2 = sprintf("work_01$session/%s_gene.gff", $seq->display_id || "seq-$count");
		my $gffout_2 = new Bio::Tools::GFF(-file => ">$fname_2" ,
				     -gff_version => 3);
				     
		my $fname_fa = sprintf("work_01$session/%s.fa", $seq->display_id || "seq-$count");
		my $fa_head = sprintf(">%s", $seq->display_id || "seq-$count");
		open(fa_out, '>>'.$fname_fa) or die "can't create fa file";
		print fa_out $fa_head . "\n";
		print fa_out $seq->seq . "\n";
		close(fa_out);		          
		foreach my $feature ( $seq->top_SeqFeatures() ) {
			$gffout->write_feature($feature);
			my $tag = $feature->primary_tag();
			if ($tag eq "gene") {
				$gffout_2->write_feature($feature);
			}
    	}
	}
	
	return $count;
}

sub split_gbff_file {
	my($input_gbff_file, $session) = @_;
	my $gbk_names;
	my $count = split_gbff_file_to_individual_gbk_files($input_gbff_file, $gbk_names, $session);
	return $count;
}

sub split_gbff_file_to_individual_gbk_files {
	my($input_gbff_file,$arr_gbk_files,$session) = @_;
		
	my @arr_line=`less $input_gbff_file >&1`;
	my $last_acc="";
	my $count = 0;
	for(my $i = 0; $i <= $#arr_line; $i++) {
		if($arr_line[$i]=~/^LOCUS     /){
			$count++;
			my @arr_t1=split('\s+',$arr_line[$i]);
			$last_acc = $arr_t1[1];
								
			my $j = $i;
			my $gbk_file = "work_01$session/" . $last_acc . ".gbff";
			open(WR,">$gbk_file");
						
			while($arr_line[$j]!~/^\/\//){
				print WR "$arr_line[$j]";			
				$j++;
				if($j >= $#arr_line){
					last;
				}
			}
							
			print WR "$arr_line[$j]";	
			close(WR);
			push(@{$arr_gbk_files},$gbk_file);
				
			$i = $j;					
		}
	}		
	return $count;	
}

sub process_gbff_psiblast {
	my ($seqfile, $session) = @_;
	
	my $count = 0;
	
	if (! (-d "work_01$session")) {
		my $flag = system("mkdir work_01$session");
	}
	
	print "Split GBFF file by contigs...\n";
	$count = split_gbff_file($seqfile, $session);
	my $n_contigs = $count;
	if ($count == 0) {
		return 0;
	}
	
	$count = 0;
	my @gbff_paths = glob("work_01$session/*.gbff");
	
	print "Convert GBFF files into GFF files...\n";
	for (my $i = 0; $i < scalar(@gbff_paths); $i++) {
		$count += gbk_2_gff(@gbff_paths[$i], $session);
	}
	
	if ($count == 0) {
		return 0;
	}
	
	if (-d "psiblast_files$session") {
		my $flag = system("rm -rf psiblast_files$session");
	}
	
	print "Set up for PSI-BLAST...\n";
	my $flag = system("mkdir psiblast_files$session");
	
	my @file_paths = glob("work_01$session/*.*");
	
	for (my $i = 0; $i < scalar(@file_paths); $i++) {
		my $line = @file_paths[$i];
		my ($dir, $fname_ext) = ($line =~ m|^(.*[/\\])([^/\\]+?)$|); 
		my $cmd = 'mv ' . $line . ' psiblast_files' . $session;
		my $flag = system($cmd);
	}
	
	my @gff_file_paths = glob("psiblast_files$session/*.gff");
	
	print "Extract amino acid sequences...\n";
	open(FAAW, ">>psiblast_files$session/proteins.faa") or die "can't create faa file";
	
	for (my $i = 0; $i < scalar(@gff_file_paths); $i++) {
	
		my $path = @gff_file_paths[$i];
		my ($dir, $fname_ext) = ($path =~ m|^(.*[/\\])([^/\\]+?)$|);
		my $contig_id = $fname_ext;
		$contig_id =~ s/\./_/g;
		$contig_id =~ s/_gff//g;
	
		open(GFFCON, $path) or die "can't open gff file";
	
		while (<GFFCON>) {
			my $line = $_;
			if ($line =~ 
			    /CDS\t\d+\t\d+.*[\+\-].*codon_start=.*\;product=.*\;protein_id=\w{2,3}_*\d+\.\d+\;.*\;translation=.*/) {
			    
				my ($start, $end, $strand, $type, $pid, $pseq) 
				= ($line =~ /CDS\t(\d+)\t(\d+).*([\+\-]).*codon_start=.*\;product=(.*)\;protein_id=(\w{2,3}_*\d+\.\d+)\;.*\;translation=(.*)/);
				$pseq =~ s/\n//g;
			
				my $ori = 1;
				if ($strand =~ /\-/) {
					$ori = 0;
				}
			
				print FAAW ">" . $contig_id . "," . $pid . "," . $start . "," . $end . "," . $ori . "," . $type . "\n";
				print FAAW $pseq . "\n";
			}
		}
		close(GFFCON);
	}
	
	close(FAAW);
	
	print "Create BLAST database (type = prot)...\n";
	my $flag = system('makeblastdb -in psiblast_files' . $session . '/proteins.faa -input_type fasta ' . 
	                  '-dbtype prot -out psiblast_files' . $session . '/proteins -parse_seqids');
	
	print "Run PSI-BLAST...\n";
	my @sr_names = ('mkCas0193', 'cd09643', 'COG3513', 'cd09704', 'mkCas0192');
	my @sr_type = ('cas9_mkCas0193', 'cas9_cd09643', 'cas9_COG3513', 'cas9_cd09704', 'cpf1');
	
	for (my $i = 0; $i < scalar(@sr_names); $i++) {
		$flag = system('psiblast -in_msa SRS/' . @sr_names[$i] . '.sr -num_threads 4 -num_iterations 2 ' . 
	                   '-db psiblast_files' . $session . '/proteins -out psiblast_files' . $session . 
	                   '/Cas9_' . @sr_names[$i] . '.txt ' . 
	                   '-evalue 0.0000001 -outfmt 6 -num_alignments 1 -comp_based_stats 1');
	}
	
	$flag = system("rm -rf work_01$session");
	
	open(PSICOLLECT, ">>psiblast_files$session/psiblast_result.txt"); 
	
	my @rs_collect = ();
	my @pid_collect = ();
	
	for (my $i = 0; $i < scalar(@sr_names); $i++) {
		open(PSIOUTn, 'psiblast_files' . $session . '/Cas9_' . @sr_names[$i] . '.txt') or die "can't open TXT file";
		my @psi_contents_n = <PSIOUTn>;
		close(PSIOUTn);
		
		my $cas_type = @sr_type[$i];
		for (my $j = 0; $j < scalar(@psi_contents_n); $j++) {
			my $line = @psi_contents_n[$j]; 
			$line =~ s/\n//g;
			$line .= "\t$cas_type";	
			if ($line =~/\w+\s+[\w\d_]+\,.*\,/) {
				my ($pid) = ($line =~ /\w+\s+[\w\d_]+\,(\w{2}_\d+\.\d+)\,\d+\,\d+\,\d+\,.*\s+/);
				push(@rs_collect, $line);
				push(@pid_collect, $pid);
			}
		}
	}
	
	#@pid_collect = remove_repeat_elem(@pid_collect);
	
	for (my $j = 0; $j < scalar(@rs_collect); $j++) {
		my $line = @rs_collect[$j];
		print PSICOLLECT $line . "\n";
	}
	
	#A: for (my $i = 0; $i < scalar(@pid_collect); $i++) {
	#	B: for (my $j = 0; $j < scalar(@rs_collect); $j++) {
	#		if (index(@rs_collect[$j], @pid_collect[$i]) >= 0) {
	#			my $line = @rs_collect[$j];
	#			print PSICOLLECT $line . "\n";
	#			last B;
	#		}
	#	}
	#}
	
	close(PSICOLLECT);
	
	print "Extract Cas9 matches from PSI-BLAST output...\n";
	open(PSIOUT, "psiblast_files$session/psiblast_result.txt") or die "can't open TXT file";
	my @psi_contents = <PSIOUT>;
	close(PSIOUT);
	
	my @ids = ();
	for (my $i = 0; $i < scalar(@psi_contents); $i++) {
		my $line = @psi_contents[$i];
		
		if ($line =~/\w+\s+[\w\d_]+\,.*\,/) {
			my ($id) = ($line =~/\w+\s+([\w\d_]+)\,.*\,/);
			push(@ids, $id);
		} else {
			next;
		}
	}
	
	@ids = remove_repeat_elem(@ids);
	
	if (scalar(@ids) == 0) {
		return 0;
	}
	
	return $n_contigs;
}

sub run_hmmsearch {
	my ($session) = @_;
	
	print "Run HMMSEARCH...\n";
	my @cas_genes = ("c2c1", "cas9", "CasX", "CasY", "Cas9_archaeal");
	
	for (my $i = 0; $i < scalar(@cas_genes); $i++) {
		my $cas = @cas_genes[$i];
		my $cas_hmm = "HMM/$cas.hmm";
		my $faapath = "psiblast_files$session/proteins.faa";
		my $outfile = "psiblast_files$session/Cas9_" . $cas . "_hmm.txt";
		my $tracefile = "psiblast_files$session/Cas9_" . $cas . "_hmm_tr.txt";
		my $flag = system("hmmsearch --tblout $outfile --noali -E 1e-6 $cas_hmm $faapath > $tracefile");
	}
	
	open(HMMCOLLECT, ">>psiblast_files$session/hmm_result.txt"); 
	
	my @rs_collect = ();
	my @pid_collect = ();
	
	for (my $i = 0; $i < scalar(@cas_genes); $i++) {
		open(HMMOUTn, 'psiblast_files' . $session . '/Cas9_' . @cas_genes[$i] . '_hmm.txt') or die "can't open TXT file";
		my @hmm_contents_n = <HMMOUTn>;
		close(HMMOUTn);
		
		for (my $j = 0; $j < scalar(@hmm_contents_n); $j++) {
			my $line = @hmm_contents_n[$j]; 
			$line =~ s/\n//g;
			
			if ($line =~ /^#.*/) {
				next;
			} 
			
			my @toks = split(/\,/, $line);
			my $pid = @toks[1];
			
			$line =~ s/\s+\-\s+/ /g;
			$line =~ s/\s+/\t/g;
			
			push(@rs_collect, $line);
			push(@pid_collect, $pid);
		}
	}
	
	#@pid_collect = remove_repeat_elem(@pid_collect);
	
	for (my $j = 0; $j < scalar(@rs_collect); $j++) {
		my $line = @rs_collect[$j];
		print HMMCOLLECT $line . "\n";
	}
	
	#A: for (my $i = 0; $i < scalar(@pid_collect); $i++) {
	#	B: for (my $j = 0; $j < scalar(@rs_collect); $j++) {
	#		if (index(@rs_collect[$j], @pid_collect[$i]) >= 0) {
	#			my $line = @rs_collect[$j];
	#			print HMMCOLLECT $line . "\n";
	#			last B;
	#		}
	#	}
	#}
	
	close(HMMCOLLECT);
	
	print "Extract Cas9 matches from HMMSEARCH output...\n";
	open(HMMOUT, "psiblast_files$session/hmm_result.txt") or die "can't open TXT file";
	my @hmm_contents = <HMMOUT>;
	close(HMMOUT);
	
	my @ids = ();
	for (my $i = 0; $i < scalar(@hmm_contents); $i++) {
		my $line = @hmm_contents[$i];
		$line =~ s/\n//g;
		
		my @toks = split(/\,/, $line);
		my $pid = @toks[1];
		push(@ids, $pid);
	}
	
	@ids = remove_repeat_elem(@ids);
	
	if (scalar(@ids) == 0) {
		return 0;
	}
	
	return 1;
}

sub handle_crispr_repeats_data_txt {
	my ($dir, $id, $revcom) = @_;
	my @ids = ();
    my @repeats = ();
    my @scores = ();
    my @reg_starts = ();
    my @reg_ends = ();
    my @reg_oris = ();
    my @cas_lines = ();
    
    if (! -e $dir . '/' . $id . '_repeat.txt') {
    	return 0;
    }
    
    open(TXT, $dir . '/' . $id . '_repeat.txt') or die "cannot open txt file.";

	my @txt_contents = <TXT>;
	my $n = scalar(@txt_contents);
	if (scalar(@txt_contents) == 0) {
		close(TXT);
		return 0;
	}
	
	for (my $i = 0; $i < scalar(@txt_contents); $i++) {
		my $line = @txt_contents[$i];
		if ($line =~ /Array\s\d+\s+\d+-\d+\s*/) {
			my $id_line = $line;
			my ($arr_ind, $r_start, $r_end) = 
			   ($id_line =~ /Array\s(\d+)\s+(\d+)-(\d+)\s*/);
			my $id_code = 'CRISPR' . $arr_ind . '_' . $r_start . '_' . $r_end;
			push(@ids, $id_code);
			
			if ($r_start < $r_end) {
				push(@reg_starts, $r_start);
				push(@reg_ends, $r_end);
			} else {
				push(@reg_starts, $r_end);
				push(@reg_ends, $r_start);
			}
			
			my $j = $i;
			
			my $rep_line_found = 0;
			my $score_line_found = 0;
			my $ori_line_found = 0;
			my $idf_cas_line_found = 0;
			while(@txt_contents[$j] !~ /^\/\//) {
				if (@txt_contents[$j] =~ />.*Array_Orientation\:\s+/) {
					$ori_line_found = 1;
					my $ro = @txt_contents[$j];
					my ($ori) = ($ro =~ />.*Array_Orientation\:\s+(\w+)/);
					if ($ori =~ /Forward/) {
						push(@reg_oris, 1);
					} else {
						push(@reg_oris, 0);
					}
				}
			
				if (@txt_contents[$j] =~ /#\sPrimary\srepeat\s\:\s+/) {
					$rep_line_found = 1;
					my $ro = @txt_contents[$j];
					$ro =~ s/\n//g;
					if ($ro =~ /\:\s+[ATCG]+$/) {
						my ($rep) = ($ro =~ /\:\s+([ATCG]+$)/);
						push(@repeats, $rep);
					} else {
						push(@repeats, '');
					}
				}
				
				if (@txt_contents[$j] =~ /#\sQuestionable\sarray\s\:\s/) {
					$score_line_found = 1;
					my $ro = @txt_contents[$j];
					$ro =~ s/\n//g;
					if ($ro =~ /Score\:\s*\d+\.\d+/) {
						my ($score) = ($ro =~ /Score\:\s*(\d+\.\d+)/);
						push(@scores, $score);
					} else {
						push(@scores, 0.0);
					}
				}
				
				if (@txt_contents[$j] =~ /#\sIdentified\sCas\sgenes\:\s/) {
					$idf_cas_line_found = 1;
					my $gln = @txt_contents[$j];
					$gln =~ s/\n//g;
					$gln =~ s/\,//g;
					push(@cas_lines, $gln);
				}
				$j++;
				if ($j >= scalar(@txt_contents)) {
					last;
				}
			}
			$i = $j;
			
			if ($rep_line_found == 0) {
				push(@repeats, '');
			}
			
			if ($score_line_found == 0) {
				push(@scores, 0.0);
			}
			
			if ($ori_line_found == 0) {
				push(@reg_oris, 1);
			}
			
			if ($idf_cas_line_found == 0) {
				push(@cas_lines, '');
			}
		}
	}
	
	close(TXT);
	
	if (scalar(@ids) == 0) {
		return 0;
	}
	
	for (my $i = 0; $i < scalar(@ids); $i++) {
		my $ii = $i + 1;
		my $tag = $id . '_' . $ii . '_RE_' . @ids[$i];
		open(FAO2, '>>' . $dir . '/' . $tag . ".fa") or die "cannot open gff file.";
		print FAO2 ">" . $id . '_' . $ii . '_' . @ids[$i] . "\n";
		print FAO2 @repeats[$i];
		close(FAO2);
		
		if ($revcom == 0) {
			next;
		}
		$tag = $id . '_' . $ii . '_RE_' . @ids[$i] . "_RC";
		open(FAO3, '>>' . $dir . '/' . $tag . ".fa") or die "cannot open gff file.";
		print FAO3 ">" . $id . '_' . $ii . '_' . @ids[$i] . "\n";
		
		my $rs = @repeats[$i];
		$rs =~ tr/ATCG/TAGC/;
		$rs = reverse($rs);		

		print FAO3 $rs;
		close(FAO3);
	}
	
	open(CSVRS, '>>' . $dir . '/' . $id . "_repeat_summary.csv") 
	or die "cannot open csv file.";

	for (my $i = 0; $i < scalar(@ids); $i++) {
		my $ii = $i + 1;
		my $tag = $id . '_' . $ii . '_RE_' . @ids[$i];
		my $path = $dir . '/' . $tag . ".fa";
		print CSVRS @ids[$i] . "\," . @scores[$i] . "\," . @repeats[$i] . "\," .
					@reg_starts[$i] . "\," . @reg_ends[$i] . "\," . @reg_oris[$i] . "\," .
		            @cas_lines[$i] . "\," . $path . "\n";
		            
		if ($revcom == 0) {
			next;
		}
		$tag = $id . '_' . $ii . '_RE_' . @ids[$i] . "_RC";
		$path = $dir . '/' . $tag . ".fa";
		print CSVRS @ids[$i] . "_RC" . "\," . @scores[$i] . "\," . @repeats[$i] . "\," .
					@reg_starts[$i] . "\," . @reg_ends[$i] . "\," . @reg_oris[$i] . "\," .
		            @cas_lines[$i] . "\," . $path . "\n"; 
	}
	
	close(CSVRS);
	
	return 1;
    
}

sub get_organism {
	my ($dir, $id) = @_;
	my $cmd = 'grep source ' . $dir . '/' . $id . '.gff';
	my @grep_result = `$cmd`;
	my $line0 = @grep_result[0];
	my $org_name = 'Unknown species';
	if ($line0 =~ /source.*\;organism=.*/) {
		my ($org_name_) = ($line0 =~ /source.*\;organism=(.*)\s*/);
		$org_name = $org_name_;
		$org_name =~ s/\,//g;
	}
	return $org_name;
}

sub crispr_call{

	my ($dir, $id, $cd_cutoff, $session) = @_;
	
	if (! (-d "CRISPRDetect_2.3/work" . $session)) {
		my $flag = system("mkdir CRISPRDetect_2.3/work" . $session)
	}

	my $cmd1 = 'cp ' .$dir . '/' . $id . '.gbff CRISPRDetect_2.3/work' . $session; 
	my $cmd2 = "CRISPRDetect_2.3";
	my $cmd3 = 'perl CRISPRDetect.pl -g work' . $session . '/' . $id . '.gbff -o ' . $id . '_repeat.txt' . ' -array_quality_score_cutoff ' . $cd_cutoff . ' > /dev/null';
	my $cmd4 = "..";
	my $cmd5 = 'rm -f CRISPRDetect_2.3/work' . $session . '/' . $id . '.gbff';
	my $cmd6 = 'mv CRISPRDetect_2.3/' . $id . '_repeat.txt ' . $dir . '/' . $id . '_repeat.txt';
	my $cmd7 = 'mv CRISPRDetect_2.3/' . $id . '_repeat.txt.fp ' . $dir . '/' . $id . '_repeat.fp';
	my $cmd8 = 'mv CRISPRDetect_2.3/' . $id . '_repeat.txt.gff ' . $dir . '/' . $id . '_repeat.gff';
	my $cmd9 = 'rm -rf CRISPRDetect_2.3/work' . $session;

	my $flag = system($cmd1);
	my $flag = chdir($cmd2);
	my $flag = system($cmd3);
	my $flag = chdir($cmd4);
	my $flag = system($cmd5);
	my $flag = system($cmd6);
	my $flag = system($cmd7);
	my $flag = system($cmd8);
	my $flag = system($cmd9);
	
	my $output_file_size = -s $dir . '/' . $id . '_repeat.txt';
	if(! (defined $output_file_size and $output_file_size > 5)) {
		unlink($dir . '/' . $id . '_repeat.txt');
		unlink($dir . '/' . $id . '_repeat.gff');
		unlink($dir . '/' . $id . '_repeat.fp');
	}
	
	return $flag;
}

sub blast_call {

	my ($dir_name, $e_cutoff, $around_cas9, $range, $rr_unmask_start, $rr_unmask_end, $mask_all_genes) = @_;

	open(LOOKUP_R, "$dir_name/blast_lookup.csv");
	my @contents = <LOOKUP_R>;
	close(LOOKUP_R);

	my $flag = 0;
	my $count = 0;
	open(LOOKUP_RH, ">>$dir_name/rh_lookup.csv");
	for (my $i = 0; $i < scalar(@contents); $i++) {
		my $line = @contents[$i];
		$line =~ s/\n//g;
	
		my @toks = split(/\,/, $line);
		my $q_path = @toks[10];
		my $rr_id = @toks[3];
		my $rr_score = @toks[4];
		my $fname_out = @toks[12];
		
		my $contig = @toks[1];
		my $contig_seq_path = @toks[11];
		
		my @coords = get_cas9_range_psiblast($dir_name, $contig);
		
		if (scalar(@coords) == 0) {
			@coords = get_cas9_range_hmm($dir_name, $contig);
		} else {
			if (scalar(@{$coords[0]} == 0)) {
				@coords = get_cas9_range_hmm($dir_name, $contig);
			}
		}
		
		if ($around_cas9 == 1) {
			if (scalar(@coords) == 0) {
				next;
			} else {
				if (scalar(@{$coords[0]} == 0)) {
					next;
				}
			}
		}
		
		my $rptsum_path = "$dir_name/" . $contig . "_repeat_summary.csv";
		my @rptsum_file_contents = ();
		
		if (-e $rptsum_path) {	
			open (RPTSUM, $rptsum_path);
			@rptsum_file_contents = <RPTSUM>;
			close(RPTSUM);
		}
		
		my $s_path_cpy = "$dir_name/$contig" . "_cp.fa";
		my $s_path_temp = "$dir_name/$contig" . "_mt.fa";
		my $s_path = "$dir_name/$contig" . "_m.fa";
		
		my @pos_x = ();
		my @pos_y = ();
		
		$flag = fa_cpy($contig_seq_path, $s_path_cpy);
		
		#my $path0 = $s_path_cpy;
		for (my $j = 0; $j < scalar(@rptsum_file_contents); $j++) {
			my $line = @rptsum_file_contents[$j];
			$line =~ s/\n//g;
			
			my @toks = split(/\,/, $line);
			
			my $start = @toks[3];
			my $end = @toks[4];
			my $score = @toks[1];
			
			if ($score < 1.5) {
				next;
			}
		
			if ($end < $start) {
				my $temp = $start;
				$start = $end;
				$end = $temp;
			}
			
			my $new_start = $start + $rr_unmask_start; 
			if ($new_start < 0) {
				$new_start = 0;
			}
			
			my $new_end = $end - $rr_unmask_end; 
			if ($new_end < length($seq) - 1) {
				$new_end = length($seq) - 1;
			}
			
			push(@pos_x, $new_start);
			push(@pos_y, $new_end);
		}
		
		my $gene_gff_path = "$dir_name/" . $contig . "_gene.gff";
		my @gene_gff_contents = ();
		
		if (-e $gene_gff_path) {	
			open (GGFF, $gene_gff_path);
			@gene_gff_contents = <GGFF>;
			close(GGFF);
		}
		
		if ($mask_all_genes == 1) {
			for (my $i = 0; $i < scalar(@gene_gff_contents); $i++) {
				if ($i == 0) {
					next;
				}
			
				my $line = @gene_gff_contents[$i];
				$line =~ s/\n//g;
			
				my @toks = split(/[\t]/, $line);
				my $start = @toks[3];
				my $end = @toks[4]; 
			
				if ($end < $start) {
					my $temp = $start;
					$start = $end;
					$end = $temp;
				}
			
				push(@pos_x, $start);
				push(@pos_y, $end);
			}
		}
		
		my $path0 = $s_path_cpy;
		$flag = create_mask_include_from_coords($path0, $s_path_temp, \@pos_x, \@pos_y);
			
		unlink($path0);
		$flag = fa_cpy($s_path_temp, $path0);
		unlink($s_path_temp);
		
		$flag = system("mv $path0 $s_path_temp");
		
		if ($around_cas9 == 1) {
			my $cas9_start = $coords[0][0] - 1;
			my $cas9_end = $coords[1][0] - 1;
			
			my $ms = $cas9_start - $range;
			my $me = $cas9_end + $range;
				
			$flag = create_mask_exclude($s_path_temp, $s_path, $ms, $me, 500);
			unlink($s_path_temp);
		} else {
			$flag = system("mv $s_path_temp $s_path");
		}
		
		if ($rr_score < 3.0) {
			next;
		}
		
		print "Running BLASTN: $rr_id vs. $contig\n";
		
		$flag = system("blastn -query $q_path -subject $s_path -word_size 7 " . 
		               " -lcase_masking -out $fname_out -gapopen 2 " . 
		               " -gapextend 1 -penalty -1 -task blastn-short -outfmt 6 -evalue $e_cutoff");
		
		unlink($s_path);
		
		my $str = '';
		for (my $j = 0; $j < scalar(@toks) - 3; $j++) {
			$str .= @toks[$j];
			$str .= ",";
		}
		
		my $bpath = @toks[12];
		
		$flag = system("sort $bpath \| uniq > $dir_name/temp.txt");
		unlink($bpath);
		$flag = system("mv $dir_name/temp.txt $bpath"); 
		
		open(BLAST, $bpath);
		my @contents = <BLAST>;
		close(BLAST);
			
		if (scalar(@contents) == 0) {
			next;
		}
			
		for (my $j = 0; $j < scalar(@contents); $j++) {
			$count++;
			my $line = @contents[$j];
			$line =~ s/\n//g;
				
			my @toks = split(/[\t]/, $line);
			my $start = @toks[8];
			my $end = @toks[9];
			my $evalue = @toks[10];
				
			if ($end < $start) {
				my $temp = $start;
				$start = $end;
				$end = $temp;
			}
				
			my $exstart = $start - 120;
			my $exend = $end + 120;
			
			if ($exstart < 0) {
				$exstart = 0;
			}
			
			my $exstart_2 = $start - 30;
			my $exend_2 = $end + 30;
			
			if ($exstart < 2) {
				$exstart = 0;
			}
			
			open(FAC, $contig_seq_path);
			my @fa = <FAC>;
			close(FAC);
			
			my $seq = @fa[1];
			
			if ($exend > length($seq) - 1) {
				$exend = length($seq) - 1;
			}
			
			my $head = 	$contig . "_" . $count . "_GE_" . $exstart . "_" . $start . "_" . $end . "_" . $exend;
			my $exname = $head . ".fa";
			my $exname2 = $head . "_loc.fa";
			
			open(FAEXT, ">>$dir_name/$exname");
			print FAEXT ">$head\n";
			
			$seq =~ s/\n//g;
			$seqext = substr($seq, $exstart, $exend - $exstart + 1);
			print FAEXT $seqext . "\n";
			
			close(FAEXT);
			
			open(FAEXT, ">>$dir_name/$exname2");
			print FAEXT ">$head\n";
			
			$seq =~ s/\n//g;
			$seqext = substr($seq, $exstart_2, $exend_2 - $exstart_2 + 1);
			print FAEXT $seqext . "\n";
			
			close(FAEXT);
				
			my $str2 = $str;
			$str2 .= "$start,$end,$evalue,0,$q_path,$dir_name/$exname";
			print LOOKUP_RH $str2 . "\n";
				
			$str2 = $str;
			$str2 .= "$start,$end,$evalue,1,$q_path,$dir_name/$exname";
			print LOOKUP_RH $str2 . "\n";
		}
	}
	close(LOOKUP_RH);
	
	return $flag;
}

sub fa_cpy {
	my ($file_in, $file_out) = @_;
	
	open(FA, $file_in);
	my @fa = <FA>;
	close(FA);
		
	my $head = @fa[0];
	my $seq = @fa[1];
	$head =~ s/\n//g;
	$seq =~ s/\n//g;
	
	open(FAO, ">>$file_out");
	print FAO $head . "\n";
	print FAO $seq . "\n";
	close(FAO);
	
	return 1;
}

sub create_mask_include_from_coords {
	my ($file_in, $file_out, $start_coords, $end_coords) = @_;
	
	open(FA, $file_in);
	my @fa = <FA>;
	close(FA);
		
	my $head = @fa[0];
	my $seq = @fa[1];
	$head =~ s/\n//g;
	$seq =~ s/\n//g;
	
	my @x_coords = @{$start_coords};
	my @y_coords = @{$end_coords};
	
	my $mseq = $seq;
	for (my $i = 0; $i < scalar(@x_coords); $i++) {
		my $start = @x_coords[$i];
		my $end = @y_coords[$i];
	
		my $subseq = substr($mseq, $start, $end - $start + 1);
		$subseq = lc($subseq);
		
		my $temp = substr($mseq, $start, $end - $start + 1, $subseq);
	}
	
	open(FAO, ">>$file_out");
	print FAO $head . "\n";
	print FAO $mseq . "\n";
	close(FAO);
	
	return 1;
}

sub create_mask_include {
	my ($file_in, $file_out, $start, $end) = @_;
	
	open(FA, $file_in);
	my @fa = <FA>;
	close(FA);
		
	my $head = @fa[0];
	my $seq = @fa[1];
	$head =~ s/\n//g;
	$seq =~ s/\n//g;
	
	my $subseq = substr($seq, $start, $end - $start + 1);
	$subseq = lc($subseq);
	my $mseq = substr($seq, $start, $end - $start + 1, $subseq);
	
	open(FAO, ">>$file_out");
	print FAO $head . "\n";
	print FAO $seq . "\n";
	close(FAO);
	
	return 1;
}

sub create_mask_exclude_from_coords {
	my ($file_in, $file_out, $start_coords, $end_coords, $ignore) = @_;
	
	open(FA, $file_in);
	my @fa = <FA>;
	close(FA);
		
	my $head = @fa[0];
	my $seq = @fa[1];
	$head =~ s/\n//g;
	$seq =~ s/\n//g;
	
	my @x_coords = @{$start_coords};
	my @y_coords = @{$end_coords};
	
	my $mseq = $seq;
	for (my $i = 0; $i < scalar(@x_coords); $i++) {
		my $start = @x_coords[$i];
		my $end = @y_coords[$i];
	
		if ($start >= $ignore) {		
			my $subseq = substr($mseq, 0, $start + 1);
			$subseq = lc($subseq);
			my $temp = substr($mseq, 0, $start + 1, $subseq);
		}
			
		if ((length($mseq) - $end) >= $ignore) {
			my $subseq = substr($mseq, $end, length($mseq) - $end);
			$subseq = lc($subseq);
			my $temp = substr($mseq, $end, length($mseq) - $end, $subseq);
		}
	}
	
	open(FAO, ">>$file_out");
	print FAO $head . "\n";
	print FAO $mseq . "\n";
	close(FAO);
	
	return 1;
}

sub create_mask_exclude {
	my ($file_in, $file_out, $start, $end, $ignore) = @_;
	
	open(FA, $file_in);
	my @fa = <FA>;
	close(FA);
		
	my $head = @fa[0];
	my $seq = @fa[1];
	$head =~ s/\n//g;
	$seq =~ s/\n//g;
	
	if ($start >= $ignore) {		
		my $subseq = substr($seq, 0, $start + 1);
		$subseq = lc($subseq);
		my $mseq = substr($seq, 0, $start + 1, $subseq);
	}
			
	if ((length($seq) - $end) >= $ignore) {
		my $subseq = substr($seq, $end, length($seq) - $end);
		$subseq = lc($subseq);
		my $mseq = substr($seq, $end, length($seq) - $end, $subseq);
	}
	
	open(FAO, ">>$file_out");
	print FAO $head . "\n";
	print FAO $seq . "\n";
	close(FAO);
	
	return 1;
}

sub get_cas9_range_psiblast {
	my ($dir, $id) = @_;
	my $path = $dir . '/' . 'psiblast_result.txt';
	
	my @coords = ();
	
	if (! -e $path) {
		return @coords;
	}
	
	my $flag = system("cat $path \| sort -t \$\'\\t\' -k11n,11 \| sort -u -t \$\'\\t\' -k2,2 > $dir/temp_psi.txt");
	unlink($path);
	$flag = system("mv $dir/temp_psi.txt $path");
	
	open(PSILOOKUP, $path) or die "can't open TXT file";
	my @contents = <PSILOOKUP>;
	close(PSILOOKUP);
	
	my @xinds = ();
	my @yinds = ();
	my @ids = ();
	my @strands = ();
	my @types = ();
	my @cas_types = ();
	
	my $chk = 0;
	for (my $i = 0; $i < scalar(@contents); $i++) {
		my $line = @contents[$i];
		if (($line !~ /\w+\s+\w+\,\w{2}_\d+\.\d+\,\d+\,\d+\,\d+\,.*\s+/)) {
			next;
		}
		my @toks = split(/[\t]/, $line);
		my ($gid, $cid, $start, $end, $strand, $type) = (@toks[1] =~ /(\w+)\,(\w{2}_\d+\.\d+)\,(\d+)\,(\d+)\,(\d+)\,(.*)/);
		
		my $n = scalar(@toks);
		my $cas_type = @toks[$n - 1];
		
		if ($gid eq $id) {
			$chk++;
			push(@xinds, $start);
			push(@yinds, $end);
			push(@ids, $cid);
			push(@strands, $strand);
			push(@types, $type);
			push(@cas_types, $cas_type);
		}
	}
	
	if ($chk > 0) {
		push(@coords, [@xinds]);
		push(@coords, [@yinds]);
		push(@coords, [@ids]);
		push(@coords, [@strands]);
		push(@coords, [@types]);
		push(@coords, [@cas_types]);
	}
	
	return @coords;
}

sub get_cas9_range_hmm {
	my ($dir, $id) = @_;
	my $path = $dir . '/' . 'hmm_result.txt';
	
	my @coords = ();
	
	if (! -e $path) {
		return @coords;
	}
	
	my $flag = system("cat $path \| sort -t \$\'\\t\' -k4n,4 \| sort -u -t \$\'\\t\' -k1,1 > $dir/temp_hmm.txt");
	unlink($path);
	$flag = system("mv $dir/temp_hmm.txt $path");
	
	open(PSILOOKUP, $path) or die "can't open TXT file";
	my @contents = <PSILOOKUP>;
	close(PSILOOKUP);
	
	my @xinds = ();
	my @yinds = ();
	my @ids = ();
	my @strands = ();
	my @types = ();
	my @cas_types = ();
	
	my $chk = 0;
	for (my $i = 0; $i < scalar(@contents); $i++) {
		my $line = @contents[$i];
		if (($line !~ /\w+\,\w{2}_\d+\.\d+\,\d+\,\d+\,\d+\,.*\s+/)) {
			next;
		}
		my @toks = split(/[\t]/, $line);
		my ($gid, $cid, $start, $end, $strand, $type) = (@toks[0] =~ /(\w+)\,(\w{2}_\d+\.\d+)\,(\d+)\,(\d+)\,(\d+)\,(.*)/);
		
		my $cas_type = @toks[1];
		if ($cas_type eq "cas9") {
			$cas_type = "cas9_unspecified";
		}
		
		if ($gid eq $id) {
			$chk++;
			push(@xinds, $start);
			push(@yinds, $end);
			push(@ids, $cid);
			push(@strands, $strand);
			push(@types, $type);
			push(@cas_types, $cas_type);
		}
	}
	
	if ($chk > 0) {
		push(@coords, [@xinds]);
		push(@coords, [@yinds]);
		push(@coords, [@ids]);
		push(@coords, [@strands]);
		push(@coords, [@types]);
		push(@coords, [@cas_types]);
	}
	
	return @coords;
}

sub get_unique_id {
	my $letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	my @arr_letters=split('',$letters);		
	my $unique_id = time();	   
	
	for(my $i = 0; $i < int(rand(50)); $i++){
		$unique_id .= $arr_letters[int(rand($#arr_letters))];
	}
	return $unique_id;	 
}

return 1;
