#!/bin/csh # #This software is Copyright (C) 2005 by Andrew Firth, University of Otago, #Dunedin, New Zealand. All rights reserved. # #Script to prepare sequences and run through MLRGD suite. #Plots 'residual' conservation along an alignment once the protein-coding # constraints have been 'factored' out. At least, that's the idea. # #Required software: # mlrgd # runmean # runclipmean # addgaps # minmax # ntadjust # EMBOSS: seqret, seqretsplit, infoseq # EMBASSY: ednaml, ednapars # code2aln # R # #Required files: # $prefix.seqs # *.fasta, *.gbk [Sequence files in gbk or fasta format.] # aa.dat [Scoring matrix.] # aa2codon.dat [Genetic code (same order as scoring matrix).] # nuc.dat [4 x 4 nucleotide mutation matrix (order UCAG).] # codon.dat [Codon usage table.] # mlrgd.R [Plot template.] # *.fasta,orfs, *.gbk.orfs [ORF information. Alternatively can extract ORFs # from *.gbk headers.] # *.fasta.features, *.gbk.features [Known features (optional).] # $prefix.pairs [Sequence pairs file (optional).] # #Usage: run_mlrgd prefix # # #Notes: # Input files must have suffixes 'fasta' or 'gbk' as appropriate. # Doesn't really matter if input sequences have '-' or '.' gaps. #------------------------------------------------------------------------------ #Options set pairs = 1 # 2 -> use dnapars, 1 -> use dnaml, # 0 or other -> use '$prefix.pairs' set refpos = 0 # 0 -> use seperate ORF files for each sequence, # 1 -> use refseq ORF files for all sequences set gbkpos = 1 # 0 -> get ORF files from 'sequence'.orfs files # 1 -> get ORF files from 'sequence'.gbk files (for any # 'sequence'.fasta files, the 'sequence'.orfs files, if any, # will be used) set skip = 0 # 1 or other -> skip sequence files containing non-ACGTUacgtu; # exits if reference sequence contains non-ACGTUacgtu # 0 -> pass flag to mlrgd so that non-ACGTUacgtu.- are read as # N (ambiguous nt) and ignored for most statistics set window1 = 5 # n, where sliding window size is 2n+1 (#codons for # N1,N2,N3 plots, #nt/codons for syn) set window2 = 10 # n, where sliding window size is 2n+1 (#nt for all, nc) set circular = 0 # 0 -> not circular genome, 1 or other -> circular genome, set clip1 = 0.00 # for clipped running mean; 'clip1 = 0.05' means upper 5% # (i.e. the columns with low obs#muts relative to exp#muts) # in each window are clipped before calculating mean set clip2 = 0.30 # for clipped running mean; 'clip2 = 0.05' means lower 5% # (i.e. the columns with high obs#muts relative to exp#muts) # in each window are clipped before calculating mean set tttmin = 0.0 # Lowest t value to use in MLRGD fitting set tttmax = 10.0 # Highest t value to use in MLRGD fitting set tVfit = 0.2 # Required fitting accuracy (total #muts in seq pair) for # t and V set maxiter = 20 # Maximum number of fitting iterations for t and V set Vmin = 0.01 # Lowest V value to use in MLRGD fitting (>= 0.01) set Vmax = 10.0 # Highest V value to use in MLRGD fitting (<= 10) set Vtype = 1 # 0 -> find seperate V for each sequence pair; # 1 or other -> find one V value for all sequence pairs set Vfix = 0 # If > 0, use this V scaling for all sequence pairs; # defaults to 1 if <= 0; overridden if fitwhat = 3. set fitwhat = 3 #0 or other -> fit to total # muts #1 -> fit to # of neutral/synonymous muts, #2 -> fit to # of neutral/synonymous muts at 4-fold degenerate # sites #3 -> as for 2, then adjust V to fit # of nonsyn muts #Note: 1-3 are not recommended if lots of double-coding set wholeseq = 1 #Nucleotide range to use: 0 = region, 1 or other = whole # sequence. set range1 = 0 #Start nucleotide (if $wholeseq = 0; otherwise ignored); # reference sequence coords set range2 = 0 #End nucleotide (if $wholeseq = 0; otherwise ignored); # reference sequence coords #------------------------------------------------------------------------------ #Check all required programmes and files are present. @ argc = `echo $argv | awk '{print NF}'` if ($argc != 1) then echo "Usage: run_mlrgd prefix" exit endif set prefix = $1 limit stacksize unlimited setenv PATH $HOME/bin:${PATH} echo `date` ": Starting running CDS-plotcon on alignment $prefix" set test = 0 foreach i (runmean runclipmean mlrgd seqret seqretsplit ednaml ednapars infoseq code2aln addgaps minmax ntadjust R) if (! -X $i) then echo "Can't find software $i." set test = 1 endif end if ($test) then exit endif set test = 0 foreach i (aa.dat aa2codon.dat nuc.dat codon.dat $prefix.seqs mlrgd.R) if (! -r $i) then echo "Can't find file $i." set test = 1 endif end if ($test) then exit endif set test = 0 foreach i (`cat $prefix.seqs`) if (! -r $i) then echo "Can't find sequence $i." set test = 1 endif end if ($test) then exit endif set test = 0 foreach i (temp temp1 temp2 temp3 aln.aln info.ral ORF.ps cluster.txt phylip.in phylip.out phylip.tree setup.mlrgd seqs.dat `ls seq.* orfs.??? tghyv.* temp.???` $prefix.aln $prefix.errorlog $prefix.fc.dat $prefix.fc.info $prefix.fc.log $prefix.fcm.eps $prefix.fcm.R $prefix.fc.plot $prefix.info.ral $prefix.nc.dat $prefix.nc.info $prefix.nc.log $prefix.ncm.eps $prefix.ncm.R $prefix.nc.plot $prefix.ORF.ps $prefix.run_mlrgd $prefix.tree) if (-r $i) then echo "Please remove file '$i' so that I can overwrite it." set test = 1 endif end if ($test) then exit endif if (1 != $pairs && 2 != $pairs) then if (! -r $prefix.pairs) then echo "Can't find file $prefix.pairs." exit endif endif touch $prefix.errorlog set refseq = `head -1 $prefix.seqs` #------------------------------------------------------------------------------ #Check, tidy and align sequences and ORF files. echo -n `date` ": Checking sequences..." #Look for non-ACGTU characters. if ($skip) then seqret $refseq temp1 -osformat fasta -auto set test = `tail +2 temp1 | grep "[^acgtuACGTU.-]" | wc -l` if ($test) then echo "Reference sequence contains non-ACGTU characters. Aborting." exit else echo $refseq > temp endif foreach i (`tail +2 $prefix.seqs`) seqret $i temp1 -osformat fasta -auto set test = `tail +2 temp1 | grep "[^acgtuACGTU.-]" | wc -l` if ($test) then echo "$i contains non-ACGTU characters. Omitting this sequence." >> \ $prefix.errorlog else echo $i >> temp endif end mv temp $prefix.seqs endif #Look for and remove identical sequences. touch temp foreach i (`cat $prefix.seqs`) seqret $i temp1 -osformat fasta -auto degapseq temp1 temp2 -auto tail +2 temp2 | sed 'y/ACGTUuMRWVHDBSYKNX/acgtttmrwvhdbsyknx/' \ > rfge8345dsf.$i echo rfge8345dsf.$i >> temp end rm -f temp[123] @ count = 0 foreach i (`cat $prefix.seqs`) @ t2 = 1 foreach j (`head -$count temp`) @ t1 = `diff rfge8345dsf.$i $j | wc -l | awk '{print $1}'` if ($t1 == 0) then @ t2 = 0 set old = `echo $j | sed 's/rfge8345dsf\.//'` echo Sequence $i identical to $old. Omitting $i. >> $prefix.errorlog if (1 != $pairs && 2 != $pairs) then awk '{if ($1=="'"$i"'") {print "'"$old"'",$2} else {print $0}}' \ $prefix.pairs | \ awk '{if ($2=="'"$i"'") {print $1,"'"$old"'"} else {print $0}}' \ > temp1 mv temp1 $prefix.pairs endif break endif end if ($t2) then echo $i >> temp3 endif @ count += 1 end mv temp3 $prefix.seqs if (1 != $pairs && 2 != $pairs) then awk '{if ($1!=$2) print $0}' $prefix.pairs > temp mv temp $prefix.pairs endif rm -f rfge8345dsf.* #If ($gbkpos), make ORF files for any sequences with .gbk extension. if ($gbkpos) then foreach i (`cat $prefix.seqs`) set extn = `echo $i | sed 's/.*\.//'` if ($extn == "gbk") then grep " CDS " $i | awk '{print $2}' | sed 's/[<>]//g' > $i.orfs endif end endif #Check required ORF files present. if ($refpos) then # use refseq ORF files for all if (! -r $refseq.orfs) then echo "Can't find file $refseq.orfs." exit endif else # use ORF files for each sequences set test = 0 foreach i (`cat $prefix.seqs`) if (! -r $i.orfs) then echo "Can't find file $i.orfs." set test = 1 endif end if ($test) then exit endif endif #Rename sequences # (Note seqret changes '.'s to '-'s, but doesn't strip gaps. Preserves # non-ACGTU if they are valid ambiguous nt codes, but non valid codes, e.g. O, # are replaced with '-'.) #Change all sequences to fasta format. Then for any sequences with *.orfs # files (even if (refpos)), put the ORF info into a header and make a .gbk # file readable by code2aln. touch seqs.dat @ count = 0 foreach i (`cat $prefix.seqs`) @ count += 1 set id = `echo $count | awk '{printf "%03i\n",$1}'` if (-r $i.orfs) then # write .gbk format file seqret $i temp1 -osformat fasta -auto degapseq temp1 temp2 -auto set length = `infoseq temp2 -auto | tail +2 | awk '{print $5}'` echo LOCUS seq.$id $length > seq.$id.gbk awk '{printf " CDS %s\n",$1}' $i.orfs >> seq.$id.gbk echo ORIGIN >> seq.$id.gbk tail +2 temp2 >> seq.$id.gbk echo // >> seq.$id.gbk echo seq.$id.gbk >> seqs.dat else # write .fasta format file seqret $i temp1 -osformat fasta -auto degapseq temp1 temp2 -auto sed 's/>.*/>seq.'$id'/' temp2 > seq.$id.fasta echo seq.$id.fasta >> seqs.dat endif end set nseqs = $count if ($refpos) then sed 's/[^0-9]/ /g' $refseq.orfs | awk '{print NF,$0}' > orfs.001 else @ count = 0 foreach i (`cat $prefix.seqs`) @ count += 1 set id = `echo $count | awk '{printf "%03i\n",$1}'` sed 's/[^0-9]/ /g' $i.orfs | awk '{print NF,$0}' > orfs.$id end endif echo "Done" #Align sequences with code2aln. # code2aln and seqretsplit seem to handle abiguous nt codes. echo -n `date` ": Aligning sequences..." code2aln `cat seqs.dat` >>& $prefix.errorlog || exit 1 mv aln.aln $prefix.aln seqretsplit $prefix.aln temp -auto -osextension2 aln rm -f temp awk -F. '{printf "%s.%s.aln\n",$1,$2}' seqs.dat > temp mv temp seqs.dat rm -f seq.???.fasta seq.???.gbk mv info.ral $prefix.info.ral mv ORF.ps $prefix.ORF.ps echo "Done" #------------------------------------------------------------------------------ #Make phylo tree and sequence pair list. # #Use sequence pairs tracing round outside of tree (this covers every branch # precisely twice). #ednaml, ednapars seem to handle abiguous nt codes. echo -n `date` ": Making phylogenetic tree..." seqret -osformat phylip $prefix.aln phylip.in -auto if (1 == $pairs) then if ($nseqs <= 3) then sed 's/seq\.//' seqs.dat | sed 's/\.aln//' > temp else ednaml phylip.in -outfile phylip.out -treefile phylip.tree -ttratio 3 \ -random no -trout yes -auto -besttree yes -randtimes 1 \ >>& $prefix.errorlog || exit 1 sed 's/:[^,()]*[,()]/ /g' phylip.tree | sed 's/(//g' | sed 's/ /\n/g' | \ grep seq. | head -$nseqs > temp endif tail +2 temp > temp1 head -1 temp >> temp1 paste temp temp1 | sed 's/seq\.//g' | awk '{print $1,$2}' > $prefix.pairs rm -f temp temp1 else if (2 == $pairs) then if ($nseqs <= 3) then sed 's/seq\.//' seqs.dat | sed 's/\.aln//' > temp else ednapars phylip.in -outfile phylip.out -treefile phylip.tree -random no \ -trout yes -auto -besttree yes -randtimes 1 \ >>& $prefix.errorlog || exit 1 sed 's/[,()]/ /g' phylip.tree | sed 's/ /\n/g' | grep seq. | head -$nseqs \ > temp endif tail +2 temp > temp1 head -1 temp >> temp1 paste temp temp1 | sed 's/seq\.//g' | awk '{print $1,$2}' > $prefix.pairs rm -f temp temp1 else if (! -r $prefix.pairs) then echo "Can't find file $prefix.pairs." exit endif @ count = 0 foreach i (`cat $prefix.seqs`) @ count += 1 set id = `echo $count | awk '{printf "%03i\n",$1}'` mv $prefix.pairs temp cat temp | \ awk '{if ($1=="'"$i"'") {print "'"$id"'",$2} else {print $1,$2}}' | \ awk '{if ($2=="'"$i"'") {print $1,"'"$id"'"} else {print $1,$2}}' \ > $prefix.pairs end rm -f temp endif awk '{print $2,$1}' $prefix.pairs >> $prefix.pairs if (-r phylip.tree) then mv phylip.tree $prefix.tree endif echo "Done" #------------------------------------------------------------------------------ #Run MLRGD echo -n `date` ": Running MLRGD..." echo $prefix.pairs " " > setup.mlrgd echo $tttmin $tttmax $tVfit $maxiter " #t fitting params" >> setup.mlrgd echo $fitwhat " #fitwhat" >> setup.mlrgd echo 0 " #model" >> setup.mlrgd #non-coding model echo $refpos " #refpos" >> setup.mlrgd echo $circular " #circular" >> setup.mlrgd echo $wholeseq $range1 $range2 " #nt range params" >> setup.mlrgd echo $Vfix $Vtype $Vmin $Vmax " #V fitting params" >> setup.mlrgd echo $skip " #ambiguous nt" >> setup.mlrgd mlrgd setup.mlrgd >>& $prefix.errorlog || exit mv mlrgd_out.dat $prefix.nc.dat mv mlrgd_out.log $prefix.nc.log mv mlrgd_out.plot $prefix.nc.plot mv mlrgd_out.info $prefix.nc.info echo $prefix.pairs " " > setup.mlrgd echo $tttmin $tttmax $tVfit $maxiter " #t fitting params" >> setup.mlrgd echo $fitwhat " #fitwhat" >> setup.mlrgd echo 1 " #model" >> setup.mlrgd #full-coding model echo $refpos " #refpos" >> setup.mlrgd echo $circular " #circular" >> setup.mlrgd echo $wholeseq $range1 $range2 " #nt range params" >> setup.mlrgd echo $Vfix $Vtype $Vmin $Vmax " #V fitting params" >> setup.mlrgd echo $skip " #ambiguous nt" >> setup.mlrgd mlrgd setup.mlrgd >>& $prefix.errorlog || exit mv mlrgd_out.dat $prefix.fc.dat mv mlrgd_out.log $prefix.fc.log mv mlrgd_out.plot $prefix.fc.plot mv mlrgd_out.info $prefix.fc.info echo "Done" #*.dat: #1 best fit t (evolutionary time) #2 Sum[log(prob(seq1_nt -> seq2_nt))] per nt #3 number of nt used in comparison #4 number of nt discarded due to being gaps in one or both sequences #5 number of nt discarded due to being 'zero-probability' transitions # according to the model aa.dat, codon.dat (e.g. stop <-> non-stop) #6 number of point mutations (out of the nt used) #7 number of neutral or synonymous point mutations (includes non-coding nt) #8 flag = 1 if problems with t fitting (outside given range or didn't converge # in allowed number of iterations). #9 number of 4-fold degenerate sites with neutral or null mutation #10 number of 4-fold degenerate sites with neutral non-null mutation #11 best fit V (scaling between nonsynonymous and synonymous substitution # acceptabilities) #12 flag = 1 if problems with V fitting (outside given range or didn't converge # in allowed number of iterations). #Mutation rate per nt is $6/$3 #$3+$4+$5 = length of sequence in alignment coords # #*.log: #1 pair number #2 base number (alignment coords) #3 log(prob(seq1_nt -> seq2_nt)) # (9 = gap in both seq, 8 = gap in seq1, 7 = gap in seq2, 6 = zero-prob, # 5 = gap only in reference sequence) #4 exp#muts (0 if gap in either seq1 or seq2 #5 exp # neutral muts (0 if gap in either seq1 or seq2 #6 obs#muts (0 if gap in either seq1 or seq2) #7 obs # neutral muts (0 if gap in either seq1 or seq2) # #*.plot: #1 base number (alignment coords) #2 exp#muts summed over all pairs #3 obs#muts summed over all pairs #4 number of pairs in which nt is N0 (using seq1 ORF file) #5 number of pairs in which nt is N1 (using seq1 ORF file) #6 number of pairs in which nt is N2 (using seq1 ORF file) #7 number of pairs in which nt is N3 (using seq1 ORF file) #8 number of pairs in which nt is gap (in either seq1 or seq2) or a # 'zero-probability' transition #9 stddev estimated from exp#muts #10 lambda = mean number (over sequence) of muts per nt, summed over pairs # without gaps at that nt position #11 exp#neutralmuts summed over all pairs #12 obs#neutralmuts summed over all pairs #13 summed exp#muts at 4-fold degen neutral/syn/null sites #14 summed obs#muts at 4-fold degen neutral/syn/null sites #15 summed lambda4 for 4-fold degen neutral/syn/null sites #16 summed # of 4-fold degen neutral/syn/null sites #$4+$5+$6+$7+$8 = total number of pairs #Note that $2,$3,$10,$11,$12,$13,$14,$15 should be multiplied by 0.5 since we # do forward and backward pairs, and multiplied by 0.5 again since we cross # each branch of the tree with two pairwise comparisons. $9 should be # multiplied by sqrt(0.25) = 0.5. #------------------------------------------------------------------------------ #Plots echo -n `date` ": Preparing plot data..." set npairs = `wc $prefix.pairs | awk '{print $1}'` set np2 = `wc $prefix.pairs | awk '{print $1*0.5}'` #Divide output mlrgd_out.plot files into N0,N1,N2,N3, etc scaling by the # summed lambda values (so regions with lots of gaps in the alignment are at # the same level as regions without gaps). # #($2-$3)/$10 = (exp#muts - obs#muts)/(summed lambda). The factors of *0.25 # in $2,$3,$10 cancel out. #2.*$9/$10 = 0.5*stddev/0.25*(summed lambda). The factor of 1/$10 is a # simple scaling. foreach name (nc fc) set mname = `echo $name | awk '{printf "%sm\n",$1}'` set cname = `echo $name | awk '{printf "%sc\n",$1}'` awk '{if ((0.+$4)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \ $prefix.$name.plot | \ awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \ sed 's/nan/0/g' > tghyv.$name.n0 awk '{if ((0.+$5)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \ $prefix.$name.plot | \ awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \ sed 's/nan/0/g' > tghyv.$name.n1 awk '{if ((0.+$6)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \ $prefix.$name.plot | \ awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \ sed 's/nan/0/g' > tghyv.$name.n2 awk '{if ((0.+$7)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \ $prefix.$name.plot | \ awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \ sed 's/nan/0/g' > tghyv.$name.n3 awk '{if (($4+$5+$6+$7)>0.) print $1,$2-$3,2.*$9,$10}' \ $prefix.$name.plot | \ awk '{if ($4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \ sed 's/nan/0/g' > tghyv.$name.all awk '{if ((0.+$16)>=(0.5*"'"$npairs"'")) print $1,$13-$14,2.*$9,$15}' \ $prefix.$name.plot | \ awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \ sed 's/nan/0/g' > tghyv.$name.syn awk '{print $1,0.25*$10}' $prefix.$name.plot | sed 's/nan/0/g' \ > tghyv.$name.lambda runmean tghyv.$name.n0 $window2 $circular > tghyv.$mname.n0 || exit runmean tghyv.$name.n1 $window1 $circular > tghyv.$mname.n1 || exit runmean tghyv.$name.n2 $window1 $circular > tghyv.$mname.n2 || exit runmean tghyv.$name.n3 $window1 $circular > tghyv.$mname.n3 || exit runmean tghyv.$name.all $window2 $circular > tghyv.$mname.all || exit runmean tghyv.$name.syn $window1 $circular > tghyv.$mname.syn || exit runclipmean tghyv.$name.n0 $window2 $clip1 $clip2 $circular \ > tghyv.$cname.n0 || exit runclipmean tghyv.$name.n1 $window1 $clip1 $clip2 $circular \ > tghyv.$cname.n1 || exit runclipmean tghyv.$name.n2 $window1 $clip1 $clip2 $circular \ > tghyv.$cname.n2 || exit runclipmean tghyv.$name.n3 $window1 $clip1 $clip2 $circular \ > tghyv.$cname.n3 || exit runclipmean tghyv.$name.all $window2 $clip1 $clip2 $circular \ > tghyv.$cname.all || exit runclipmean tghyv.$name.syn $window1 $clip1 $clip2 $circular \ > tghyv.$cname.syn || exit end #Add gaps back in (note that this comes after taking the running mean). foreach i (nc ncm ncc fc fcm fcc) foreach k (n0 n1 n2 n3 all syn) mv tghyv.$i.$k tghyv.temp addgaps tghyv.temp > tghyv.$i.$k || exit end end #If file is empty, write '0 0 0' to it so that R doesn't barf. foreach i (ncm ncc fcm fcc) foreach k (n0 n1 n2 n3 all syn) set t = `wc -l tghyv.$i.$k | awk '{print $1}'` if (0 == $t) then echo 0 0 0 >> tghyv.$i.$k endif end end #Reference sequence orfs. Change from refseq to alignment coords. set w = `wc -l orfs.001 | awk '{print $1}'` if ($w > 0) then set norfs = `wc -l orfs.001 | awk '{print $1}'` cut --delimiter=" " -f 2- orfs.001 | sed 's/[^0-9]/\n/g' | grep "[0-9]" \ > temp rm -f temp1; touch temp1 foreach i (`cat temp`) echo `ntadjust seq.001.aln $i` >> temp1 end set orfx1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'` set orfx2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'` awk '{print $1/2}' orfs.001 > temp1 rm -f temp; touch temp @ b = 0 foreach i (`cat temp1`) @ b ++ @ a = 0 while ($a < $i) echo $b $norfs | awk '{print ($1-0.5)/$2}' >> temp @ a ++ end end set orfy = `awk '{printf "%s,",$0}' temp | sed 's/,$//g'` else set orfx1 = "0" set orfx2 = "0" set orfy = "0" endif #Known features. Change from refseq to alignment coords. if (-r $refseq.features) then set w = `wc -l $refseq.features | awk '{print $1}'` if ($w > 0) then sed 's/[^0-9]/ /g' $refseq.features | awk '{print NF,$0}' > temp2 set nkfs = `wc -l temp2 | awk '{print $1}'` cut --delimiter=" " -f 2- temp2 | sed 's/[^0-9]/\n/g' | grep "[0-9]" > temp rm -f temp1; touch temp1 foreach i (`cat temp`) echo `ntadjust seq.001.aln $i` >> temp1 end set kfx1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'` set kfx2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'` awk '{print $1/2}' temp2 > temp1 rm -f temp; touch temp @ b = 0 foreach i (`cat temp1`) @ b ++ @ a = 0 while ($a < $i) echo $b $nkfs | awk '{print ($1-0.5)/$2}' >> temp @ a ++ end end set kfy = `awk '{printf "%s,",$0}' temp | sed 's/,$//g'` else set kfx1 = "0" set kfx2 = "0" set kfy = "0" endif else set kfx1 = "0" set kfx2 = "0" set kfy = "0" endif echo "Done" #------------------------------------------------------------------------------ #Plot N0, N1, N2, N3, all and syn sliding-mean tracks for nc and fc. echo -n `date` ": Making plots..." #Note *0.25 correction in $nmutsum and $lambdasum corrects for doing both # forward and backward pairwise comparisons, and for covering every branch # of the tree with two pairwise comparisons. set w1 = `echo $window1 | awk '{print 1+2*$1}'` set w2 = `echo $window2 | awk '{print 1+2*$1}'` #Conservation score plots. foreach i (nc fc) set length = `head -2 $prefix.$i.info | tail -1 | awk '{print $2}'` set base1 = `head -2 $prefix.$i.info | tail -1 | awk '{print $3}'` set base2 = `head -2 $prefix.$i.info | tail -1 | awk '{print $4}'` set nmutsum = `head -3 $prefix.$i.info | tail -1 | awk '{print $2*0.25}'` set lambdasum = `head -4 $prefix.$i.info | tail -1 | awk '{printf "%.2f\n",$2*0.25}'` set lambda4sum = `tail -1 $prefix.$i.info | awk '{printf "%.2f\n",$2*0.25}'` set mname = `echo $i | awk '{printf "%sm\n",$1}'` set cname = `echo $i | awk '{printf "%sc\n",$1}'` foreach j ($mname $cname) cat tghyv.$j.all tghyv.$j.syn tghyv.$j.n[0123] | \ awk '{printf "%14.6f\n", $2}' > temp1 set ymin = `minmax temp1 | awk '{print $1}'` set ymax = `minmax temp1 | awk '{print $2}'` if ("fc" == $i) then set aaa = full else set aaa = non endif if ($mname == $j) then set bbb = "" else set bbb = "$clip1\/$clip2-clipped" endif cat mlrgd.R | sed 's/mmm/'$j'/g' | \ sed 's/yyymax/'$ymax'/g' | sed 's/yyymin/'$ymin'/g' | \ sed 's/lll/'$length'/g' | \ sed 's/ddd1/'$base1'/g' | sed 's/ddd2/'$base2'/g' | \ sed 's/qqq/'$np2'/g' | sed 's/nnn/'$nmutsum'/g' | \ sed 's/rrr/'$lambdasum'/g' | sed 's/ccc/'$lambda4sum'/g' | \ sed 's/aaa/'$aaa'-coding model/g' | sed 's/bbb/'$bbb' running mean/g' | \ sed 's/ running mean/ running mean/' | sed 's/ppp/'$i'/' | \ sed 's/www1/'$w1'/g' | sed 's/www2/'$w2'/g' | \ sed 's/orfx1/'$orfx1'/g' | sed 's/orfx2/'$orfx2'/g' | \ sed 's/orfy/'$orfy'/g' | sed 's/kfy/'$kfy'/g' | \ sed 's/kfx1/'$kfx1'/g' | sed 's/kfx2/'$kfx2'/g' | \ sed 's/xxx/'$prefix'/g' > $prefix.$j.R R --no-save --slave < $prefix.$j.R end end echo "Done" #------------------------------------------------------------------------------ #Tidy up. cp run_mlrgd $prefix.run_mlrgd mkdir TIDYUP foreach i (`ls tghyv.* seq.???.aln orfs.??? seqs.dat aa2codon.dat aa.dat codon.dat nuc.dat mlrgd.R run_mlrgd`) mv $i TIDYUP end rm -f cluster.txt temp temp[12] phylip.in phylip.out setup.mlrgd awk '{printf "seq.%s seq.%s\n",$1,$2}' $prefix.pairs > temp1 mv temp1 $prefix.pairs @ count = 0 foreach i (`cat $prefix.seqs`) set name = `echo $i | sed 's/\.fasta$//' | sed 's/\.gbk$//'` @ count += 1 set id = `echo $count | awk '{printf "%03i\n",$1}'` mv $prefix.pairs temp1 mv $prefix.aln temp2 sed 's/seq\.'$id'/'$name'/g' temp1 > $prefix.pairs sed 's/seq\.'$id'/'$name'/g' temp2 > $prefix.aln end rm -f temp[12] echo `date` ": Finished running CDS-plotcon on alignment $prefix"