#!/bin/csh # Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin, # New Zealand, aef(at)sanger.otago.ac.nz # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License (version 2) as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. #Runs mlogd programme for all six frames in sliding windows. limit stacksize unlimited @ argc = `echo $argv | awk '{print NF}'` if ($argc != 2) then echo "Usage: dosixframe newdir newurl
" exit 1 endif set olddir = `pwd` set newdir = $1 set newurl = $2 cd $newdir source mlogd.param if (! $download) then setenv PATH ${PATH}:/home/aef/bin set formdir = "/aef/MLOGD/FORM" else setenv PATH ${PATH}:${HOME}/bin set formdir = "../../FORM" endif echo -n "Preparing for six-frame plots...
" cp ../../SCRIPTS/sixframe.plotdata.html . cp orfs.1 orfs.1.bkp mv orfs.2 orfs.2.bkp set threshold = 0.75 set threshold = `echo $threshold | awk '{if (0.+$1<0.01) {print "0.01"} else {print $1}}' | awk '{if (0.+$1>0.99) {print "0.99"} else {print $1}}'` @ range1 = $range1 @ range2 = $range2 #If wholeseq = 1 or 2 then do six frame plot for wholeseq. #If wholeseq = 0 then do six frame plot for range1-range2. set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key` degapseq $refseq2.fasta temp.fasta -auto set length = `infoseq temp.fasta -auto -only -length | tail -1` if ($wholeseq) then @ range1 = 1 @ range2 = $length endif @ step3 = `echo $step | awk '{print $1*3}'` @ ncodons3 = `echo $ncodons | awk '{print $1*3}'` if ($step3 < 15) then echo "Error: Minimum step size is 5 codons. You entered $step.

" exit 1 endif rm -f temp.fasta @ drop = `echo $range1 $range2 | awk '{print int(0.5*($2-$1+1))}'` if ($ncodons3 > $drop) then echo "Error: Window size '$ncodons3 nt' is too large relative to the requested nucleotide range '$range1 to $range2'. Therefore no plot data. Please ensure nucleotide range is at least twice the window size.

" exit 1 endif @ drop = `echo $step3 | awk '{print $1*2}'` if ($drop > $ncodons3) then echo "Error: Step size '$step3 nt' should be no greater than half the window size '$ncodons3 nt'.

" exit 1 endif #Estimated run-time @ maxruntime = 500 @ w = `wc -l pairs.tree.dat | awk '{print $1}'` @ runtime = `echo $range1 $range2 $step $ncodons $w | awk '{print 10+int(0.02*sqrt($5*$4)*($2-$1+1)/$3)}'` echo "Estimated run-time: $runtime seconds.
" if ($runtime > $maxruntime) then echo "Estimated run-time is too long - your submission won't be processed.
" echo "You could try decreasing the nucleotide range, or increasing the step size.

" exit 1 endif if ($wholeseq) then set base1 = 1 set base2 = `infoseq $refseq2.fasta -auto -only -length | tail -1` else set base1 = `ntadjust $refseq2.fasta $range1` set base2 = `ntadjust $refseq2.fasta $range2` endif echo $length $base1 $base2 > ref.info #Make refseq to align coords conversion file. tail +2 seq.001.fasta | sed 's/./&@/g' | sed 'y/@/\n/' | grep "." | \ awk '{if ($1!="-") s+=1}{if ($1!="-") {printf "%6s %6s %6s\n",NR,s,$1} else \ {printf "%6s %6s %6s\n",NR,"-",$1}}' > coords.txt #Make lists of windows to use. foreach frame (0 1 2) @ s1 = `echo $frame $range1 | awk '{print $1+1+$2-($2%3)}'` @ s2 = `echo $frame $range1 $ncodons | awk '{print $3*3+$1+$2-($2%3)}'` @ s3 = `echo $s1 $s2 | awk '{print int(0.5*($1+$2))}'` rm -f temp1 touch temp1 while ($s2 <= $range2) echo $s1 $s2 $s3 >> temp1 @ s1 += $step3 @ s2 += $step3 @ s3 += $step3 end awk '{printf "%s 2:%s:%s\n",$3,$1,$2}' temp1 > orfs.p.$frame awk '{printf "%s 2:%s:%s\n",$3,$2,$1}' temp1 > orfs.m.$frame if ($circular && $wholeseq) then rm -f temp2 touch temp2 while ($s1 <= $length) if ($s3 > $length) then @ s3 -= $length endif echo $s1 $s2 $s3 >> temp2 @ s1 += $step3 @ s2 += $step3 @ s3 += $step3 end awk '{printf "%s 4:%s:%s:%s:%s\n",$3,$1,"'"$length"'","1",$2-"'"$length"'"}' \ temp2 >> orfs.p.$frame awk '{printf "%s 4:%s:%s:%s:%s\n",$3,$2-"'"$length"'","1","'"$length"'",$1}' \ temp2 >> orfs.m.$frame endif sort -n orfs.p.$frame | awk '{printf "%s+%s\n",$1,$2}' > temp3 mv temp3 orfs.p.$frame sort -n orfs.m.$frame | awk '{printf "%s+%s\n",$1,$2}' > temp3 mv temp3 orfs.m.$frame end rm -f temp[12] echo "...Done

" #Calculate summed divergence of contributing sequence pairs. touch orfs.2 cp pairs.tree.dat pairs.dat awk '{print $2,$1}' pairs.tree.dat >> pairs.dat echo 1 1 $length " " > setup.mlogd echo $circular " " >> setup.mlogd mlogd setup.mlogd >>& errorlog.txt || exit 1 set lambdasum = `head -2 mlogd_out.info | tail -1 | awk '{printf "%.2f\n",$1*0.25}'` touch mlogd_out.treesum awk '{print $1,$3*0.25}' mlogd_out.treesum > tree.lambda awk '{if (0.+$12>0.) {print $13/$12} else {print "0"}}' mlogd_out.dat > div.dat set lambdasum2 = `awk '{s+=$1}END{print s}' div.dat` set maxsumdiv = `echo $lambdasum2 $ncodons3 | awk '{print $1*$2}'` set threshold2 = `echo $threshold $maxsumdiv | awk '{print $1*$2}'` foreach suffix (dat gaplog gappos glitches info MLElog stoplog stoppos1 stoppos2 startpos1 startpos2 treesum) rm -f mlogd_out.$suffix end rm -f orfs.2 setup.mlogd pairs.dat awk '{printf "%6i %10.4f\n",$1,$2}' tree.lambda > div.data.txt #Run MLOGD for six frames. echo -n "Doing six-frame plots...
" cp pairs.tree.dat pairs.dat awk '{print $2,$1}' pairs.tree.dat >> pairs.dat foreach frame (p.0 p.1 p.2 m.0 m.1 m.2) rm -f mle.$frame touch mle.$frame foreach j (`cat orfs.$frame`) set i = `echo $j | awk -F+ '{print $2}'` echo $i | sed 's/:/ /g' > orfs.2 set r1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -1` set r2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | tail -1` echo 2 $r1 $r2 " " > setup.mlogd echo $circular " " >> setup.mlogd mlogd setup.mlogd >>& errorlog.txt || exit 1 set totalmle = \ `awk '{s+=($4-$3)*$5}END{print s*0.25}' mlogd_out.dat` set sumdiv = \ `paste div.dat mlogd_out.dat | awk '{s+=$1*$6}END{print s}'` set refbase = `echo $j | awk -F+ '{print $1}'` set base = `ntadjust $refseq2.fasta $refbase` set num = `echo $i | awk -F: '{print $1}'` if ($num == "4") then set v1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -3 | tail -1` set v2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -2 | tail -1` else set v1 = $r1 set v2 = $r2 endif set t1 = `ntadjust $refseq2.fasta $v1` set t2 = `ntadjust $refseq2.fasta $v2` echo $base $totalmle $t1 $t2 $sumdiv >> mle.$frame end echo -n "$frame.." end awk '{printf "+0 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.0 \ > mle.data.txt awk '{printf "+1 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.1 \ >> mle.data.txt awk '{printf "+2 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.2 \ >> mle.data.txt awk '{printf "-0 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.0 \ >> mle.data.txt awk '{printf "-1 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.1 \ >> mle.data.txt awk '{printf "-2 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.2 \ >> mle.data.txt echo "...Done

" echo -n "Calculating other plot data...
" #Apply threshold and scale scores. The mle2.$frame are only used for the plot. set length2 = `infoseq $refseq2.fasta -auto -only -length | tail -1` foreach frame (p.0 p.1 p.2 m.0 m.1 m.2) awk '{if (0.+$5>=0.+"'"$threshold2"'"&&0.+$5>0.) print $1,$2*"'"$maxsumdiv"'"/$5,$3,$4,$5}' mle.$frame | awk '{if (0.+$3>0.+$4) {printf "%s %s %s %s %s\n%s %s %s %s %s\n",$1,$2,$3,"'"$length2"'",$5,$1,$2,"1",$4,$5} else {print $0}}' > mle2.$frame end foreach suffix (dat gaplog glitches info MLElog stoplog treesum stoppos1 stoppos2 startpos1 startpos2) rm -f mlogd_out.$suffix end rm -f setup.mlogd orfs.2 touch mlogd_out.gappos #OK to use gappos file for last iteration. It's done for full alignment. mv mlogd_out.gappos gappos awk '{printf "%4i %6i\n",$1,$2}' gappos > gaps.data.txt #Null model orfs. Change from refseq to alignment coords. @ norfs1 = `wc -l orfs.1 | awk '{print $1}'` if ($norfs1 > 0) then cut -d " " -f 2- orfs.1 | sed 's/[^0-9]/x/g' | sed 'y/x/\n/' | grep "[0-9]" \ > temp rm -f temp1; touch temp1 foreach i (`cat temp`) echo `ntadjust $refseq2.fasta $i` >> temp1 end set orf1x1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'` set orf1x2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'` else set orf1x1 = "0" set orf1x2 = "0" endif rm -f temp temp1 #Other parameters set nseqs = `wc -l pairs.ref.dat | awk '{print $1+1}'` set date = `date +%y-%m-%d` cat mle2.[pm].[012] | awk '{printf "%14.6f\n",$2}' > temp1 echo 0 >> temp1 set y2min = `minmax temp1 | awk '{print $1}'` set y2max = `minmax temp1 | awk '{print $2}'` awk '{print $2}' pairs.ref.dat > temp1 rm -f temp2 touch temp2 foreach i (`cat temp1`) set id = seq.$i set name = `awk '{if ($2=="'"$id"'") print $1}' seqs.key` echo $name >> temp2 end set sss1 = `awk '{printf "%s,",(NR+1-0.5)/"'"$nseqs"'"}' temp2 | sed 's/,$//'` set sss2 = `awk '{printf "@%s@,",$1}' temp2 | sed 's/,$//'` rm -f temp[12] #Calculate stop posfiles. rm -f orfs.1 orfs.2 touch orfs.2 head -1 pairs.tree.dat > pairs.dat echo 1 1 $length " " > setup.mlogd echo $circular " " >> setup.mlogd foreach frame (0 1 2) @ l1 = `echo $frame | awk '{print $1+1}'` @ l2 = `echo $frame | awk '{print "'"$length"'"-"'"$length"'"%3+$1}'` if ($l2 > $length) then @ l2 -= 3 endif echo 2 $l1 $l2 > orfs.1 mlogd setup.mlogd >>& errorlog.txt || exit 1 touch mlogd_out.stoppos1 mlogd_out.startpos1 mv mlogd_out.stoppos1 stoppos.p.$frame mv mlogd_out.startpos1 startpos.p.$frame echo 2 $l2 $l1 > orfs.1 mlogd setup.mlogd >>& errorlog.txt || exit 1 touch mlogd_out.stoppos1 mlogd_out.startpos1 mv mlogd_out.stoppos1 stoppos.m.$frame mv mlogd_out.startpos1 startpos.m.$frame end foreach suffix (dat gaplog gappos glitches info MLElog stoplog stoppos2 startpos2 treesum) rm -f mlogd_out.$suffix end rm -f orfs.1 orfs.2 setup.mlogd pairs.dat awk '{printf "+0 %4i %6i\n",$1,$2}' stoppos.p.0 > stops.data.txt awk '{printf "+1 %4i %6i\n",$1,$2}' stoppos.p.1 >> stops.data.txt awk '{printf "+2 %4i %6i\n",$1,$2}' stoppos.p.2 >> stops.data.txt awk '{printf "-0 %4i %6i\n",$1,$2}' stoppos.m.0 >> stops.data.txt awk '{printf "-1 %4i %6i\n",$1,$2}' stoppos.m.1 >> stops.data.txt awk '{printf "-2 %4i %6i\n",$1,$2}' stoppos.m.2 >> stops.data.txt awk '{printf "+0 %4i %6i\n",$1,$2}' startpos.p.0 > starts.data.txt awk '{printf "+1 %4i %6i\n",$1,$2}' startpos.p.1 >> starts.data.txt awk '{printf "+2 %4i %6i\n",$1,$2}' startpos.p.2 >> starts.data.txt awk '{printf "-0 %4i %6i\n",$1,$2}' startpos.m.0 >> starts.data.txt awk '{printf "-1 %4i %6i\n",$1,$2}' startpos.m.1 >> starts.data.txt awk '{printf "-2 %4i %6i\n",$1,$2}' startpos.m.2 >> starts.data.txt #Check for empty files to edit out of R script to stop R from barfing. @ w = `wc -l gappos | awk '{print $1}'` if ($w == 0) then set kkk4 = "#" else set kkk4 = "" endif @ w = `wc -l tree.lambda | awk '{print $1}'` if ($w == 0) then set kkk5 = "#" else set kkk5 = "" endif @ w = `wc -l stoppos.p.0 | awk '{print $1}'` if ($w == 0) then set ppp1 = "#" else set ppp1 = "" endif @ w = `wc -l stoppos.p.1 | awk '{print $1}'` if ($w == 0) then set ppp2 = "#" else set ppp2 = "" endif @ w = `wc -l stoppos.p.2 | awk '{print $1}'` if ($w == 0) then set ppp3 = "#" else set ppp3 = "" endif @ w = `wc -l stoppos.m.0 | awk '{print $1}'` if ($w == 0) then set ppp4 = "#" else set ppp4 = "" endif @ w = `wc -l stoppos.m.1 | awk '{print $1}'` if ($w == 0) then set ppp5 = "#" else set ppp5 = "" endif @ w = `wc -l stoppos.m.2 | awk '{print $1}'` if ($w == 0) then set ppp6 = "#" else set ppp6 = "" endif @ w = `wc -l mle2.p.0 | awk '{print $1}'` if ($w == 0) then set jjj1 = "#" else set jjj1 = "" endif @ w = `wc -l mle2.p.1 | awk '{print $1}'` if ($w == 0) then set jjj2 = "#" else set jjj2 = "" endif @ w = `wc -l mle2.p.2 | awk '{print $1}'` if ($w == 0) then set jjj3 = "#" else set jjj3 = "" endif @ w = `wc -l mle2.m.0 | awk '{print $1}'` if ($w == 0) then set jjj4 = "#" else set jjj4 = "" endif @ w = `wc -l mle2.m.1 | awk '{print $1}'` if ($w == 0) then set jjj5 = "#" else set jjj5 = "" endif @ w = `wc -l mle2.m.2 | awk '{print $1}'` if ($w == 0) then set jjj6 = "#" else set jjj6 = "" endif echo "...Done

" echo -n "Doing plots...
" cat ../../SCRIPTS/sixframe.R | \ sed 's/nnn/'$nseqs'/g' | sed 's/ddd/'$date'/g' | \ sed 's/bbb1/'$base1'/g' | sed 's/bbb2/'$base2'/g' | \ sed 's/www/'$ncodons'/g' | sed 's/lll/'$step'/g' | \ sed 's/ttt/'$lambdasum'/g' | \ sed 's/orf1x1/'$orf1x1'/g' | sed 's/orf1x2/'$orf1x2'/g' | \ sed 's/#kkk4/'$kkk4'/g' | sed 's/#kkk5/'$kkk5'/g' | \ sed 's/#jjj1/'$jjj1'/g' | sed 's/#jjj2/'$jjj2'/g' | \ sed 's/#jjj3/'$jjj3'/g' | sed 's/#jjj4/'$jjj4'/g' | \ sed 's/#jjj5/'$jjj5'/g' | sed 's/#jjj6/'$jjj6'/g' | \ sed 's/#ppp1/'$ppp1'/g' | sed 's/#ppp2/'$ppp2'/g' | \ sed 's/#ppp3/'$ppp3'/g' | sed 's/#ppp4/'$ppp4'/g' | \ sed 's/#ppp5/'$ppp5'/g' | sed 's/#ppp6/'$ppp6'/g' | \ sed 's/yyy2max/'$y2max'/g' | sed 's/yyy2min/'$y2min'/g' | \ sed 's/sss1/'$sss1'/g' | sed 's/sss2/'$sss2'/g' | sed 's/@/"/g' | \ sed 's/%sss%/'$refseq'/g' | sed 's/%ggg%/'$title'/g' > plot3.R R --no-save --slave < plot3.R >>& errorlog.txt epstopdf sixframe.eps echo "set nseqs = $nseqs" > sixframe.params echo "set base1 = $base1" >> sixframe.params echo "set base2 = $base2" >> sixframe.params echo "set lambdasum = $lambdasum" >> sixframe.params echo "set maxsumdiv = $maxsumdiv" >> sixframe.params echo "set length2 = $length2" >> sixframe.params echo "set orf1x1 = $orf1x1" >> sixframe.params echo "set orf1x2 = $orf1x2" >> sixframe.params echo "set sss1 = $sss1" >> sixframe.params echo "set sss2 = $sss2" >> sixframe.params echo "set kkk4 = "'"'$kkk4'"' >> sixframe.params echo "set kkk5 = "'"'$kkk5'"' >> sixframe.params echo "set ppp1 = "'"'$ppp1'"' >> sixframe.params echo "set ppp2 = "'"'$ppp2'"' >> sixframe.params echo "set ppp3 = "'"'$ppp3'"' >> sixframe.params echo "set ppp4 = "'"'$ppp4'"' >> sixframe.params echo "set ppp5 = "'"'$ppp5'"' >> sixframe.params echo "set ppp6 = "'"'$ppp6'"' >> sixframe.params cp orfs.1.bkp orfs.1 cp orfs.2.bkp orfs.2 echo "...Done

" chmod a+rw ./* #Write rest of webpage. set id9 = `echo $newurl | sed 's/.*\///'` echo "
Six-frame plots

Download " echo "png, " echo "eps, " echo "pdf, " echo "description, " echo "raw plot data.
" echo "Error log " echo "(description of common error messages)." echo "
" if (! $download) then echo "Redraw " echo "plot with a zoomed-in nt range and/or grid lines " else echo "You may redraw this plot with a zoomed-in nt range and/or grid lines " echo "with the 'redosixframe' script " endif echo "(note).
" echo "Frame notation.

" echo "Extended regions of positive signal may indicate potential new CDSs " echo "(i.e. other than those identified in the null model), especially " echo "where there is an absense of stop codons. You can find the " echo "coordinates of such regions and use the " if (! $download) then echo "'Test input query CDSs' option on the " echo "MLOGD server " else echo "'domlogd' script " endif echo "to calculate more detailed " echo "statistics. Don't be surprised if the input 'Known CDS(s)' " echo "themselves have a negative signal - these CDSs have already been " echo "taken into account as part of the null model. Note that local " echo "misalignments may occasionally cause non-reference sequence stop " echo "codons to be annotated in the wrong frame. Be aware that a few " echo "particular read-frame combinations may lead to false positive signals " echo "(details). " echo "Note that the horizontal axis is in alignment coordinates, not " echo "reference sequence coordinates (lookup reference sequence coordinates " echo "here; " echo "key). The dashed line " echo "as at zero.

" echo "
" chmod a+rw ./* echo Finished dosixframe `date` >> errorlog.txt