#!/bin/csh # Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin, # New Zealand, aef(at)sanger.otago.ac.nz # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License (version 2) as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. #Runs mlogd programme and makes plots. limit stacksize unlimited @ argc = `echo $argv | awk '{print NF}'` if ($argc != 4) then echo "Usage: domlogd newdir newurl mode number
" exit 1 endif set newdir = $1 set newurl = $2 cd $newdir source mlogd.param if (! $download) then setenv PATH ${PATH}:/home/aef/bin set formdir = "/aef/MLOGD/FORM" else setenv PATH ${PATH}:${HOME}/bin set formdir = "../../FORM" endif set mode = $3 set number = $4 set halfwindow = 10 cp ../../SCRIPTS/mlenuc.plotdata.html . if ($mode == "0") then echo -n "Running MLOGD...
" endif #Estimated run-time @ maxruntime = 200 set qseqs = `wc -l pairs.ref.dat | awk '{print $1+1}'` set qpairs = `wc -l pairs.tree.dat | awk '{print $1}'` set qquery = `head -1 setup.mlogd | awk '{print $3-$2+1}'` @ runtime = `echo $qseqs $qpairs $qlength $qquery | awk '{print int(5+(0.000084*$2*$4)+(0.000007*$1*$3)+(0.00048*$1*$4))}'` echo "Estimated run-time: $runtime seconds.
" if ($runtime > $maxruntime) then echo "
Estimated run-time is too long - your submission won't be processed.

" echo "You could try using fewer input sequences (e.g. first construct a phylogenetic tree, and pick a subset of 'representative' sequences), fewer sequence pairs, a smaller nucleotide range (e.g. use the 'Only use nt within query CDS(s)' option), or spliting your query up into subsections.

" echo "Maximum run-time is 200 seconds. Run-time estimated by 5 + (0.000084 x #pairs x nt-range) + (0.000007 x #sequences x sequence-length) + (0.00048 x #sequences x nt-range) seconds.

" exit 1 endif #Make refseq to align coords conversion file. tail +2 seq.001.fasta | sed 's/./&@/g' | sed 'y/@/\n/' | grep "." | \ awk '{if ($1!="-") s+=1}{if ($1!="-") {printf "%6s %6s %6s\n",NR,s,$1} else \ {printf "%6s %6s %6s\n",NR,"-",$1}}' > coords.txt #Run mlogd for ref-nonref pairs. cp pairs.ref.dat pairs.dat mlogd setup.mlogd >>& errorlog.txt || exit 1 foreach suffix (dat info MLElog treesum stoplog gaplog gappos stoppos1 stoppos2 startpos1 startpos2 glitches) mv mlogd_out.$suffix ref.$suffix touch ref.$suffix end set length = `head -1 ref.info | awk '{print $1}'` set qbase1 = `head -1 ref.info | awk '{print $2}'` set qbase2 = `head -1 ref.info | awk '{print $3}'` set base1 = 1 set base2 = $length awk '{printf "0 %4i %6i\n",$1,$2}' ref.stoppos1 > stops.data.txt awk '{printf "1 %4i %6i\n",$1,$2}' ref.stoppos2 >> stops.data.txt awk '{printf "0 %4i %6i\n",$1,$2}' ref.startpos1 > starts.data.txt awk '{printf "1 %4i %6i\n",$1,$2}' ref.startpos2 >> starts.data.txt awk '{printf "%4i %6i\n",$1,$2}' ref.gappos > gaps.data.txt #Make running mean tracks for ref-nonref pairs. # Note MLElog only written if not gapped in either sequence so runmean # jumps gapped regions. awk '{print $2}' ref.MLElog | sort | uniq > temp1 rm -f ref.runmean touch ref.runmean foreach nrseq (`cat temp1`) awk '{if ($2=="'"$nrseq"'") print $0}' ref.MLElog > temp2 runmean2 temp2 $halfwindow $circular >> ref.runmean || exit 1 end touch ref.runmean rm -f temp[12] #Run mlogd for tree pairs (both forward and backward pairs). cp pairs.tree.dat pairs.dat awk '{print $2,$1}' pairs.tree.dat >> pairs.dat mlogd setup.mlogd >>& errorlog.txt || exit 1 foreach suffix (dat info MLElog treesum stoplog gaplog gappos stoppos1 stoppos2 startpos1 startpos2 glitches) mv mlogd_out.$suffix tree.$suffix touch tree.$suffix end set lambdasum = `head -2 tree.info | tail -1 | awk '{printf "%.2f\n",$1*0.25}'` set ntmlesum = `awk '{s+=($4-$3)}END{printf "%.2f\n",s*0.25}' tree.dat` #Scale treesum awk '{print $1,$2,$3*0.25,$4*0.25,$5*0.25,$6}' tree.treesum > temp1 mv temp1 tree.treesum #Make running mean tracks for ungapped regions of treesum set npairs = `wc -l pairs.dat | awk '{print $1}'` awk '{if ($2=="'"$npairs"'") print $1,$2,0,$4,$5}' tree.treesum \ > temp1 runmean2 temp1 $halfwindow $circular > tree.runmean || exit 1 touch tree.runmean rm -f temp[12] if ($mode == "0") then echo "...Done

" echo -n "Making plots...
" endif set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key` #Null model orfs. Change from refseq to alignment coords. @ norfs1 = `wc -l orfs.1 | awk '{print $1}'` if ($norfs1 > 0) then cut -d " " -f 2- orfs.1 | sed 's/[^0-9]/x/g' | sed 'y/x/\n/' | grep "[0-9]" \ > temp rm -f temp1; touch temp1 foreach i (`cat temp`) echo `ntadjust $refseq2.fasta $i` >> temp1 end set orf1x1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'` set orf1x2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'` else set orf1x1 = "0" set orf1x2 = "0" endif rm -f temp #Alternate model orfs. Change from refseq to alignment coords. @ norfs2 = `wc -l orfs.2 | awk '{print $1}'` if ($norfs2 > 0) then cut -d " " -f 2- orfs.2 | sed 's/[^0-9]/x/g' | sed 'y/x/\n/' | grep "[0-9]" \ > temp rm -f temp1; touch temp1 foreach i (`cat temp`) echo `ntadjust $refseq2.fasta $i` >> temp1 end set orf2x1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'` set orf2x2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'` else set orf2x1 = "0" set orf2x2 = "0" endif rm -f temp #Process mlogd.R and plot. set nseqs = `wc -l pairs.ref.dat | awk '{print $1+1}'` set date = `date +%y-%m-%d` set window = `echo $halfwindow | awk '{print 1+$1*2}'` awk '{printf "%14.6f\n",sqrt(($5-$4)*($5-$4))}' ref.runmean > temp1 set yyyscale = `minmax temp1 | awk '{if (0.+$2>0.) {print 0.5/$2} else {print "0"}}'` rm -f temp1 awk '{printf "%14.6f\n",$5-$4}' tree.runmean > temp1 echo 0 >> temp1 set y2min = `minmax temp1 | awk '{print $1}'` set y2max = `minmax temp1 | awk '{print $2}'` rm -f temp1 awk '{print $2}' pairs.ref.dat > temp1 rm -f temp2 touch temp2 foreach i (`cat temp1`) set id = seq.$i set name = `awk '{if ($2=="'"$id"'") print $1}' seqs.key` echo $name >> temp2 end set sss1 = `awk '{printf "%s,",(NR+1-0.5)/"'"$nseqs"'"}' temp2 | sed 's/,$//'` set sss2 = `awk '{printf "@%s@,",$1}' temp2 | sed 's/,$//'` set sss3 = `awk '{if (0.+$12>0.) {printf "%.2f,",$13/$12} else {printf "%.2f,","0"}}' ref.dat | sed 's/,$//'` awk '{print $1}' temp2 > ref.names rm -f temp[12] #Check for empty files to edit out of R script to stop R from barfing. @ w = `wc -l ref.dat | awk '{print $1}'` if ($w == 0) then set vvv1 = "#" else set vvv1 = "" endif @ w = `wc -l coords.txt | awk '{print $1}'` if ($w == 0) then set kkk0 = "#" else set kkk0 = "" endif @ w = `wc -l ref.MLElog | awk '{print $1}'` if ($w == 0) then set kkk1 = "#" else set kkk1 = "" endif @ w = `wc -l ref.stoppos1 | awk '{print $1}'` if ($w == 0) then set kkk2 = "#" else set kkk2 = "" endif @ w = `wc -l ref.stoppos2 | awk '{print $1}'` if ($w == 0) then set kkk3 = "#" else set kkk3 = "" endif @ w = `wc -l ref.gappos | awk '{print $1}'` if ($w == 0) then set kkk4 = "#" else set kkk4 = "" endif @ w = `wc -l tree.treesum | awk '{print $1}'` if ($w == 0) then set kkk5 = "#" else set kkk5 = "" endif @ w = `wc -l ref.runmean | awk '{print $1}'` if ($w == 0) then set kkk6 = "#" else set kkk6 = "" endif @ w = `wc -l tree.runmean | awk '{print $1}'` if ($w == 0) then set kkk7 = "#" else set kkk7 = "" endif if ($mode == "0") then set kkk8 = "#" set kkk9 = "" else set kkk8 = "" set kkk9 = "#" endif cat ../../SCRIPTS/mlogd.R | \ sed 's/nnn/'$nseqs'/g' | sed 's/ddd/'$date'/g' | \ sed 's/bbb1/'$base1'/g' | sed 's/bbb2/'$base2'/g' | \ sed 's/bbbq1/'$qbase1'/g' | sed 's/bbbq2/'$qbase2'/g' | \ sed 's/www/'$window'/g' | sed 's/lll/'$lambdasum'/g' | \ sed 's/orf1x1/'$orf1x1'/g' | sed 's/orf1x2/'$orf1x2'/g' | \ sed 's/orf2x1/'$orf2x1'/g' | sed 's/orf2x2/'$orf2x2'/g' | \ sed 's/yyyscale/'$yyyscale'/g' | sed 's/fff/'$ntmlesum'/g' | \ sed 's/#vvv1/'$vvv1'/g' | sed 's/#kkk0/'$kkk0'/g' | \ sed 's/#kkk1/'$kkk1'/g' | sed 's/#kkk2/'$kkk2'/g' | \ sed 's/#kkk3/'$kkk3'/g' | sed 's/#kkk4/'$kkk4'/g' | \ sed 's/#kkk5/'$kkk5'/g' | sed 's/#kkk6/'$kkk6'/g' | \ sed 's/#kkk7/'$kkk7'/g' | \ sed 's/#kkk8/'$kkk8'/g' | sed 's/#kkk9/'$kkk9'/g' | \ sed 's/yyy2max/'$y2max'/g' | sed 's/yyy2min/'$y2min'/g' | \ sed 's/sss3/'$sss3'/g' | sed 's/sss1/'$sss1'/g' | \ sed 's/sss2/'$sss2'/g' | sed 's/@/"/g' | \ sed 's/%sss%/'$refseq'/g' | sed 's/%ggg%/'$title'/g' > plot1.R R --no-save --slave < plot1.R >>& errorlog.txt epstopdf MLEnuc.eps epstopdf MLEscatter.eps if ($mode == "0") then echo "...Done


" endif set totalmle = `awk '{s+=($4-$3)*$5}END{printf "%.2f\n",s*0.25}' tree.dat` set totalprob = `echo $totalmle | awk '{printf "%12.2e\n",exp($1)}'` if ($mode == "0") then echo "" endif echo "Total log likelihood ratio summed over phylogenetic tree = ${totalmle}.
" @ result = `echo $totalmle | awk '{if (0+$1>0.) {print "1"} else {print "0"}}'` if ($result > 0) then echo "Likelihood ratio is positive; hence favours alternate model.
" else echo "Likelihood ratio is negative; hence favours null model.
" endif if ($mode == "0") then echo "
" echo "Prob(alt)/Prob(null) ~ ${totalprob} (note).
" echo "For alternative probability estimates, use the Monte Carlo simulation option below.
" echo "These probabilities should be viewed as indicators rather than as precise measurements (note).
" endif cat ../../SCRIPTS/tableheader.txt > mlogd.table.html paste ref.names ref.dat | awk '{if (0.+$13>0.) {print $1,$13,$14,$14/$13,$6,$5-$4,$10,$11,$12,$16,$17,$18} else {print $1,$13,$14,"0",$6,$5-$4,$10,$11,$12,$16,$17,$18}}' | awk '{printf "%s%s%s%.2f%s%.3f%.3f%.3f%.3f%.3f%.3f%.3f\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' >> mlogd.table.html echo '' >> mlogd.table.html if ($mode == "0") then awk '{printf "%4i%4i%6i%9.4f%9.4f\n",$1,$2,$3,$4,$5}' ref.MLElog \ > ref.MLElog.txt paste tree.treesum coords.txt | \ awk '{printf "%6s%4i%9.4f%9.4f%9.4f%6s%4s\n",$1,$2,$3,$4,$5,$8,$9}' \ > tree.treesum.txt endif if ($mode == "0") then echo "set nseqs = $nseqs" > MLEnuc.params echo "set npairs = $npairs" >> MLEnuc.params echo "set lambdasum = $lambdasum" >> MLEnuc.params echo "set ntmlesum = $ntmlesum" >> MLEnuc.params echo "set orf1x1 = $orf1x1" >> MLEnuc.params echo "set orf1x2 = $orf1x2" >> MLEnuc.params echo "set orf2x1 = $orf2x1" >> MLEnuc.params echo "set orf2x2 = $orf2x2" >> MLEnuc.params echo "set sss1 = $sss1" >> MLEnuc.params echo "set sss2 = $sss2" >> MLEnuc.params echo "set sss3 = $sss3" >> MLEnuc.params echo "set vvv1 = "'"'$vvv1'"' >> MLEnuc.params echo "set kkk0 = "'"'$kkk0'"' >> MLEnuc.params echo "set kkk1 = "'"'$kkk1'"' >> MLEnuc.params echo "set kkk2 = "'"'$kkk2'"' >> MLEnuc.params echo "set kkk3 = "'"'$kkk3'"' >> MLEnuc.params echo "set kkk4 = "'"'$kkk4'"' >> MLEnuc.params echo "set kkk5 = "'"'$kkk5'"' >> MLEnuc.params endif tail +3 ref.info > temp1 @ fc1 = `awk '{if ($1=="0"&&$2=="0") print $3}' temp1` @ fc2 = `awk '{if ($1=="0"&&$2!="0") s+=$3}END{print int(s/3.)}' temp1` @ fc3 = `awk '{if ($1!="0"&&$2=="0") s+=$3}END{print int(s/3.)}' temp1` @ fcp1 = `awk '{if (($1=="1"&&$2=="3")||($1=="2"&&$2=="1")||\ ($1=="3"&&$2=="2")||($1=="4"&&$2=="6")||($1=="5"&&$2=="4")||\ ($1=="6"&&$2=="5")) s+=$3}END{print int(s/3.)}' temp1` @ fcp2 = `awk '{if (($1=="1"&&$2=="2")||($1=="2"&&$2=="3")||\ ($1=="3"&&$2=="1")||($1=="4"&&$2=="5")||($1=="5"&&$2=="6")||\ ($1=="6"&&$2=="4")) s+=$3}END{print int(s/3.)}' temp1` @ fcp3 = `awk '{if (($1==$2)&&($1!="0")) s+=$3}END{print int(s/3.)}' temp1` @ fcm1 = `awk '{if (($1=="1"&&$2=="4")||($1=="2"&&$2=="6")||\ ($1=="3"&&$2=="5")||($1=="4"&&$2=="1")||($1=="6"&&$2=="2")||\ ($1=="5"&&$2=="3")) s+=$3}END{print int(s/3.)}' temp1` @ fcm2 = `awk '{if (($1=="1"&&$2=="5")||($1=="2"&&$2=="4")||\ ($1=="3"&&$2=="6")||($1=="5"&&$2=="1")||($1=="4"&&$2=="2")||\ ($1=="6"&&$2=="3")) s+=$3}END{print int(s/3.)}' temp1` @ fcm3 = `awk '{if (($1=="1"&&$2=="6")||($1=="2"&&$2=="5")||\ ($1=="3"&&$2=="4")||($1=="6"&&$2=="1")||($1=="5"&&$2=="2")||\ ($1=="4"&&$2=="3")) s+=$3}END{print int(s/3.)}' temp1` touch ref.frames.html if ($fc1 > 0) then echo "Non-coding in both models (nt): " $fc1 "
" >> ref.frames.html endif if ($fc2 > 0) then echo "Non-coding only in null model (codons): " $fc2 "
" >> ref.frames.html endif if ($fc3 > 0) then echo "Non-coding only in alternate model (codons): " $fc3 "
" \ >> ref.frames.html endif if ($fcp3 > 0) then echo "Alternate model coding in same frame as null model (codons): " $fcp3 \ "
" >> ref.frames.html endif if ($fcp1 > 0) then echo "Alternate model in +1 frame relative to null model (codons): " $fcp1 \ "
" >> ref.frames.html endif if ($fcp2 > 0) then echo "Alternate model in +2 frame relative to null model (codons): " $fcp2 \ "
" >> ref.frames.html endif if ($fcm1 > 0) then echo "Alternate model in -1 frame relative to null model (codons): " $fcm1 \ "
" >> ref.frames.html endif if ($fcm2 > 0) then echo "Alternate model in -2 frame relative to null model (codons): " $fcm2 \ "
" >> ref.frames.html endif if ($fcm3 > 0) then echo "Alternate model in -3 frame relative to null model (codons): " $fcm3 \ "
" >> ref.frames.html endif echo "
read-frame notation" >> ref.frames.html if ($mode == "1") then rm -f orfs.2 plot1.R pairs.dat ref.names foreach suffix (dat MLElog treesum stoplog gaplog gappos stoppos1 stoppos2 startpos1 startpos2 glitches runmean) rm -f ref.$suffix rm -f tree.suffix end foreach file (mlogd.table.html ref.info tree.info MLEnuc.png MLEnuc.pdf MLEnuc.eps MLEscatter.png MLEscatter.pdf MLEscatter.eps MLEtrack.png ref.frames.html) mv $file $number.$file end endif if ($mode == "1") then echo '
' >> $number.mlogd.table.html echo 'Statistics summary: ' echo 'statistics, ' echo "description.
" echo 'Likelihood ratio plot:' echo 'png, ' echo 'pdf, ' echo 'eps, ' echo "description.
" echo 'Nucleotide-by-nucleotide plot:' echo 'png, ' echo 'pdf, ' echo 'eps, ' echo "description.
" echo 'Nucleotide-by-nucleotide plot (zoomed in):' echo 'png, ' echo "description.
" echo 'Relative frames of null and alternate model CDSs: ' echo 'frames.
' endif if ($fcp3 > 0) then echo "Warning: Alternate model coding in same frame as null model" \ " for some codons. See " \ " note.
" endif if ($fcm2 > 0) then echo "Warning: Alternate model in -2 frame relative to null model" \ " for some codons. This particular frame is liable to give a false" \ "positive signal.
" endif if ($mode == "1") then echo '' echo 'likelihood ratio plot (png)' echo '' endif chmod a+rw ./* #Write rest of webpage. set id9 = `echo $newurl | sed 's/.*\///'` if ($mode == "0") then cat mlogd.table.html echo "
Description of table columns." echo "
Error log, " echo "(description of common error " echo "messages).
" echo "Relative frames of null and alternate " echo "model CDSs;" echo "Note on false positives;" echo "Frame notation.
" echo "Likelihood ratio plot:

Download " echo "png, " echo "eps, " echo "pdf, " echo "description.
" if (! $download) then echo "Add error bars and null/alternative model loci to this plot using " echo "Monte Carlo" echo "simulations " else echo "You may add error bars and null/alternative model loci to this plot " echo "using Monte Carlo simulations with the 'domcsims' script " endif echo "(see note)." echo "'
" echo "Nucleotide-by-nucleotide plot:

Download " echo "png, " echo "eps, " echo "pdf, " echo "description, " echo "raw plot data.
" if (! $download) then echo "Redraw" echo "this plot with different running-mean window size, and/or zoom in " else echo "You may redraw this plot with different running-mean window size, " echo "and/or zoom in with the 'redoMLEnuc' script " endif echo "(note).

" echo "'
" echo "Nucleotide-by-nucleotide plot (zoomed-in):

Download " echo "png, " echo "description, " echo "raw plot data.
" if (! $download) then echo "Redraw" echo "this plot with different running-mean window size, and/or zoom in " else echo "You may redraw this plot with different running-mean window size, " echo "and/or zoom in with the 'redoMLEnuc' script " endif echo "(note).

" echo "'
" endif chmod a+rw ./* echo Finished $number `date` >> errorlog.txt