#!/bin/csh # Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin, # New Zealand, aef(at)sanger.otago.ac.nz # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License (version 2) as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. #Runs mlogd programme for all ORFs. limit stacksize unlimited @ argc = `echo $argv | awk '{print NF}'` if ($argc != 2) then echo "Usage: doallorfs newdir newurl
" exit 1 endif set olddir = `pwd` set newdir = $1 set newurl = $2 cd $newdir source mlogd.param if (! $download) then setenv PATH ${PATH}:/home/aef/bin set formdir = "/aef/MLOGD/FORM" else setenv PATH ${PATH}:${HOME}/bin set formdir = "../../FORM" endif @ maxruntime = 300 set starttime = `date +%s` @ range1 = $range1 @ range2 = $range2 set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key` @ minorf3 = `echo $minorf | awk '{print $1*3}'` if ($minorf3 < 60) then echo "Error: Smallest allowed 'Minimum ORF length' is 20 codons. You entered $minorf.

" exit 1 endif if (! $wholeseq) then @ drop = `echo $range1 $range2 | awk '{print $2-$1+1}'` if ($drop < $minorf3) then echo "Error: You selected the 'Use a given nucleotide range' option, with range '$range1 to $range2 nt'. But this is smaller than the 'Minimum ORF length' of $minorf codons that you entered. Hence no ORFs were found.

" exit 1 endif endif #Find all ORFs. if ($orftype == "1") then set findtype = 2 echo -n "Searching for all stop-stop ORFs >= $minorf codons in reference sequence '$refseq' ...
" else set findtype = 3 echo -n "Searching for all start-stop ORFs >= $minorf codons in reference sequence '$refseq' ...
" endif if ($circular == "0") then set circ = "no" else set circ = "yes" endif degapseq $refseq2.fasta temp.fasta -auto getorf temp.fasta allorfs.1 -minsize $minorf3 -table 0 -find $findtype \ -circular $circ -reverse yes -auto set length = `infoseq temp.fasta -auto -only -length | tail -1` rm -f temp.fasta grep "^>" allorfs.1 | sed 's/\[//' | sed 's/\]//' | sed 's/>//' | \ sed 's/(REVERSE SENSE) */R/' | \ sed 's/(ORF crosses the breakpoint) */B/' | \ awk '{if ($5=="B") {print $2,$4,"N","B"} \ else if ($5=="R") {print $2,$4,"R","N"} \ else if ($5=="RB") {print $2,$4,"R","B"} \ else {print $2,$4,"N","N"}}' | \ awk '{print sqrt(($2-$1)*($2-$1)),$0}' | sort -nr | \ awk '{if ($4=="N"&&$5=="B") {printf "%s join(%s..%s,%s..%s)\n",$0,$2,"'"$length"'","1",$3-"'"$length"'"} else if ($4=="R"&&$5=="B") {printf "%s join(%s..%s,%s..%s)\n",$0,$2-"'"$length"'","1","'"$length"'",$3} else {printf "%s %s..%s\n",$0,$2,$3}}' > allorfs.2 awk '{print $6}' allorfs.2 | sed 's/[^0-9]/ /g' | awk '{print NF,$0}' | \ awk '{if ($1=="4"&&0+$2<=0+"'"$length"'"&&0+$3<=0+"'"$length"'"&&0+$4<=0+"'"$length"'"&&0+$5<=0+"'"$length"'") {printf "%s:%s:%s:%s:%s\n",$1,$2,$3,$4,$5} else if ($1=="2"&&0+$2<=0+"'"$length"'"&&0+$3<=0+"'"$length"'") {printf "%s:%s:%s\n",$1,$2,$3}}' > temp1 awk '{if ($1=="4") {printf "%s:%s:%s:%s:%s\n",$1,$2,$3,$4,$5} else if ($1=="2") {printf "%s:%s:%s\n",$1,$2,$3}}' orfs.1 > temp2 foreach orf0 (`cat temp2`) sed 's/'$orf0'//' temp1 > temp3 mv temp3 temp1 end grep ":" temp1 > allorfs.3 rm -f temp[123] echo "...Done
" echo "
Query ORFs:
" @ w = `wc -l allorfs.3 | awk '{print $1}'` if ($w == 0) then echo "
No ORFs found.
" exit 1 endif sed 's/:/ /g' allorfs.3 | awk '{if ($1=="4") {printf "%s: join(%s..%s,%s..%s)
\n",NR,$2,$3,$4,$5} else if ($1=="2") {printf "%s: %s..%s
\n",NR,$2,$3}}' if (! $wholeseq) then echo "
You selected 'Use a given nucleotide range' option. Therefore" echo " query ORFs outside the given range (i.e. $range1 to $range2 nt)" echo " will not be used. Query ORFs that overlap the boundary of this" echo " region will be used only if the portion within $range1 to $range2 nt" echo " is at least your selected minimum ORF length (i.e. $minorf codons)." echo " Statistics will be calculated only for the portion within $range1 to" echo " $range2 nt.
" endif echo "
Running...Please be patient...
" #Run MLOGD for ref-nonref pairs. #Note - rewrite setup.mlogd for each ORF -> relies on setup.mlogd not being # rewritten in domlogd (just written in prepmlogd). echo '
' @ number = 0 foreach i (`cat allorfs.3`) set currenttime = `date +%s` @ totaltime = `echo $starttime $currenttime | awk '{print $2-$1}'` if ($totaltime > $maxruntime) then echo '

' echo "Warning: Exceeded maximum allowed run-time; the remaining ORFs will not be processed. If you really want to investigate any of the remaining ORFs, then try using the 'Use a given nucleotide range' option to analyse the input alignment in several sections. Note that sometimes a run-time overflow may be the result of server overload, in which case you could try running your job at a different time.
" exit 0 endif @ number += 1 @ r1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -1` @ r2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | tail -1` set ok = 1 if (! $wholeseq) then @ seg = `echo $i | awk -F: '{print $1}'` if ("2" == $seg) then if ($r1 <= $range2 && $r2 >= $range1) then if ($r1 < $range1) then @ r1 = $range1 endif if ($r2 > $range2) then @ r2 = $range2 endif @ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'` if ($olap < $minorf3) then set ok = 0 endif else set ok = 0 endif else @ r1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -2 | tail -1` @ r2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | tail -2 | head -1` if ($range1 > $r1 && $range2 < $r2) then set ok = 0 else if ($range2 <= $r1 || $range1 >= $r2) then @ r1 = $range1 @ r2 = $range2 @ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'` else if ($range2 < $r2) then @ r2 = $r1 @ r1 = $range1 @ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'` else if ($range1 > $r1) then @ r1 = $r2 @ r2 = $range2 @ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'` else @ olap = `echo $r1 $r2 $range1 $range2 | awk '{print 2+($1+$4)-($2+$3)}'` @ r1 = $range1 @ r2 = $range2 endif if ($olap < $minorf3) then set ok = 0 endif endif endif endif if ($ok) then echo '' echo -n "Query ORF $number" echo $i | sed 's/:/ /g' | awk '{if ($1=="4") \ {printf ": join(%s..%s,%s..%s)",$2,$3,$4,$5} \ else if ($1=="2") {printf ": %s..%s",$2,$3}}' echo "

" echo $i | sed 's/:/ /g' > orfs.2 echo 2 $r1 $r2 " " > setup.mlogd echo $circular " " >> setup.mlogd if (! $download) then $olddir/domlogd . $newurl 1 $number || exit 1 else domlogd . $newurl 1 $number || exit 1 endif endif end echo '' chmod a+rw ./* #Write rest of webpage. echo "
Error log " echo "(description of common error messages)." echo "
Note on false positives.
" echo "Note on interpretation of likelihood " echo "ratios.

Note that the zoomed-in nucleotide-by-nucleotide " echo "plots can be very wide and may not display properly in some browsers; " echo "try 'Save Link Target As...', to save to a file instead.
" echo "If any of these query ORFs look interesting (e.g. " echo "consistently positive log likelihood ratios) you can " if (! $download) then echo "return to the MLOGD server, and enter these " echo "ORFs one at a time, using the 'Test input query CDSs' option. " echo "At the results page for each ORF, you will have the option to add " echo "Monte Carlo simulations to the plots to derive error bars and " echo "alternative probability estimates.

" else echo "use the 'domlogd' script on one ORF at a time. Following that you " echo "may like to use the 'domcsims' script to add Monte Carlo simulations " echo "to the plots to derive error bars and alternative probability " echo "estimates.

" endif chmod a+rw ./*