#!/bin/csh
# Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin,
# New Zealand, aef(at)sanger.otago.ac.nz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#Runs mlogd programme for all ORFs.
limit stacksize unlimited
@ argc = `echo $argv | awk '{print NF}'`
if ($argc != 2) then
echo "Usage: doallorfs newdir newurl
"
exit 1
endif
set olddir = `pwd`
set newdir = $1
set newurl = $2
cd $newdir
source mlogd.param
if (! $download) then
setenv PATH ${PATH}:/home/aef/bin
set formdir = "/aef/MLOGD/FORM"
else
setenv PATH ${PATH}:${HOME}/bin
set formdir = "../../FORM"
endif
@ maxruntime = 300
set starttime = `date +%s`
@ range1 = $range1
@ range2 = $range2
set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key`
@ minorf3 = `echo $minorf | awk '{print $1*3}'`
if ($minorf3 < 60) then
echo "Error: Smallest allowed 'Minimum ORF length' is 20 codons. You entered $minorf.
"
exit 1
endif
if (! $wholeseq) then
@ drop = `echo $range1 $range2 | awk '{print $2-$1+1}'`
if ($drop < $minorf3) then
echo "Error: You selected the 'Use a given nucleotide range' option, with range '$range1 to $range2 nt'. But this is smaller than the 'Minimum ORF length' of $minorf codons that you entered. Hence no ORFs were found.
"
exit 1
endif
endif
#Find all ORFs.
if ($orftype == "1") then
set findtype = 2
echo -n "Searching for all stop-stop ORFs >= $minorf codons in reference sequence '$refseq' ...
"
else
set findtype = 3
echo -n "Searching for all start-stop ORFs >= $minorf codons in reference sequence '$refseq' ...
"
endif
if ($circular == "0") then
set circ = "no"
else
set circ = "yes"
endif
degapseq $refseq2.fasta temp.fasta -auto
getorf temp.fasta allorfs.1 -minsize $minorf3 -table 0 -find $findtype \
-circular $circ -reverse yes -auto
set length = `infoseq temp.fasta -auto -only -length | tail -1`
rm -f temp.fasta
grep "^>" allorfs.1 | sed 's/\[//' | sed 's/\]//' | sed 's/>//' | \
sed 's/(REVERSE SENSE) */R/' | \
sed 's/(ORF crosses the breakpoint) */B/' | \
awk '{if ($5=="B") {print $2,$4,"N","B"} \
else if ($5=="R") {print $2,$4,"R","N"} \
else if ($5=="RB") {print $2,$4,"R","B"} \
else {print $2,$4,"N","N"}}' | \
awk '{print sqrt(($2-$1)*($2-$1)),$0}' | sort -nr | \
awk '{if ($4=="N"&&$5=="B") {printf "%s join(%s..%s,%s..%s)\n",$0,$2,"'"$length"'","1",$3-"'"$length"'"} else if ($4=="R"&&$5=="B") {printf "%s join(%s..%s,%s..%s)\n",$0,$2-"'"$length"'","1","'"$length"'",$3} else {printf "%s %s..%s\n",$0,$2,$3}}' > allorfs.2
awk '{print $6}' allorfs.2 | sed 's/[^0-9]/ /g' | awk '{print NF,$0}' | \
awk '{if ($1=="4"&&0+$2<=0+"'"$length"'"&&0+$3<=0+"'"$length"'"&&0+$4<=0+"'"$length"'"&&0+$5<=0+"'"$length"'") {printf "%s:%s:%s:%s:%s\n",$1,$2,$3,$4,$5} else if ($1=="2"&&0+$2<=0+"'"$length"'"&&0+$3<=0+"'"$length"'") {printf "%s:%s:%s\n",$1,$2,$3}}' > temp1
awk '{if ($1=="4") {printf "%s:%s:%s:%s:%s\n",$1,$2,$3,$4,$5} else if ($1=="2") {printf "%s:%s:%s\n",$1,$2,$3}}' orfs.1 > temp2
foreach orf0 (`cat temp2`)
sed 's/'$orf0'//' temp1 > temp3
mv temp3 temp1
end
grep ":" temp1 > allorfs.3
rm -f temp[123]
echo "...Done
"
echo "
Query ORFs:
"
@ w = `wc -l allorfs.3 | awk '{print $1}'`
if ($w == 0) then
echo "
No ORFs found.
"
exit 1
endif
sed 's/:/ /g' allorfs.3 | awk '{if ($1=="4") {printf "%s: join(%s..%s,%s..%s)
\n",NR,$2,$3,$4,$5} else if ($1=="2") {printf "%s: %s..%s
\n",NR,$2,$3}}'
if (! $wholeseq) then
echo "
You selected 'Use a given nucleotide range' option. Therefore"
echo " query ORFs outside the given range (i.e. $range1 to $range2 nt)"
echo " will not be used. Query ORFs that overlap the boundary of this"
echo " region will be used only if the portion within $range1 to $range2 nt"
echo " is at least your selected minimum ORF length (i.e. $minorf codons)."
echo " Statistics will be calculated only for the portion within $range1 to"
echo " $range2 nt.
"
endif
echo "
Running...Please be patient...
"
#Run MLOGD for ref-nonref pairs.
#Note - rewrite setup.mlogd for each ORF -> relies on setup.mlogd not being
# rewritten in domlogd (just written in prepmlogd).
echo '
'
@ number = 0
foreach i (`cat allorfs.3`)
set currenttime = `date +%s`
@ totaltime = `echo $starttime $currenttime | awk '{print $2-$1}'`
if ($totaltime > $maxruntime) then
echo '
'
echo "Warning: Exceeded maximum allowed run-time; the remaining ORFs will not be processed. If you really want to investigate any of the remaining ORFs, then try using the 'Use a given nucleotide range' option to analyse the input alignment in several sections. Note that sometimes a run-time overflow may be the result of server overload, in which case you could try running your job at a different time.
"
exit 0
endif
@ number += 1
@ r1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -1`
@ r2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | tail -1`
set ok = 1
if (! $wholeseq) then
@ seg = `echo $i | awk -F: '{print $1}'`
if ("2" == $seg) then
if ($r1 <= $range2 && $r2 >= $range1) then
if ($r1 < $range1) then
@ r1 = $range1
endif
if ($r2 > $range2) then
@ r2 = $range2
endif
@ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'`
if ($olap < $minorf3) then
set ok = 0
endif
else
set ok = 0
endif
else
@ r1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -2 | tail -1`
@ r2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | tail -2 | head -1`
if ($range1 > $r1 && $range2 < $r2) then
set ok = 0
else
if ($range2 <= $r1 || $range1 >= $r2) then
@ r1 = $range1
@ r2 = $range2
@ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'`
else if ($range2 < $r2) then
@ r2 = $r1
@ r1 = $range1
@ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'`
else if ($range1 > $r1) then
@ r1 = $r2
@ r2 = $range2
@ olap = `echo $r1 $r2 | awk '{print $2-$1+1}'`
else
@ olap = `echo $r1 $r2 $range1 $range2 | awk '{print 2+($1+$4)-($2+$3)}'`
@ r1 = $range1
@ r2 = $range2
endif
if ($olap < $minorf3) then
set ok = 0
endif
endif
endif
endif
if ($ok) then
echo ''
echo -n "Query ORF $number"
echo $i | sed 's/:/ /g' | awk '{if ($1=="4") \
{printf ": join(%s..%s,%s..%s)",$2,$3,$4,$5} \
else if ($1=="2") {printf ": %s..%s",$2,$3}}'
echo "
"
echo $i | sed 's/:/ /g' > orfs.2
echo 2 $r1 $r2 " " > setup.mlogd
echo $circular " " >> setup.mlogd
if (! $download) then
$olddir/domlogd . $newurl 1 $number || exit 1
else
domlogd . $newurl 1 $number || exit 1
endif
endif
end
echo ''
chmod a+rw ./*
#Write rest of webpage.
echo " Error log "
echo "(description of common error messages)."
echo " Note on false positives. "
echo "Note on interpretation of likelihood "
echo "ratios.
Note that the zoomed-in nucleotide-by-nucleotide "
echo "plots can be very wide and may not display properly in some browsers; "
echo "try 'Save Link Target As...', to save to a file instead. "
echo "If any of these query ORFs look interesting (e.g. "
echo "consistently positive log likelihood ratios) you can "
if (! $download) then
echo "return to the MLOGD server, and enter these "
echo "ORFs one at a time, using the 'Test input query CDSs' option. "
echo "At the results page for each ORF, you will have the option to add "
echo "Monte Carlo simulations to the plots to derive error bars and "
echo "alternative probability estimates.
"
else
echo "use the 'domlogd' script on one ORF at a time. Following that you "
echo "may like to use the 'domcsims' script to add Monte Carlo simulations "
echo "to the plots to derive error bars and alternative probability "
echo "estimates.
"
endif
chmod a+rw ./*
|