#!/bin/csh
# Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin,
# New Zealand, aef(at)sanger.otago.ac.nz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#Runs mlogd programme for all six frames in sliding windows.
limit stacksize unlimited
@ argc = `echo $argv | awk '{print NF}'`
if ($argc != 2) then
echo "Usage: dosixframe newdir newurl
"
exit 1
endif
set olddir = `pwd`
set newdir = $1
set newurl = $2
cd $newdir
source mlogd.param
if (! $download) then
setenv PATH ${PATH}:/home/aef/bin
set formdir = "/aef/MLOGD/FORM"
else
setenv PATH ${PATH}:${HOME}/bin
set formdir = "../../FORM"
endif
echo -n "Preparing for six-frame plots...
"
cp ../../SCRIPTS/sixframe.plotdata.html .
cp orfs.1 orfs.1.bkp
mv orfs.2 orfs.2.bkp
set threshold = 0.75
set threshold = `echo $threshold | awk '{if (0.+$1<0.01) {print "0.01"} else {print $1}}' | awk '{if (0.+$1>0.99) {print "0.99"} else {print $1}}'`
@ range1 = $range1
@ range2 = $range2
#If wholeseq = 1 or 2 then do six frame plot for wholeseq.
#If wholeseq = 0 then do six frame plot for range1-range2.
set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key`
degapseq $refseq2.fasta temp.fasta -auto
set length = `infoseq temp.fasta -auto -only -length | tail -1`
if ($wholeseq) then
@ range1 = 1
@ range2 = $length
endif
@ step3 = `echo $step | awk '{print $1*3}'`
@ ncodons3 = `echo $ncodons | awk '{print $1*3}'`
if ($step3 < 15) then
echo "Error: Minimum step size is 5 codons. You entered $step.
"
exit 1
endif
rm -f temp.fasta
@ drop = `echo $range1 $range2 | awk '{print int(0.5*($2-$1+1))}'`
if ($ncodons3 > $drop) then
echo "Error: Window size '$ncodons3 nt' is too large relative to the requested nucleotide range '$range1 to $range2'. Therefore no plot data. Please ensure nucleotide range is at least twice the window size.
"
exit 1
endif
@ drop = `echo $step3 | awk '{print $1*2}'`
if ($drop > $ncodons3) then
echo "Error: Step size '$step3 nt' should be no greater than half the window size '$ncodons3 nt'.
"
exit 1
endif
#Estimated run-time
@ maxruntime = 500
@ w = `wc -l pairs.tree.dat | awk '{print $1}'`
@ runtime = `echo $range1 $range2 $step $ncodons $w | awk '{print 10+int(0.02*sqrt($5*$4)*($2-$1+1)/$3)}'`
echo "Estimated run-time: $runtime seconds.
"
if ($runtime > $maxruntime) then
echo "Estimated run-time is too long - your submission won't be processed.
"
echo "You could try decreasing the nucleotide range, or increasing the step size.
"
exit 1
endif
if ($wholeseq) then
set base1 = 1
set base2 = `infoseq $refseq2.fasta -auto -only -length | tail -1`
else
set base1 = `ntadjust $refseq2.fasta $range1`
set base2 = `ntadjust $refseq2.fasta $range2`
endif
echo $length $base1 $base2 > ref.info
#Make refseq to align coords conversion file.
tail +2 seq.001.fasta | sed 's/./&@/g' | sed 'y/@/\n/' | grep "." | \
awk '{if ($1!="-") s+=1}{if ($1!="-") {printf "%6s %6s %6s\n",NR,s,$1} else \
{printf "%6s %6s %6s\n",NR,"-",$1}}' > coords.txt
#Make lists of windows to use.
foreach frame (0 1 2)
@ s1 = `echo $frame $range1 | awk '{print $1+1+$2-($2%3)}'`
@ s2 = `echo $frame $range1 $ncodons | awk '{print $3*3+$1+$2-($2%3)}'`
@ s3 = `echo $s1 $s2 | awk '{print int(0.5*($1+$2))}'`
rm -f temp1
touch temp1
while ($s2 <= $range2)
echo $s1 $s2 $s3 >> temp1
@ s1 += $step3
@ s2 += $step3
@ s3 += $step3
end
awk '{printf "%s 2:%s:%s\n",$3,$1,$2}' temp1 > orfs.p.$frame
awk '{printf "%s 2:%s:%s\n",$3,$2,$1}' temp1 > orfs.m.$frame
if ($circular && $wholeseq) then
rm -f temp2
touch temp2
while ($s1 <= $length)
if ($s3 > $length) then
@ s3 -= $length
endif
echo $s1 $s2 $s3 >> temp2
@ s1 += $step3
@ s2 += $step3
@ s3 += $step3
end
awk '{printf "%s 4:%s:%s:%s:%s\n",$3,$1,"'"$length"'","1",$2-"'"$length"'"}' \
temp2 >> orfs.p.$frame
awk '{printf "%s 4:%s:%s:%s:%s\n",$3,$2-"'"$length"'","1","'"$length"'",$1}' \
temp2 >> orfs.m.$frame
endif
sort -n orfs.p.$frame | awk '{printf "%s+%s\n",$1,$2}' > temp3
mv temp3 orfs.p.$frame
sort -n orfs.m.$frame | awk '{printf "%s+%s\n",$1,$2}' > temp3
mv temp3 orfs.m.$frame
end
rm -f temp[12]
echo "...Done
"
#Calculate summed divergence of contributing sequence pairs.
touch orfs.2
cp pairs.tree.dat pairs.dat
awk '{print $2,$1}' pairs.tree.dat >> pairs.dat
echo 1 1 $length " " > setup.mlogd
echo $circular " " >> setup.mlogd
mlogd setup.mlogd >>& errorlog.txt || exit 1
set lambdasum = `head -2 mlogd_out.info | tail -1 | awk '{printf "%.2f\n",$1*0.25}'`
touch mlogd_out.treesum
awk '{print $1,$3*0.25}' mlogd_out.treesum > tree.lambda
awk '{if (0.+$12>0.) {print $13/$12} else {print "0"}}' mlogd_out.dat > div.dat
set lambdasum2 = `awk '{s+=$1}END{print s}' div.dat`
set maxsumdiv = `echo $lambdasum2 $ncodons3 | awk '{print $1*$2}'`
set threshold2 = `echo $threshold $maxsumdiv | awk '{print $1*$2}'`
foreach suffix (dat gaplog gappos glitches info MLElog stoplog stoppos1 stoppos2 startpos1 startpos2 treesum)
rm -f mlogd_out.$suffix
end
rm -f orfs.2 setup.mlogd pairs.dat
awk '{printf "%6i %10.4f\n",$1,$2}' tree.lambda > div.data.txt
#Run MLOGD for six frames.
echo -n "Doing six-frame plots...
"
cp pairs.tree.dat pairs.dat
awk '{print $2,$1}' pairs.tree.dat >> pairs.dat
foreach frame (p.0 p.1 p.2 m.0 m.1 m.2)
rm -f mle.$frame
touch mle.$frame
foreach j (`cat orfs.$frame`)
set i = `echo $j | awk -F+ '{print $2}'`
echo $i | sed 's/:/ /g' > orfs.2
set r1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -1`
set r2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | tail -1`
echo 2 $r1 $r2 " " > setup.mlogd
echo $circular " " >> setup.mlogd
mlogd setup.mlogd >>& errorlog.txt || exit 1
set totalmle = \
`awk '{s+=($4-$3)*$5}END{print s*0.25}' mlogd_out.dat`
set sumdiv = \
`paste div.dat mlogd_out.dat | awk '{s+=$1*$6}END{print s}'`
set refbase = `echo $j | awk -F+ '{print $1}'`
set base = `ntadjust $refseq2.fasta $refbase`
set num = `echo $i | awk -F: '{print $1}'`
if ($num == "4") then
set v1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -3 | tail -1`
set v2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -2 | tail -1`
else
set v1 = $r1
set v2 = $r2
endif
set t1 = `ntadjust $refseq2.fasta $v1`
set t2 = `ntadjust $refseq2.fasta $v2`
echo $base $totalmle $t1 $t2 $sumdiv >> mle.$frame
end
echo -n "$frame.."
end
awk '{printf "+0 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.0 \
> mle.data.txt
awk '{printf "+1 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.1 \
>> mle.data.txt
awk '{printf "+2 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.2 \
>> mle.data.txt
awk '{printf "-0 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.0 \
>> mle.data.txt
awk '{printf "-1 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.1 \
>> mle.data.txt
awk '{printf "-2 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.2 \
>> mle.data.txt
echo "...Done
"
echo -n "Calculating other plot data...
"
#Apply threshold and scale scores. The mle2.$frame are only used for the plot.
set length2 = `infoseq $refseq2.fasta -auto -only -length | tail -1`
foreach frame (p.0 p.1 p.2 m.0 m.1 m.2)
awk '{if (0.+$5>=0.+"'"$threshold2"'"&&0.+$5>0.) print $1,$2*"'"$maxsumdiv"'"/$5,$3,$4,$5}' mle.$frame | awk '{if (0.+$3>0.+$4) {printf "%s %s %s %s %s\n%s %s %s %s %s\n",$1,$2,$3,"'"$length2"'",$5,$1,$2,"1",$4,$5} else {print $0}}' > mle2.$frame
end
foreach suffix (dat gaplog glitches info MLElog stoplog treesum stoppos1 stoppos2 startpos1 startpos2)
rm -f mlogd_out.$suffix
end
rm -f setup.mlogd orfs.2
touch mlogd_out.gappos
#OK to use gappos file for last iteration. It's done for full alignment.
mv mlogd_out.gappos gappos
awk '{printf "%4i %6i\n",$1,$2}' gappos > gaps.data.txt
#Null model orfs. Change from refseq to alignment coords.
@ norfs1 = `wc -l orfs.1 | awk '{print $1}'`
if ($norfs1 > 0) then
cut -d " " -f 2- orfs.1 | sed 's/[^0-9]/x/g' | sed 'y/x/\n/' | grep "[0-9]" \
> temp
rm -f temp1; touch temp1
foreach i (`cat temp`)
echo `ntadjust $refseq2.fasta $i` >> temp1
end
set orf1x1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'`
set orf1x2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'`
else
set orf1x1 = "0"
set orf1x2 = "0"
endif
rm -f temp temp1
#Other parameters
set nseqs = `wc -l pairs.ref.dat | awk '{print $1+1}'`
set date = `date +%y-%m-%d`
cat mle2.[pm].[012] | awk '{printf "%14.6f\n",$2}' > temp1
echo 0 >> temp1
set y2min = `minmax temp1 | awk '{print $1}'`
set y2max = `minmax temp1 | awk '{print $2}'`
awk '{print $2}' pairs.ref.dat > temp1
rm -f temp2
touch temp2
foreach i (`cat temp1`)
set id = seq.$i
set name = `awk '{if ($2=="'"$id"'") print $1}' seqs.key`
echo $name >> temp2
end
set sss1 = `awk '{printf "%s,",(NR+1-0.5)/"'"$nseqs"'"}' temp2 | sed 's/,$//'`
set sss2 = `awk '{printf "@%s@,",$1}' temp2 | sed 's/,$//'`
rm -f temp[12]
#Calculate stop posfiles.
rm -f orfs.1 orfs.2
touch orfs.2
head -1 pairs.tree.dat > pairs.dat
echo 1 1 $length " " > setup.mlogd
echo $circular " " >> setup.mlogd
foreach frame (0 1 2)
@ l1 = `echo $frame | awk '{print $1+1}'`
@ l2 = `echo $frame | awk '{print "'"$length"'"-"'"$length"'"%3+$1}'`
if ($l2 > $length) then
@ l2 -= 3
endif
echo 2 $l1 $l2 > orfs.1
mlogd setup.mlogd >>& errorlog.txt || exit 1
touch mlogd_out.stoppos1 mlogd_out.startpos1
mv mlogd_out.stoppos1 stoppos.p.$frame
mv mlogd_out.startpos1 startpos.p.$frame
echo 2 $l2 $l1 > orfs.1
mlogd setup.mlogd >>& errorlog.txt || exit 1
touch mlogd_out.stoppos1 mlogd_out.startpos1
mv mlogd_out.stoppos1 stoppos.m.$frame
mv mlogd_out.startpos1 startpos.m.$frame
end
foreach suffix (dat gaplog gappos glitches info MLElog stoplog stoppos2 startpos2 treesum)
rm -f mlogd_out.$suffix
end
rm -f orfs.1 orfs.2 setup.mlogd pairs.dat
awk '{printf "+0 %4i %6i\n",$1,$2}' stoppos.p.0 > stops.data.txt
awk '{printf "+1 %4i %6i\n",$1,$2}' stoppos.p.1 >> stops.data.txt
awk '{printf "+2 %4i %6i\n",$1,$2}' stoppos.p.2 >> stops.data.txt
awk '{printf "-0 %4i %6i\n",$1,$2}' stoppos.m.0 >> stops.data.txt
awk '{printf "-1 %4i %6i\n",$1,$2}' stoppos.m.1 >> stops.data.txt
awk '{printf "-2 %4i %6i\n",$1,$2}' stoppos.m.2 >> stops.data.txt
awk '{printf "+0 %4i %6i\n",$1,$2}' startpos.p.0 > starts.data.txt
awk '{printf "+1 %4i %6i\n",$1,$2}' startpos.p.1 >> starts.data.txt
awk '{printf "+2 %4i %6i\n",$1,$2}' startpos.p.2 >> starts.data.txt
awk '{printf "-0 %4i %6i\n",$1,$2}' startpos.m.0 >> starts.data.txt
awk '{printf "-1 %4i %6i\n",$1,$2}' startpos.m.1 >> starts.data.txt
awk '{printf "-2 %4i %6i\n",$1,$2}' startpos.m.2 >> starts.data.txt
#Check for empty files to edit out of R script to stop R from barfing.
@ w = `wc -l gappos | awk '{print $1}'`
if ($w == 0) then
set kkk4 = "#"
else
set kkk4 = ""
endif
@ w = `wc -l tree.lambda | awk '{print $1}'`
if ($w == 0) then
set kkk5 = "#"
else
set kkk5 = ""
endif
@ w = `wc -l stoppos.p.0 | awk '{print $1}'`
if ($w == 0) then
set ppp1 = "#"
else
set ppp1 = ""
endif
@ w = `wc -l stoppos.p.1 | awk '{print $1}'`
if ($w == 0) then
set ppp2 = "#"
else
set ppp2 = ""
endif
@ w = `wc -l stoppos.p.2 | awk '{print $1}'`
if ($w == 0) then
set ppp3 = "#"
else
set ppp3 = ""
endif
@ w = `wc -l stoppos.m.0 | awk '{print $1}'`
if ($w == 0) then
set ppp4 = "#"
else
set ppp4 = ""
endif
@ w = `wc -l stoppos.m.1 | awk '{print $1}'`
if ($w == 0) then
set ppp5 = "#"
else
set ppp5 = ""
endif
@ w = `wc -l stoppos.m.2 | awk '{print $1}'`
if ($w == 0) then
set ppp6 = "#"
else
set ppp6 = ""
endif
@ w = `wc -l mle2.p.0 | awk '{print $1}'`
if ($w == 0) then
set jjj1 = "#"
else
set jjj1 = ""
endif
@ w = `wc -l mle2.p.1 | awk '{print $1}'`
if ($w == 0) then
set jjj2 = "#"
else
set jjj2 = ""
endif
@ w = `wc -l mle2.p.2 | awk '{print $1}'`
if ($w == 0) then
set jjj3 = "#"
else
set jjj3 = ""
endif
@ w = `wc -l mle2.m.0 | awk '{print $1}'`
if ($w == 0) then
set jjj4 = "#"
else
set jjj4 = ""
endif
@ w = `wc -l mle2.m.1 | awk '{print $1}'`
if ($w == 0) then
set jjj5 = "#"
else
set jjj5 = ""
endif
@ w = `wc -l mle2.m.2 | awk '{print $1}'`
if ($w == 0) then
set jjj6 = "#"
else
set jjj6 = ""
endif
echo "...Done
"
echo -n "Doing plots...
"
cat ../../SCRIPTS/sixframe.R | \
sed 's/nnn/'$nseqs'/g' | sed 's/ddd/'$date'/g' | \
sed 's/bbb1/'$base1'/g' | sed 's/bbb2/'$base2'/g' | \
sed 's/www/'$ncodons'/g' | sed 's/lll/'$step'/g' | \
sed 's/ttt/'$lambdasum'/g' | \
sed 's/orf1x1/'$orf1x1'/g' | sed 's/orf1x2/'$orf1x2'/g' | \
sed 's/#kkk4/'$kkk4'/g' | sed 's/#kkk5/'$kkk5'/g' | \
sed 's/#jjj1/'$jjj1'/g' | sed 's/#jjj2/'$jjj2'/g' | \
sed 's/#jjj3/'$jjj3'/g' | sed 's/#jjj4/'$jjj4'/g' | \
sed 's/#jjj5/'$jjj5'/g' | sed 's/#jjj6/'$jjj6'/g' | \
sed 's/#ppp1/'$ppp1'/g' | sed 's/#ppp2/'$ppp2'/g' | \
sed 's/#ppp3/'$ppp3'/g' | sed 's/#ppp4/'$ppp4'/g' | \
sed 's/#ppp5/'$ppp5'/g' | sed 's/#ppp6/'$ppp6'/g' | \
sed 's/yyy2max/'$y2max'/g' | sed 's/yyy2min/'$y2min'/g' | \
sed 's/sss1/'$sss1'/g' | sed 's/sss2/'$sss2'/g' | sed 's/@/"/g' | \
sed 's/%sss%/'$refseq'/g' | sed 's/%ggg%/'$title'/g' > plot3.R
R --no-save --slave < plot3.R >>& errorlog.txt
epstopdf sixframe.eps
echo "set nseqs = $nseqs" > sixframe.params
echo "set base1 = $base1" >> sixframe.params
echo "set base2 = $base2" >> sixframe.params
echo "set lambdasum = $lambdasum" >> sixframe.params
echo "set maxsumdiv = $maxsumdiv" >> sixframe.params
echo "set length2 = $length2" >> sixframe.params
echo "set orf1x1 = $orf1x1" >> sixframe.params
echo "set orf1x2 = $orf1x2" >> sixframe.params
echo "set sss1 = $sss1" >> sixframe.params
echo "set sss2 = $sss2" >> sixframe.params
echo "set kkk4 = "'"'$kkk4'"' >> sixframe.params
echo "set kkk5 = "'"'$kkk5'"' >> sixframe.params
echo "set ppp1 = "'"'$ppp1'"' >> sixframe.params
echo "set ppp2 = "'"'$ppp2'"' >> sixframe.params
echo "set ppp3 = "'"'$ppp3'"' >> sixframe.params
echo "set ppp4 = "'"'$ppp4'"' >> sixframe.params
echo "set ppp5 = "'"'$ppp5'"' >> sixframe.params
echo "set ppp6 = "'"'$ppp6'"' >> sixframe.params
cp orfs.1.bkp orfs.1
cp orfs.2.bkp orfs.2
echo "...Done
"
chmod a+rw ./*
#Write rest of webpage.
set id9 = `echo $newurl | sed 's/.*\///'`
echo "