#!/bin/csh

# Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin, 
# New Zealand, aef(at)sanger.otago.ac.nz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.

#Runs mlogd programme for all six frames in sliding windows.

limit stacksize unlimited

@ argc = `echo $argv | awk '{print NF}'`
if ($argc != 2) then
  echo "Usage: dosixframe newdir newurl<br>"
  exit 1
endif

set olddir = `pwd`
set newdir = $1
set newurl = $2
cd $newdir
source mlogd.param

if (! $download) then
  setenv PATH ${PATH}:/home/aef/bin
  set formdir = "/aef/MLOGD/FORM"
else
  setenv PATH ${PATH}:${HOME}/bin
  set formdir = "../../FORM"
endif

echo -n "Preparing for six-frame plots...<br>"

cp ../../SCRIPTS/sixframe.plotdata.html .
cp orfs.1 orfs.1.bkp
mv orfs.2 orfs.2.bkp
set threshold = 0.75
set threshold = `echo $threshold | awk '{if (0.+$1<0.01) {print "0.01"} else {print $1}}' | awk '{if (0.+$1>0.99) {print "0.99"} else {print $1}}'`
@ range1 = $range1
@ range2 = $range2
#If wholeseq = 1 or 2 then do six frame plot for wholeseq.
#If wholeseq = 0 then do six frame plot for range1-range2.
set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key`
degapseq $refseq2.fasta temp.fasta -auto
set length = `infoseq temp.fasta -auto -only -length | tail -1`
if ($wholeseq) then 
  @ range1 = 1
  @ range2 = $length
endif
@ step3 = `echo $step | awk '{print $1*3}'`
@ ncodons3 = `echo $ncodons | awk '{print $1*3}'`
if ($step3 < 15) then
  echo "Error: Minimum step size is 5 codons.  You entered $step.<br><br>"
  exit 1
endif
rm -f temp.fasta

@ drop = `echo $range1 $range2 | awk '{print int(0.5*($2-$1+1))}'`
if ($ncodons3 > $drop) then
  echo "Error: Window size '$ncodons3 nt' is too large relative to the requested nucleotide range '$range1 to $range2'.  Therefore no plot data.  Please ensure nucleotide range is at least twice the window size.<br><br>"
  exit 1
endif
@ drop = `echo $step3 | awk '{print $1*2}'`
if ($drop > $ncodons3) then
  echo "Error: Step size '$step3 nt' should be no greater than half the window size '$ncodons3 nt'.<br><br>"
  exit 1
endif

#Estimated run-time
@ maxruntime = 500
@ w = `wc -l pairs.tree.dat | awk '{print $1}'`
@ runtime = `echo $range1 $range2 $step $ncodons $w | awk '{print 10+int(0.02*sqrt($5*$4)*($2-$1+1)/$3)}'`
echo "<b>Estimated run-time: $runtime seconds.</b><br>"
if ($runtime > $maxruntime) then
  echo "Estimated run-time is too long - your submission won't be processed.<br>"
  echo "You could try decreasing the nucleotide range, or increasing the step size.<br><br>"
  exit 1
endif

if ($wholeseq) then
  set base1 = 1
  set base2 = `infoseq $refseq2.fasta -auto -only -length | tail -1`
else
  set base1 = `ntadjust $refseq2.fasta $range1`
  set base2 = `ntadjust $refseq2.fasta $range2`
endif
echo $length $base1 $base2 > ref.info

#Make refseq to align coords conversion file.
tail +2 seq.001.fasta | sed 's/./&@/g' | sed 'y/@/\n/' | grep "." | \
  awk '{if ($1!="-") s+=1}{if ($1!="-") {printf "%6s %6s %6s\n",NR,s,$1} else \
  {printf "%6s %6s %6s\n",NR,"-",$1}}' > coords.txt

#Make lists of windows to use.
foreach frame (0 1 2)
  @ s1 = `echo $frame $range1 | awk '{print $1+1+$2-($2%3)}'`
  @ s2 = `echo $frame $range1 $ncodons | awk '{print $3*3+$1+$2-($2%3)}'`
  @ s3 = `echo $s1 $s2 | awk '{print int(0.5*($1+$2))}'`
  rm -f temp1
  touch temp1
  while ($s2 <= $range2)
    echo $s1 $s2 $s3 >> temp1
    @ s1 += $step3
    @ s2 += $step3
    @ s3 += $step3
  end
  awk '{printf "%s 2:%s:%s\n",$3,$1,$2}' temp1 > orfs.p.$frame
  awk '{printf "%s 2:%s:%s\n",$3,$2,$1}' temp1 > orfs.m.$frame
  if ($circular && $wholeseq) then
    rm -f temp2
    touch temp2
    while ($s1 <= $length)
      if ($s3 > $length) then
        @ s3 -= $length
      endif
      echo $s1 $s2 $s3 >> temp2
      @ s1 += $step3
      @ s2 += $step3
      @ s3 += $step3
    end
    awk '{printf "%s 4:%s:%s:%s:%s\n",$3,$1,"'"$length"'","1",$2-"'"$length"'"}' \
      temp2 >> orfs.p.$frame
    awk '{printf "%s 4:%s:%s:%s:%s\n",$3,$2-"'"$length"'","1","'"$length"'",$1}' \
      temp2 >> orfs.m.$frame
  endif
  sort -n orfs.p.$frame | awk '{printf "%s+%s\n",$1,$2}' > temp3
  mv temp3 orfs.p.$frame
  sort -n orfs.m.$frame | awk '{printf "%s+%s\n",$1,$2}' > temp3
  mv temp3 orfs.m.$frame
end
rm -f temp[12]

echo "...Done<br><br>"

#Calculate summed divergence of contributing sequence pairs.
touch orfs.2
cp pairs.tree.dat pairs.dat
awk '{print $2,$1}' pairs.tree.dat >> pairs.dat
echo 1 1 $length " " > setup.mlogd
echo $circular " " >> setup.mlogd
mlogd setup.mlogd >>& errorlog.txt || exit 1
set lambdasum = `head -2 mlogd_out.info | tail -1 | awk '{printf "%.2f\n",$1*0.25}'`
touch mlogd_out.treesum
awk '{print $1,$3*0.25}' mlogd_out.treesum > tree.lambda
awk '{if (0.+$12>0.) {print $13/$12} else {print "0"}}' mlogd_out.dat > div.dat
set lambdasum2 = `awk '{s+=$1}END{print s}' div.dat`
set maxsumdiv = `echo $lambdasum2 $ncodons3 | awk '{print $1*$2}'`
set threshold2 = `echo $threshold $maxsumdiv | awk '{print $1*$2}'`
foreach suffix (dat gaplog gappos glitches info MLElog stoplog stoppos1 stoppos2 startpos1 startpos2 treesum)
  rm -f mlogd_out.$suffix
end
rm -f orfs.2 setup.mlogd pairs.dat
awk '{printf "%6i %10.4f\n",$1,$2}' tree.lambda > div.data.txt

#Run MLOGD for six frames.
echo -n "Doing six-frame plots...<br>"
cp pairs.tree.dat pairs.dat
awk '{print $2,$1}' pairs.tree.dat >> pairs.dat
foreach frame (p.0 p.1 p.2 m.0 m.1 m.2)
  rm -f mle.$frame
  touch mle.$frame
  foreach j (`cat orfs.$frame`)
    set i = `echo $j | awk -F+ '{print $2}'`
    echo $i | sed 's/:/ /g' > orfs.2
    set r1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -1`
    set r2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | tail -1`
    echo 2 $r1 $r2 " " >  setup.mlogd
    echo $circular " " >> setup.mlogd
    mlogd setup.mlogd >>& errorlog.txt || exit 1
    set totalmle = \
      `awk '{s+=($4-$3)*$5}END{print s*0.25}' mlogd_out.dat`
    set sumdiv = \
      `paste div.dat mlogd_out.dat | awk '{s+=$1*$6}END{print s}'`
    set refbase = `echo $j | awk -F+ '{print $1}'`
    set base = `ntadjust $refseq2.fasta $refbase`
    set num = `echo $i | awk -F: '{print $1}'`
    if ($num == "4") then
      set v1 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -3 | tail -1`
      set v2 = `echo $i | cut -d ":" -f 2- | sed 'y/:/\n/' | grep "[0-9]" | sort -n | head -2 | tail -1`
    else
      set v1 = $r1
      set v2 = $r2
    endif
    set t1 = `ntadjust $refseq2.fasta $v1`
    set t2 = `ntadjust $refseq2.fasta $v2`
    echo $base $totalmle $t1 $t2 $sumdiv >> mle.$frame
  end
  echo -n "$frame.."
end
awk '{printf "+0 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.0 \
  >  mle.data.txt
awk '{printf "+1 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.1 \
  >> mle.data.txt
awk '{printf "+2 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.p.2 \
  >> mle.data.txt
awk '{printf "-0 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.0 \
  >> mle.data.txt
awk '{printf "-1 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.1 \
  >> mle.data.txt
awk '{printf "-2 %7i %9.2f %7i %7i %12.2f\n",$1,$2,$3,$4,$5}' mle.m.2 \
  >> mle.data.txt
echo "...Done<br><br>"

echo -n "Calculating other plot data...<br>"

#Apply threshold and scale scores.  The mle2.$frame are only used for the plot.
set length2 = `infoseq $refseq2.fasta -auto -only -length | tail -1`
foreach frame (p.0 p.1 p.2 m.0 m.1 m.2)
  awk '{if (0.+$5>=0.+"'"$threshold2"'"&&0.+$5>0.) print $1,$2*"'"$maxsumdiv"'"/$5,$3,$4,$5}' mle.$frame | awk '{if (0.+$3>0.+$4) {printf "%s %s %s %s %s\n%s %s %s %s %s\n",$1,$2,$3,"'"$length2"'",$5,$1,$2,"1",$4,$5} else {print $0}}'  > mle2.$frame
end

foreach suffix (dat gaplog glitches info MLElog stoplog treesum stoppos1 stoppos2 startpos1 startpos2)
  rm -f mlogd_out.$suffix
end
rm -f setup.mlogd orfs.2
touch mlogd_out.gappos
#OK to use gappos file for last iteration.  It's done for full alignment.
mv mlogd_out.gappos gappos
awk '{printf "%4i %6i\n",$1,$2}' gappos > gaps.data.txt

#Null model orfs.  Change from refseq to alignment coords.
@ norfs1 = `wc -l orfs.1 | awk '{print $1}'`
if ($norfs1 > 0) then
  cut -d " " -f 2- orfs.1 | sed 's/[^0-9]/x/g' | sed 'y/x/\n/' | grep "[0-9]" \
    > temp
  rm -f temp1; touch temp1
  foreach i (`cat temp`)
    echo `ntadjust $refseq2.fasta $i` >> temp1
  end
  set orf1x1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'`
  set orf1x2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'`
else
  set orf1x1 = "0"
  set orf1x2 = "0"
endif
rm -f temp temp1

#Other parameters
set nseqs = `wc -l pairs.ref.dat | awk '{print $1+1}'`
set date = `date +%y-%m-%d`
cat mle2.[pm].[012] | awk '{printf "%14.6f\n",$2}' > temp1
echo 0 >> temp1
set y2min = `minmax temp1 | awk '{print $1}'`
set y2max = `minmax temp1 | awk '{print $2}'`
awk '{print $2}' pairs.ref.dat > temp1
rm -f temp2
touch temp2
foreach i (`cat temp1`)
  set id = seq.$i
  set name = `awk '{if ($2=="'"$id"'") print $1}' seqs.key`
  echo $name >> temp2
end
set sss1 = `awk '{printf "%s,",(NR+1-0.5)/"'"$nseqs"'"}' temp2 | sed 's/,$//'`
set sss2 = `awk '{printf "@%s@,",$1}' temp2 | sed 's/,$//'`
rm -f temp[12]

#Calculate stop posfiles.  
rm -f orfs.1 orfs.2
touch orfs.2
head -1 pairs.tree.dat > pairs.dat
echo 1 1 $length " " >  setup.mlogd
echo $circular " " >> setup.mlogd
foreach frame (0 1 2)
  @ l1 = `echo $frame | awk '{print $1+1}'`
  @ l2 = `echo $frame | awk '{print "'"$length"'"-"'"$length"'"%3+$1}'`
  if ($l2 > $length) then
    @ l2 -= 3
  endif 
  echo 2 $l1 $l2 > orfs.1
  mlogd setup.mlogd >>& errorlog.txt || exit 1
  touch mlogd_out.stoppos1 mlogd_out.startpos1
  mv mlogd_out.stoppos1 stoppos.p.$frame
  mv mlogd_out.startpos1 startpos.p.$frame
  echo 2 $l2 $l1 > orfs.1
  mlogd setup.mlogd >>& errorlog.txt || exit 1
  touch mlogd_out.stoppos1 mlogd_out.startpos1
  mv mlogd_out.stoppos1 stoppos.m.$frame
  mv mlogd_out.startpos1 startpos.m.$frame
end
foreach suffix (dat gaplog gappos glitches info MLElog stoplog stoppos2 startpos2 treesum)
  rm -f mlogd_out.$suffix
end
rm -f orfs.1 orfs.2 setup.mlogd pairs.dat
awk '{printf "+0 %4i %6i\n",$1,$2}' stoppos.p.0 >  stops.data.txt
awk '{printf "+1 %4i %6i\n",$1,$2}' stoppos.p.1 >> stops.data.txt
awk '{printf "+2 %4i %6i\n",$1,$2}' stoppos.p.2 >> stops.data.txt
awk '{printf "-0 %4i %6i\n",$1,$2}' stoppos.m.0 >> stops.data.txt
awk '{printf "-1 %4i %6i\n",$1,$2}' stoppos.m.1 >> stops.data.txt
awk '{printf "-2 %4i %6i\n",$1,$2}' stoppos.m.2 >> stops.data.txt
awk '{printf "+0 %4i %6i\n",$1,$2}' startpos.p.0 >  starts.data.txt
awk '{printf "+1 %4i %6i\n",$1,$2}' startpos.p.1 >> starts.data.txt
awk '{printf "+2 %4i %6i\n",$1,$2}' startpos.p.2 >> starts.data.txt
awk '{printf "-0 %4i %6i\n",$1,$2}' startpos.m.0 >> starts.data.txt
awk '{printf "-1 %4i %6i\n",$1,$2}' startpos.m.1 >> starts.data.txt
awk '{printf "-2 %4i %6i\n",$1,$2}' startpos.m.2 >> starts.data.txt

#Check for empty files to edit out of R script to stop R from barfing.
@ w = `wc -l gappos | awk '{print $1}'`
if ($w == 0) then
  set kkk4 = "#"
else
  set kkk4 = ""
endif
@ w = `wc -l tree.lambda | awk '{print $1}'`
if ($w == 0) then
  set kkk5 = "#"
else
  set kkk5 = ""
endif

@ w = `wc -l stoppos.p.0 | awk '{print $1}'`
if ($w == 0) then
  set ppp1 = "#"
else
  set ppp1 = ""
endif
@ w = `wc -l stoppos.p.1 | awk '{print $1}'`
if ($w == 0) then
  set ppp2 = "#"
else
  set ppp2 = ""
endif
@ w = `wc -l stoppos.p.2 | awk '{print $1}'`
if ($w == 0) then
  set ppp3 = "#"
else
  set ppp3 = ""
endif
@ w = `wc -l stoppos.m.0 | awk '{print $1}'`
if ($w == 0) then
  set ppp4 = "#"
else
  set ppp4 = ""
endif
@ w = `wc -l stoppos.m.1 | awk '{print $1}'`
if ($w == 0) then
  set ppp5 = "#"
else
  set ppp5 = ""
endif
@ w = `wc -l stoppos.m.2 | awk '{print $1}'`
if ($w == 0) then
  set ppp6 = "#"
else
  set ppp6 = ""
endif

@ w = `wc -l mle2.p.0 | awk '{print $1}'`
if ($w == 0) then
  set jjj1 = "#"
else
  set jjj1 = ""
endif
@ w = `wc -l mle2.p.1 | awk '{print $1}'`
if ($w == 0) then
  set jjj2 = "#"
else
  set jjj2 = ""
endif
@ w = `wc -l mle2.p.2 | awk '{print $1}'`
if ($w == 0) then
  set jjj3 = "#"
else
  set jjj3 = ""
endif
@ w = `wc -l mle2.m.0 | awk '{print $1}'`
if ($w == 0) then
  set jjj4 = "#"
else
  set jjj4 = ""
endif
@ w = `wc -l mle2.m.1 | awk '{print $1}'`
if ($w == 0) then
  set jjj5 = "#"
else
  set jjj5 = ""
endif
@ w = `wc -l mle2.m.2 | awk '{print $1}'`
if ($w == 0) then
  set jjj6 = "#"
else
  set jjj6 = ""
endif

echo "...Done<br><br>"

echo -n "Doing plots...<br>"

cat ../../SCRIPTS/sixframe.R | \
  sed 's/nnn/'$nseqs'/g' | sed 's/ddd/'$date'/g' | \
  sed 's/bbb1/'$base1'/g' | sed 's/bbb2/'$base2'/g' | \
  sed 's/www/'$ncodons'/g' | sed 's/lll/'$step'/g' | \
  sed 's/ttt/'$lambdasum'/g' | \
  sed 's/orf1x1/'$orf1x1'/g' | sed 's/orf1x2/'$orf1x2'/g' | \
  sed 's/#kkk4/'$kkk4'/g' | sed 's/#kkk5/'$kkk5'/g' | \
  sed 's/#jjj1/'$jjj1'/g' | sed 's/#jjj2/'$jjj2'/g' | \
  sed 's/#jjj3/'$jjj3'/g' | sed 's/#jjj4/'$jjj4'/g' | \
  sed 's/#jjj5/'$jjj5'/g' | sed 's/#jjj6/'$jjj6'/g' | \
  sed 's/#ppp1/'$ppp1'/g' | sed 's/#ppp2/'$ppp2'/g' | \
  sed 's/#ppp3/'$ppp3'/g' | sed 's/#ppp4/'$ppp4'/g' | \
  sed 's/#ppp5/'$ppp5'/g' | sed 's/#ppp6/'$ppp6'/g' | \
  sed 's/yyy2max/'$y2max'/g' | sed 's/yyy2min/'$y2min'/g' | \
  sed 's/sss1/'$sss1'/g' | sed 's/sss2/'$sss2'/g' | sed 's/@/"/g' | \
  sed 's/%sss%/'$refseq'/g' | sed 's/%ggg%/'$title'/g' > plot3.R
R --no-save --slave < plot3.R >>& errorlog.txt
epstopdf sixframe.eps 

echo "set nseqs = $nseqs"           >  sixframe.params 
echo "set base1 = $base1"           >> sixframe.params 
echo "set base2 = $base2"           >> sixframe.params
echo "set lambdasum = $lambdasum"   >> sixframe.params
echo "set maxsumdiv = $maxsumdiv"   >> sixframe.params
echo "set length2 = $length2"       >> sixframe.params
echo "set orf1x1 = $orf1x1"         >> sixframe.params 
echo "set orf1x2 = $orf1x2"         >> sixframe.params
echo "set sss1 = $sss1"             >> sixframe.params 
echo "set sss2 = $sss2"             >> sixframe.params 
echo "set kkk4 = "'"'$kkk4'"'       >> sixframe.params 
echo "set kkk5 = "'"'$kkk5'"'       >> sixframe.params
echo "set ppp1 = "'"'$ppp1'"'       >> sixframe.params 
echo "set ppp2 = "'"'$ppp2'"'       >> sixframe.params
echo "set ppp3 = "'"'$ppp3'"'       >> sixframe.params 
echo "set ppp4 = "'"'$ppp4'"'       >> sixframe.params
echo "set ppp5 = "'"'$ppp5'"'       >> sixframe.params 
echo "set ppp6 = "'"'$ppp6'"'       >> sixframe.params

cp orfs.1.bkp orfs.1
cp orfs.2.bkp orfs.2
echo "...Done<br><br>"
chmod a+rw ./*

#Write rest of webpage.
set id9 = `echo $newurl | sed 's/.*\///'`
echo "<hr><b>Six-frame plots</b><br><br> Download "
echo "<a href=$newurl/sixframe.png>png</a>, "
echo "<a href=$newurl/sixframe.eps>eps</a>, "
echo "<a href=$newurl/sixframe.pdf>pdf</a>, "
echo "<a href=$formdir/plot.sixframe.html>description</a>, "
echo "<a href=$newurl/sixframe.plotdata.html>raw plot data</a>.<br>"
echo "<a href=$newurl/errorlog.txt>Error log</a> "
echo "(<a href=$formdir/error.html>description of common error messages</a>)."
echo "<br>"
if (! $download) then
  echo "<a href=redo_sixframe.pl?id=$id9&count=0>Redraw</a> "
  echo "plot with a zoomed-in nt range and/or grid lines "
else
  echo "You may redraw this plot with a zoomed-in nt range and/or grid lines "
  echo "with the 'redosixframe' script "
endif
echo "(<a href=$formdir/threshold3.html>note</a>).<br>"
echo "<a href=$formdir/frames.html>Frame notation</a>.<br><br>"
echo "Extended regions of positive signal may indicate potential new CDSs "
echo "(i.e. other than those identified in the null model), especially "
echo "where there is an absense of stop codons.  You can find the "
echo "coordinates of such regions and use the "
if (! $download) then
  echo "'Test input query CDSs' option on the "
  echo "<a href=mlogd.pl>MLOGD server</a> "
else
  echo "'domlogd' script "
endif
echo "to calculate more detailed "
echo "statistics.  Don't be surprised if the input 'Known CDS(s)' "
echo "themselves have a negative signal - these CDSs have already been "
echo "taken into account as part of the null model.  Note that local "
echo "misalignments may occasionally cause non-reference sequence stop "
echo "codons to be annotated in the wrong frame.  Be aware that a few "
echo "particular read-frame combinations may lead to false positive signals "
echo "(<a href=$formdir/falsepos.html>details</a>). "
echo "Note that the horizontal axis is in alignment coordinates, not "
echo "reference sequence coordinates (lookup reference sequence coordinates "
echo "<a href=$newurl/coords.txt>here</a>; "
echo "<a href=$formdir/cols.coords.html>key</a>).  The dashed line "
echo "as at zero.<br><br>"
echo "<img src="'"'"$newurl/sixframe.png"'"'" "
echo "alt="'"'"six-frame plot (png)"'"'"><hr>"

chmod a+rw ./*
echo Finished dosixframe `date` >> errorlog.txt