#!/bin/csh
#
#This software is Copyright (C) 2005 by Andrew Firth, University of Otago, 
#Dunedin, New Zealand. All rights reserved.
#
#Script to prepare sequences and run through MLRGD suite.
#Plots 'residual' conservation along an alignment once the protein-coding
#  constraints have been 'factored' out.  At least, that's the idea.
#
#Required software:
#  mlrgd
#  runmean
#  runclipmean
#  addgaps
#  minmax
#  ntadjust
#  EMBOSS: seqret, seqretsplit, infoseq
#  EMBASSY: ednaml, ednapars
#  code2aln
#  R
#
#Required files:
#  $prefix.seqs
#  *.fasta, *.gbk           [Sequence files in gbk or fasta format.]
#  aa.dat                   [Scoring matrix.]
#  aa2codon.dat             [Genetic code (same order as scoring matrix).]
#  nuc.dat                  [4 x 4 nucleotide mutation matrix (order UCAG).]
#  codon.dat                [Codon usage table.]
#  mlrgd.R                  [Plot template.]
#  *.fasta,orfs, *.gbk.orfs [ORF information.  Alternatively can extract ORFs 
#                             from *.gbk headers.]
#  *.fasta.features, *.gbk.features  [Known features (optional).]
#  $prefix.pairs            [Sequence pairs file (optional).]
#
#Usage: run_mlrgd prefix
#
#
#Notes:
#  Input files must have suffixes 'fasta' or 'gbk' as appropriate.  
#  Doesn't really matter if input sequences have '-' or '.' gaps.

#------------------------------------------------------------------------------
#Options

set pairs = 1    # 2 -> use dnapars, 1 -> use dnaml, 
                 # 0 or other -> use '$prefix.pairs'
set refpos = 0   # 0 -> use seperate ORF files for each sequence, 
                 # 1 -> use refseq ORF files for all sequences
set gbkpos = 1   # 0 -> get ORF files from 'sequence'.orfs files
                 # 1 -> get ORF files from 'sequence'.gbk files (for any
                 #   'sequence'.fasta files, the 'sequence'.orfs files, if any,
                 #   will be used)
set skip = 0     # 1 or other -> skip sequence files containing non-ACGTUacgtu;
                 #   exits if reference sequence contains non-ACGTUacgtu
                 # 0 -> pass flag to mlrgd so that non-ACGTUacgtu.- are read as
                 #   N (ambiguous nt) and ignored for most statistics
set window1 = 5  # n, where sliding window size is 2n+1 (#codons for 
                 #   N1,N2,N3 plots, #nt/codons for syn)
set window2 = 10 # n, where sliding window size is 2n+1 (#nt for all, nc)
set circular = 0 # 0 -> not circular genome, 1 or other -> circular genome, 
set clip1 = 0.00 # for clipped running mean; 'clip1 = 0.05' means upper 5%
                 # (i.e. the columns with low obs#muts relative to exp#muts) 
                 # in each window are clipped before calculating mean
set clip2 = 0.30 # for clipped running mean; 'clip2 = 0.05' means lower 5%
                 # (i.e. the columns with high obs#muts relative to exp#muts) 
                 # in each window are clipped before calculating mean

set tttmin = 0.0    # Lowest t value to use in MLRGD fitting
set tttmax = 10.0   # Highest t value to use in MLRGD fitting
set tVfit = 0.2     # Required fitting accuracy (total #muts in seq pair) for
                    #   t and V
set maxiter = 20    # Maximum number of fitting iterations for t and V
set Vmin = 0.01     # Lowest V value to use in MLRGD fitting (>= 0.01)
set Vmax = 10.0     # Highest V value to use in MLRGD fitting (<= 10)
set Vtype = 1       # 0 -> find seperate V for each sequence pair; 
                    # 1 or other -> find one V value for all sequence pairs
set Vfix = 0        # If > 0, use this V scaling for all sequence pairs;
                    #   defaults to 1 if <= 0; overridden if fitwhat = 3.

set fitwhat = 3  #0 or other -> fit to total # muts
                 #1 -> fit to # of neutral/synonymous muts, 
                 #2 -> fit to # of neutral/synonymous muts at 4-fold degenerate
                 #     sites
                 #3 -> as for 2, then adjust V to fit # of nonsyn muts
                 #Note: 1-3 are not recommended if lots of double-coding
set wholeseq = 1 #Nucleotide range to use: 0 = region, 1 or other = whole 
                 #  sequence. 
set range1 = 0   #Start nucleotide (if $wholeseq = 0; otherwise ignored); 
                 #  reference sequence coords
set range2 = 0   #End nucleotide (if $wholeseq = 0; otherwise ignored);
                 #  reference sequence coords

#------------------------------------------------------------------------------
#Check all required programmes and files are present.

@ argc = `echo $argv | awk '{print NF}'`
if ($argc != 1) then
  echo "Usage: run_mlrgd prefix"
  exit
endif

set prefix = $1
limit stacksize unlimited
setenv PATH $HOME/bin:${PATH}
echo `date` ": Starting running CDS-plotcon on alignment $prefix"

set test = 0
foreach i (runmean runclipmean mlrgd seqret seqretsplit ednaml ednapars infoseq code2aln addgaps minmax ntadjust R)
  if (! -X $i) then
    echo "Can't find software $i."
    set test = 1
  endif
end
if ($test) then
  exit
endif

set test = 0
foreach i (aa.dat aa2codon.dat nuc.dat codon.dat $prefix.seqs mlrgd.R)
  if (! -r $i) then
    echo "Can't find file $i."
    set test = 1
  endif
end
if ($test) then
  exit
endif

set test = 0
foreach i (`cat $prefix.seqs`)
  if (! -r $i) then
    echo "Can't find sequence $i."
    set test = 1
  endif
end
if ($test) then
  exit
endif

set test = 0
foreach i (temp temp1 temp2 temp3 aln.aln info.ral ORF.ps cluster.txt phylip.in phylip.out phylip.tree setup.mlrgd seqs.dat `ls seq.* orfs.??? tghyv.* temp.???` $prefix.aln $prefix.errorlog $prefix.fc.dat $prefix.fc.info $prefix.fc.log $prefix.fcm.eps $prefix.fcm.R $prefix.fc.plot $prefix.info.ral $prefix.nc.dat $prefix.nc.info $prefix.nc.log $prefix.ncm.eps $prefix.ncm.R $prefix.nc.plot $prefix.ORF.ps $prefix.run_mlrgd $prefix.tree)
  if (-r $i) then
    echo "Please remove file '$i' so that I can overwrite it."
    set test = 1
  endif
end
if ($test) then
  exit
endif

if (1 != $pairs && 2 != $pairs) then
  if (! -r $prefix.pairs) then
    echo "Can't find file $prefix.pairs."
    exit
  endif
endif

touch $prefix.errorlog
set refseq = `head -1 $prefix.seqs`

#------------------------------------------------------------------------------
#Check, tidy and align sequences and ORF files.

echo -n `date` ": Checking sequences..."

#Look for non-ACGTU characters.
if ($skip) then
  seqret $refseq temp1 -osformat fasta -auto
  set test = `tail +2 temp1 | grep "[^acgtuACGTU.-]" | wc -l`
  if ($test) then
    echo "Reference sequence contains non-ACGTU characters.  Aborting."
    exit
  else
    echo $refseq > temp
  endif
  foreach i (`tail +2 $prefix.seqs`)
    seqret $i temp1 -osformat fasta -auto
    set test = `tail +2 temp1 | grep "[^acgtuACGTU.-]" | wc -l`
    if ($test) then
      echo "$i contains non-ACGTU characters.  Omitting this sequence." >> \
        $prefix.errorlog
    else
      echo $i >> temp
    endif
  end
  mv temp $prefix.seqs
endif

#Look for and remove identical sequences.
touch temp
foreach i (`cat $prefix.seqs`)
  seqret $i temp1 -osformat fasta -auto
  degapseq temp1 temp2 -auto
  tail +2 temp2 | sed 'y/ACGTUuMRWVHDBSYKNX/acgtttmrwvhdbsyknx/' \
    > rfge8345dsf.$i
  echo rfge8345dsf.$i >> temp
end
rm -f temp[123]
@ count = 0
foreach i (`cat $prefix.seqs`)
  @ t2 = 1
  foreach j (`head -$count temp`)
    @ t1 = `diff rfge8345dsf.$i $j | wc -l | awk '{print $1}'`
    if ($t1 == 0) then
      @ t2 = 0
      set old = `echo $j | sed 's/rfge8345dsf\.//'`
      echo Sequence $i identical to $old. Omitting $i. >> $prefix.errorlog
      if (1 != $pairs && 2 != $pairs) then
        awk '{if ($1=="'"$i"'") {print "'"$old"'",$2} else {print $0}}' \
          $prefix.pairs | \
          awk '{if ($2=="'"$i"'") {print $1,"'"$old"'"} else {print $0}}' \
          > temp1
        mv temp1 $prefix.pairs
      endif
      break
    endif 
  end
  if ($t2) then
    echo $i >> temp3
  endif
  @ count += 1
end
mv temp3 $prefix.seqs
if (1 != $pairs && 2 != $pairs) then
  awk '{if ($1!=$2) print $0}' $prefix.pairs > temp
  mv temp $prefix.pairs
endif
rm -f rfge8345dsf.*

#If ($gbkpos), make ORF files for any sequences with .gbk extension.
if ($gbkpos) then 
  foreach i (`cat $prefix.seqs`)
    set extn = `echo $i | sed 's/.*\.//'`
    if ($extn == "gbk") then
      grep " CDS " $i | awk '{print $2}' | sed 's/[<>]//g' > $i.orfs
    endif
  end
endif

#Check required ORF files present.
if ($refpos) then   # use refseq ORF files for all 
  if (! -r $refseq.orfs) then
    echo "Can't find file $refseq.orfs."
    exit
  endif
else                # use ORF files for each sequences
  set test = 0
  foreach i (`cat $prefix.seqs`)
    if (! -r $i.orfs) then
      echo "Can't find file $i.orfs."
      set test = 1
    endif
  end
  if ($test) then
    exit
  endif
endif

#Rename sequences
#  (Note seqret changes '.'s to '-'s, but doesn't strip gaps. Preserves 
#  non-ACGTU if they are valid ambiguous nt codes, but non valid codes, e.g. O,
#  are replaced with '-'.)
#Change all sequences to fasta format.  Then for any sequences with *.orfs
#  files (even if (refpos)), put the ORF info into a header and make a .gbk
#  file readable by code2aln.
touch seqs.dat
@ count = 0
foreach i (`cat $prefix.seqs`)
  @ count += 1
  set id = `echo $count | awk '{printf "%03i\n",$1}'`
  if (-r $i.orfs) then   # write .gbk format file
    seqret $i temp1 -osformat fasta -auto
    degapseq temp1 temp2 -auto
    set length = `infoseq temp2 -auto | tail +2 | awk '{print $5}'`
    echo LOCUS seq.$id $length > seq.$id.gbk
    awk '{printf "     CDS             %s\n",$1}' $i.orfs >> seq.$id.gbk
    echo ORIGIN >> seq.$id.gbk
    tail +2 temp2 >> seq.$id.gbk
    echo // >> seq.$id.gbk
    echo seq.$id.gbk >> seqs.dat
  else                   # write .fasta format file
    seqret $i temp1 -osformat fasta -auto
    degapseq temp1 temp2 -auto
    sed 's/>.*/>seq.'$id'/' temp2 > seq.$id.fasta
    echo seq.$id.fasta >> seqs.dat
  endif
end
set nseqs = $count

if ($refpos) then
  sed 's/[^0-9]/ /g' $refseq.orfs | awk '{print NF,$0}' > orfs.001
else
  @ count = 0
  foreach i (`cat $prefix.seqs`)
    @ count += 1
    set id = `echo $count | awk '{printf "%03i\n",$1}'`
    sed 's/[^0-9]/ /g' $i.orfs | awk '{print NF,$0}' > orfs.$id
  end
endif

echo "Done"

#Align sequences with code2aln.
#  code2aln and seqretsplit seem to handle abiguous nt codes.
echo -n `date` ": Aligning sequences..."
code2aln `cat seqs.dat` >>& $prefix.errorlog || exit 1
mv aln.aln $prefix.aln
seqretsplit $prefix.aln temp -auto -osextension2 aln
rm -f temp
awk -F. '{printf "%s.%s.aln\n",$1,$2}' seqs.dat > temp
mv temp seqs.dat
rm -f seq.???.fasta seq.???.gbk
mv info.ral $prefix.info.ral
mv ORF.ps $prefix.ORF.ps
echo "Done"

#------------------------------------------------------------------------------
#Make phylo tree and sequence pair list.
#
#Use sequence pairs tracing round outside of tree (this covers every branch 
#  precisely twice).
#ednaml, ednapars seem to handle abiguous nt codes.

echo -n `date` ": Making phylogenetic tree..."
seqret -osformat phylip $prefix.aln phylip.in -auto
if (1 == $pairs) then
  if ($nseqs <= 3) then
    sed 's/seq\.//' seqs.dat | sed 's/\.aln//' > temp
  else
    ednaml phylip.in -outfile phylip.out -treefile phylip.tree -ttratio 3 \
      -random no -trout yes -auto -besttree yes -randtimes 1 \
      >>& $prefix.errorlog || exit 1
    sed 's/:[^,()]*[,()]/ /g' phylip.tree | sed 's/(//g' | sed 's/ /\n/g' | \
      grep seq. | head -$nseqs > temp
  endif
  tail +2 temp > temp1
  head -1 temp >> temp1
  paste temp temp1 | sed 's/seq\.//g' | awk '{print $1,$2}' > $prefix.pairs
  rm -f temp temp1
else if (2 == $pairs) then
  if ($nseqs <= 3) then
    sed 's/seq\.//' seqs.dat | sed 's/\.aln//' > temp
  else
    ednapars phylip.in -outfile phylip.out -treefile phylip.tree -random no \
      -trout yes -auto -besttree yes -randtimes 1 \
      >>& $prefix.errorlog || exit 1
    sed 's/[,()]/ /g' phylip.tree | sed 's/ /\n/g' | grep seq. | head -$nseqs \
      > temp
  endif
  tail +2 temp > temp1
  head -1 temp >> temp1
  paste temp temp1 | sed 's/seq\.//g' | awk '{print $1,$2}' > $prefix.pairs
  rm -f temp temp1
else
  if (! -r $prefix.pairs) then
    echo "Can't find file $prefix.pairs."
    exit
  endif
  @ count = 0
  foreach i (`cat $prefix.seqs`)
    @ count += 1
    set id = `echo $count | awk '{printf "%03i\n",$1}'`
    mv $prefix.pairs temp
    cat temp | \
      awk '{if ($1=="'"$i"'") {print "'"$id"'",$2} else {print $1,$2}}' | \
      awk '{if ($2=="'"$i"'") {print $1,"'"$id"'"} else {print $1,$2}}' \
      > $prefix.pairs
  end
  rm -f temp
endif
awk '{print $2,$1}' $prefix.pairs >> $prefix.pairs
if (-r phylip.tree) then
  mv phylip.tree $prefix.tree
endif
echo "Done"

#------------------------------------------------------------------------------
#Run MLRGD

echo -n `date` ": Running MLRGD..."

echo $prefix.pairs "   " > setup.mlrgd
echo $tttmin $tttmax $tVfit $maxiter "   #t fitting params" >> setup.mlrgd
echo $fitwhat "   #fitwhat" >> setup.mlrgd
echo 0 "   #model" >> setup.mlrgd                 #non-coding model
echo $refpos "   #refpos" >> setup.mlrgd
echo $circular "   #circular" >> setup.mlrgd
echo $wholeseq $range1 $range2 "   #nt range params" >> setup.mlrgd
echo $Vfix $Vtype $Vmin $Vmax "   #V fitting params" >> setup.mlrgd
echo $skip "   #ambiguous nt" >> setup.mlrgd
mlrgd setup.mlrgd >>& $prefix.errorlog || exit
mv mlrgd_out.dat    $prefix.nc.dat
mv mlrgd_out.log    $prefix.nc.log
mv mlrgd_out.plot   $prefix.nc.plot
mv mlrgd_out.info   $prefix.nc.info

echo $prefix.pairs "   " > setup.mlrgd
echo $tttmin $tttmax $tVfit $maxiter "   #t fitting params" >> setup.mlrgd
echo $fitwhat "   #fitwhat" >> setup.mlrgd
echo 1 "   #model" >> setup.mlrgd                 #full-coding model
echo $refpos "   #refpos" >> setup.mlrgd
echo $circular "   #circular" >> setup.mlrgd
echo $wholeseq $range1 $range2 "   #nt range params" >> setup.mlrgd
echo $Vfix $Vtype $Vmin $Vmax "   #V fitting params" >> setup.mlrgd
echo $skip "   #ambiguous nt" >> setup.mlrgd
mlrgd setup.mlrgd >>& $prefix.errorlog || exit
mv mlrgd_out.dat    $prefix.fc.dat
mv mlrgd_out.log    $prefix.fc.log
mv mlrgd_out.plot   $prefix.fc.plot
mv mlrgd_out.info   $prefix.fc.info

echo "Done"

#*.dat:
#1  best fit t (evolutionary time)
#2  Sum[log(prob(seq1_nt -> seq2_nt))] per nt
#3  number of nt used in comparison
#4  number of nt discarded due to being gaps in one or both sequences
#5  number of nt discarded due to being 'zero-probability' transitions
#   according to the model aa.dat, codon.dat (e.g. stop <-> non-stop)
#6  number of point mutations (out of the nt used)
#7  number of neutral or synonymous point mutations (includes non-coding nt)
#8  flag = 1 if problems with t fitting (outside given range or didn't converge
#   in allowed number of iterations).
#9  number of 4-fold degenerate sites with neutral or null mutation
#10 number of 4-fold degenerate sites with neutral non-null mutation
#11 best fit V (scaling between nonsynonymous and synonymous substitution 
#   acceptabilities)
#12 flag = 1 if problems with V fitting (outside given range or didn't converge
#   in allowed number of iterations).
#Mutation rate per nt is $6/$3
#$3+$4+$5 = length of sequence in alignment coords
#
#*.log:
#1  pair number 
#2  base number (alignment coords) 
#3  log(prob(seq1_nt -> seq2_nt))
#   (9 = gap in both seq, 8 = gap in seq1, 7 = gap in seq2, 6 = zero-prob,
#   5 = gap only in reference sequence)
#4  exp#muts           (0 if gap in either seq1 or seq2
#5  exp # neutral muts (0 if gap in either seq1 or seq2
#6  obs#muts           (0 if gap in either seq1 or seq2)
#7  obs # neutral muts (0 if gap in either seq1 or seq2)
#
#*.plot:
#1  base number (alignment coords) 
#2  exp#muts summed over all pairs
#3  obs#muts summed over all pairs
#4  number of pairs in which nt is N0 (using seq1 ORF file)
#5  number of pairs in which nt is N1 (using seq1 ORF file)
#6  number of pairs in which nt is N2 (using seq1 ORF file)
#7  number of pairs in which nt is N3 (using seq1 ORF file)
#8  number of pairs in which nt is gap (in either seq1 or seq2) or a 
#   'zero-probability' transition
#9  stddev estimated from exp#muts
#10 lambda = mean number (over sequence) of muts per nt, summed over pairs 
#   without gaps at that nt position
#11 exp#neutralmuts summed over all pairs
#12 obs#neutralmuts summed over all pairs
#13 summed exp#muts at 4-fold degen neutral/syn/null sites
#14 summed obs#muts at 4-fold degen neutral/syn/null sites
#15 summed lambda4 for 4-fold degen neutral/syn/null sites
#16 summed # of 4-fold degen neutral/syn/null sites
#$4+$5+$6+$7+$8 = total number of pairs
#Note that $2,$3,$10,$11,$12,$13,$14,$15 should be multiplied by 0.5 since we 
#  do forward and backward pairs, and multiplied by 0.5 again since we cross 
#  each branch of the tree with two pairwise comparisons. $9 should be
#  multiplied by sqrt(0.25) = 0.5.

#------------------------------------------------------------------------------
#Plots

echo -n `date` ": Preparing plot data..."

set npairs = `wc $prefix.pairs | awk '{print $1}'`
set np2 = `wc $prefix.pairs | awk '{print $1*0.5}'`

#Divide output mlrgd_out.plot files into N0,N1,N2,N3, etc scaling by the
#  summed lambda values (so regions with lots of gaps in the alignment are at
#  the same level as regions without gaps). 
#
#($2-$3)/$10 = (exp#muts - obs#muts)/(summed lambda).  The factors of *0.25
#  in $2,$3,$10 cancel out.
#2.*$9/$10 = 0.5*stddev/0.25*(summed lambda).  The factor of 1/$10 is a 
#  simple scaling.  
foreach name (nc fc)
  set mname = `echo $name | awk '{printf "%sm\n",$1}'`
  set cname = `echo $name | awk '{printf "%sc\n",$1}'`
  awk '{if ((0.+$4)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \
    $prefix.$name.plot | \
    awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \
    sed 's/nan/0/g' > tghyv.$name.n0
  awk '{if ((0.+$5)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \
    $prefix.$name.plot | \
    awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \
    sed 's/nan/0/g' > tghyv.$name.n1
  awk '{if ((0.+$6)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \
    $prefix.$name.plot | \
    awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \
    sed 's/nan/0/g' > tghyv.$name.n2
  awk '{if ((0.+$7)>(0.5*"'"$npairs"'")) print $1,$2-$3,2.*$9,$10}' \
    $prefix.$name.plot | \
    awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \
    sed 's/nan/0/g' > tghyv.$name.n3
  awk '{if (($4+$5+$6+$7)>0.) print $1,$2-$3,2.*$9,$10}' \
    $prefix.$name.plot | \
    awk '{if ($4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \
    sed 's/nan/0/g' > tghyv.$name.all
  awk '{if ((0.+$16)>=(0.5*"'"$npairs"'")) print $1,$13-$14,2.*$9,$15}' \
    $prefix.$name.plot | \
    awk '{if (0.+$4>0) {print $1,$2/$4,$3/$4} else {print $1,0,0}}' | \
    sed 's/nan/0/g' > tghyv.$name.syn
  awk '{print $1,0.25*$10}' $prefix.$name.plot | sed 's/nan/0/g' \
    > tghyv.$name.lambda
  runmean tghyv.$name.n0  $window2 $circular > tghyv.$mname.n0  || exit
  runmean tghyv.$name.n1  $window1 $circular > tghyv.$mname.n1  || exit
  runmean tghyv.$name.n2  $window1 $circular > tghyv.$mname.n2  || exit
  runmean tghyv.$name.n3  $window1 $circular > tghyv.$mname.n3  || exit
  runmean tghyv.$name.all $window2 $circular > tghyv.$mname.all || exit
  runmean tghyv.$name.syn $window1 $circular > tghyv.$mname.syn || exit
  runclipmean tghyv.$name.n0  $window2 $clip1 $clip2 $circular \
    > tghyv.$cname.n0  || exit
  runclipmean tghyv.$name.n1  $window1 $clip1 $clip2 $circular \
    > tghyv.$cname.n1  || exit
  runclipmean tghyv.$name.n2  $window1 $clip1 $clip2 $circular \
    > tghyv.$cname.n2  || exit
  runclipmean tghyv.$name.n3  $window1 $clip1 $clip2 $circular \
    > tghyv.$cname.n3  || exit
  runclipmean tghyv.$name.all $window2 $clip1 $clip2 $circular \
    > tghyv.$cname.all || exit
  runclipmean tghyv.$name.syn $window1 $clip1 $clip2 $circular \
    > tghyv.$cname.syn || exit
end

#Add gaps back in (note that this comes after taking the running mean).
foreach i (nc ncm ncc fc fcm fcc)
  foreach k (n0 n1 n2 n3 all syn) 
    mv tghyv.$i.$k tghyv.temp
    addgaps tghyv.temp > tghyv.$i.$k || exit
  end
end

#If file is empty, write '0 0 0' to it so that R doesn't barf.
foreach i (ncm ncc fcm fcc)
  foreach k (n0 n1 n2 n3 all syn) 
    set t = `wc -l tghyv.$i.$k | awk '{print $1}'`
    if (0 == $t) then
      echo 0 0 0 >> tghyv.$i.$k
    endif
  end
end

#Reference sequence orfs.  Change from refseq to alignment coords.
set w = `wc -l orfs.001 | awk '{print $1}'`
if ($w > 0) then
  set norfs = `wc -l orfs.001 | awk '{print $1}'`
  cut --delimiter=" " -f 2- orfs.001 | sed 's/[^0-9]/\n/g' | grep "[0-9]" \
    > temp
  rm -f temp1; touch temp1
  foreach i (`cat temp`)
    echo `ntadjust seq.001.aln $i` >> temp1
  end
  set orfx1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'`
  set orfx2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'`
  awk '{print $1/2}' orfs.001 > temp1
  rm -f temp; touch temp
  @ b = 0
  foreach i (`cat temp1`)
    @ b ++
    @ a = 0
    while ($a < $i)
      echo $b $norfs | awk '{print ($1-0.5)/$2}' >> temp
      @ a ++
    end
  end
  set orfy = `awk '{printf "%s,",$0}' temp | sed 's/,$//g'`
else
  set orfx1 = "0"
  set orfx2 = "0"
  set orfy =  "0"
endif

#Known features.  Change from refseq to alignment coords.
if (-r $refseq.features) then
  set w = `wc -l $refseq.features | awk '{print $1}'`
  if ($w > 0) then
    sed 's/[^0-9]/ /g' $refseq.features | awk '{print NF,$0}' > temp2
    set nkfs = `wc -l temp2 | awk '{print $1}'`
    cut --delimiter=" " -f 2- temp2 | sed 's/[^0-9]/\n/g' | grep "[0-9]" > temp
    rm -f temp1; touch temp1
    foreach i (`cat temp`)
      echo `ntadjust seq.001.aln $i` >> temp1
    end
    set kfx1 = `awk '{if (NR%2==1) printf "%s,",$0}' temp1 | sed 's/,$//g'`
    set kfx2 = `awk '{if (NR%2==0) printf "%s,",$0}' temp1 | sed 's/,$//g'`
    awk '{print $1/2}' temp2 > temp1
    rm -f temp; touch temp
    @ b = 0
    foreach i (`cat temp1`)
      @ b ++
      @ a = 0
      while ($a < $i)
        echo $b $nkfs | awk '{print ($1-0.5)/$2}' >> temp
        @ a ++
      end
    end
    set kfy = `awk '{printf "%s,",$0}' temp | sed 's/,$//g'`
  else
    set kfx1 = "0"
    set kfx2 = "0"
    set kfy =  "0"
  endif
else
  set kfx1 = "0"
  set kfx2 = "0"
  set kfy =  "0"
endif

echo "Done"

#------------------------------------------------------------------------------
#Plot N0, N1, N2, N3, all and syn sliding-mean tracks for nc and fc.

echo -n `date` ": Making plots..."

#Note *0.25 correction in $nmutsum and $lambdasum corrects for doing both
#  forward and backward pairwise comparisons, and for covering every branch
#  of the tree with two pairwise comparisons.

set w1 = `echo $window1 | awk '{print 1+2*$1}'`
set w2 = `echo $window2 | awk '{print 1+2*$1}'`

#Conservation score plots.
foreach i (nc fc)
  set length = `head -2 $prefix.$i.info | tail -1 | awk '{print $2}'`
  set base1 = `head -2 $prefix.$i.info | tail -1 | awk '{print $3}'`
  set base2 = `head -2 $prefix.$i.info | tail -1 | awk '{print $4}'`
  set nmutsum = `head -3 $prefix.$i.info | tail -1 | awk '{print $2*0.25}'`
  set lambdasum = `head -4 $prefix.$i.info | tail -1 | awk '{printf "%.2f\n",$2*0.25}'`
  set lambda4sum = `tail -1 $prefix.$i.info | awk '{printf "%.2f\n",$2*0.25}'`
  set mname = `echo $i | awk '{printf "%sm\n",$1}'`
  set cname = `echo $i | awk '{printf "%sc\n",$1}'`
  foreach j ($mname $cname)
    cat tghyv.$j.all tghyv.$j.syn tghyv.$j.n[0123] | \
      awk '{printf "%14.6f\n", $2}' > temp1
    set ymin = `minmax temp1 | awk '{print $1}'`
    set ymax = `minmax temp1 | awk '{print $2}'`
    if ("fc" == $i) then
      set aaa = full
    else
      set aaa = non
    endif
    if ($mname == $j) then
      set bbb = ""
    else
      set bbb = "$clip1\/$clip2-clipped"
    endif
    cat mlrgd.R | sed 's/mmm/'$j'/g' | \
      sed 's/yyymax/'$ymax'/g' | sed 's/yyymin/'$ymin'/g' | \
      sed 's/lll/'$length'/g' | \
      sed 's/ddd1/'$base1'/g' | sed 's/ddd2/'$base2'/g' | \
      sed 's/qqq/'$np2'/g' | sed 's/nnn/'$nmutsum'/g' | \
      sed 's/rrr/'$lambdasum'/g' | sed 's/ccc/'$lambda4sum'/g' | \
      sed 's/aaa/'$aaa'-coding model/g' | sed 's/bbb/'$bbb' running mean/g' | \
      sed 's/  running mean/ running mean/' | sed 's/ppp/'$i'/' | \
      sed 's/www1/'$w1'/g' | sed 's/www2/'$w2'/g' | \
      sed 's/orfx1/'$orfx1'/g' | sed 's/orfx2/'$orfx2'/g' | \
      sed 's/orfy/'$orfy'/g' | sed 's/kfy/'$kfy'/g' | \
      sed 's/kfx1/'$kfx1'/g' | sed 's/kfx2/'$kfx2'/g' | \
      sed 's/xxx/'$prefix'/g' > $prefix.$j.R
    R --no-save --slave < $prefix.$j.R
  end
end

echo "Done"

#------------------------------------------------------------------------------
#Tidy up.

cp run_mlrgd $prefix.run_mlrgd
mkdir TIDYUP
foreach i (`ls tghyv.* seq.???.aln orfs.??? seqs.dat aa2codon.dat aa.dat codon.dat nuc.dat mlrgd.R run_mlrgd`)
  mv $i TIDYUP
end
rm -f cluster.txt temp temp[12] phylip.in phylip.out setup.mlrgd

awk '{printf "seq.%s seq.%s\n",$1,$2}' $prefix.pairs > temp1
mv temp1 $prefix.pairs
@ count = 0
foreach i (`cat $prefix.seqs`)
  set name = `echo $i | sed 's/\.fasta$//' | sed 's/\.gbk$//'`
  @ count += 1
  set id = `echo $count | awk '{printf "%03i\n",$1}'`
  mv $prefix.pairs temp1
  mv $prefix.aln temp2
  sed 's/seq\.'$id'/'$name'/g' temp1 > $prefix.pairs
  sed 's/seq\.'$id'/'$name'/g' temp2 > $prefix.aln
end
rm -f temp[12]

echo `date` ": Finished running CDS-plotcon on alignment $prefix"