#!/bin/csh

# Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin, 
# New Zealand, aef(at)sanger.otago.ac.nz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.

#Checks and prepares user input before running domlogd, domcsims, doallorfs
#  or dosixframe scripts.

#Remove the stack size limit for this shell and the programs it starts.
limit stacksize unlimited

#Expect exactly two whitespace-separated arguments: working directory, mode.
set nargs = `echo $argv | awk '{print NF}'`
if ($nargs != 2) then
  echo "Usage: prepmlogd newdir mode<br>"
  exit 1
endif

#Work inside the job directory and load the run parameters stored there.
#NOTE(review): mlogd.param is sourced as-is here; it is only filtered down
#  to its "set" lines further on in this script - confirm it is trusted.
set newdir = $1
cd $newdir
source mlogd.param
set mode = $2

#Append the directory holding the helper programs to PATH; which directory
#  is used depends on the download flag.
if ($download) then
  setenv PATH ${PATH}:${HOME}/bin
else
  setenv PATH ${PATH}:/home/aef/bin
endif

#Bring the lookup tables from the shared scripts directory into the job dir.
foreach table (aa2codon.dat aa.dat codon.dat nuc.dat)
  cp ../../SCRIPTS/${table} .
end

#When the download flag is non-zero, make sure an email file exists.
if ($download) touch email

#Upper bounds enforced on the user input further down.
set maxlength = 35000
set maxnseqs = 50
set maxpairs = 50

#Keep untouched copies (*.org) of everything the user supplied.
foreach stem (allpairs allseqs orfs.1 orfs.2)
  cp ${stem}.txt ${stem}.org
end
cp email email.org
cp mlogd.param mlogd.param.org

echo -n "Preparing input data...<br>"
echo Started `date` > errorlog.txt

#------------------------------------------------------------------------------
#Verify that every required program and file is available.

#Report every missing program before giving up, rather than only the first.
set missing = 0
foreach prog (seqret infoseq noreturn getorf degapseq calcprob mcsim minmax mlogd ntadjust runmean2 R)
  if (-X $prog) continue
  echo "Error: Can't find software '$prog'.<br>"
  set missing = 1
end
if ($missing) exit 1

#Same again for the input and data files, which must be readable.
foreach needed (allseqs.txt allpairs.txt orfs.1.txt orfs.2.txt email mlogd.param aa2codon.dat aa.dat codon.dat nuc.dat)
  if (-r $needed) continue
  echo "Error: Can't find file '$needed'.<br>"
  set missing = 1
end
if ($missing) exit 1

#------------------------------------------------------------------------------
#Tidy the user-supplied input files.

#Pass each file through noreturn and put the result back in place.
foreach f (allseqs.txt allpairs.txt orfs.1.txt orfs.2.txt email mlogd.param)
  noreturn $f temp -auto
  mv temp $f
end

#Reduce mlogd.param to the first four fields of the lines containing "set".
awk '{print $1,$2,$3,$4}' mlogd.param | grep "set" > temp
mv temp mlogd.param

#For both CDS files keep the first field of each line, provided that field
#  contains a digit.
foreach n (1 2)
  awk '{print $1}' orfs.${n}.txt | grep "[0-9]" > temp
  mv temp orfs.${n}.txt
end

#An empty known-CDS list is allowed but worth a warning.
set nknown = `wc -l < orfs.1.txt`
if ($nknown == 0) echo "Warning: Known CDSs file is empty.  Therefore null model is non-coding.<br>"

#Query CDSs are required in mode 0 and ignored (with a warning) otherwise.
set nquery = `wc -l < orfs.2.txt`
if ($mode == "0") then
  if ($nquery == 0) then
    echo "Error: Query CDSs file is empty.  Therefore there is no alternative model to test.<br>"
    exit 1
  endif
else if ($nquery != 0) then
  echo "Warning: You entered 'Query CDSs'; however these will not be used because you didn't select 'Test input query CDSs' for the operating mode.<br>"
endif

#Keep only the first word of the first line of the email file.
awk 'NR==1 {print $1; exit}' email > temp
mv temp email

#Strip leading blanks from every line of the sequences file, then drop any
#  leading empty lines so the file starts at its first non-empty line.
sed 's/^[[:blank:]]*//' allseqs.txt > temp
set w4 = `grep -n "." temp | head -n 1 | awk -F: '{print $1}'`
#w4 is empty when the file has no non-empty line.  Compare it as a string so
#  that case reaches the error message instead of an empty expression.
if ("$w4" != "") then
  #Use the POSIX form of tail; the bare +N form is obsolete.
  tail -n +$w4 temp > allseqs.txt
else
  echo "Error: Sequences file is empty.<br>"
  exit 1
endif

#Every line of the pairs file must hold exactly two fields.  Report each
#  offending line first, then stop if there was at least one.
awk '{if (NF!=2) printf "Error: Pairs file line %s has %s fields, instead of two.<br>\n",NR,NF}' allpairs.txt
set w1 = `awk '{if (NF!=2) print 0}' allpairs.txt | wc -l | awk '{print $1}'`
if ($w1 != 0) then
  exit 1
endif

#Keep the lines that contain at least one name character (letter, digit,
#  underscore, dot, plus or hyphen).  The hyphen is placed last so that it is
#  literal; a backslash inside a bracket expression is an ordinary character
#  and does not escape anything.
awk '{print $1,$2}' allpairs.txt | grep "[a-zA-Z0-9_.+-]" > temp
mv temp allpairs.txt

#Enforce the limits on the number of pairs.
@ w2 = `wc -l allpairs.txt | awk '{print $1}'`
if ($w2 == 0) then
  echo "Error: Pairs file is empty.<br>"
  exit 1
endif
if ($w2 > $maxpairs) then
  echo "Error: Maximum number of pairs in pairs file is $maxpairs.<br>"
  echo "You entered $w2.<br>"
  exit 1
endif


#------------------------------------------------------------------------------
#Make sequences files.

#Note seqret changes '.'s to '-'s, but doesn't strip gaps.  Preserves 
#  non-ACGTU if they are valid ambiguous nt codes, but non valid codes,
#  e.g. O, are replaced with '-'.
seqret allseqs.txt temp -osformat fasta -auto || exit 1
mv temp allseqs.txt

#my.seqs holds the sequence names: the first word of every fasta header.
grep ">" allseqs.txt | sed 's/^>//' | awk '{print $1}' > my.seqs
@ nseqs = `wc -l my.seqs | awk '{print $1}'`
echo "You entered $nseqs sequences.<br>"
echo "Reference sequence = $refseq.<br>"
if ($nseqs == 0) then
  echo "Error: No input sequences.<br>"
  exit 1
endif
if ($nseqs == 1) then
  echo "Error: Need at least two input sequences.<br>"
  exit 1
endif
if ($nseqs > $maxnseqs) then
  echo "Error: Maximum number of input sequences is $maxnseqs.  You entered $nseqs.<br>"
  exit 1
endif

#Check reference sequence name is in the sequences file.
#  s+0 makes awk print 0 rather than an empty string when no name matches;
#  an empty result would abort the @ assignment before the message below.
@ w = `awk '{if ($1=="'"$refseq"'") s+=1}END{print s+0}' my.seqs`
if ($w == 0) then
  echo "Error: Reference sequence name '$refseq' is not in sequences file.<br>"
  exit 1
endif

#Check no identical names in sequences file.
#  Each name from the second onwards is compared with all names before it;
#  count is the number of preceding names.
@ count = 1
foreach i (`tail -n +2 my.seqs`)
  foreach j (`head -n $count my.seqs`)
    if ("$i" == "$j") then
      echo "Error: Sequence name '$i' is used for more than one sequence in sequences file.<br>"
      echo "Sequence names must be unique.<br>"
      exit 1
    endif
  end
  @ count += 1
end

#Check all names in pairs file are in sequences file.
foreach i (`cat allpairs.txt`)
  #s+0 makes awk print 0 rather than an empty string when no name matches;
  #  an empty result would abort the @ assignment before the message below.
  @ w = `awk '{if ($1=="'"$i"'") s+=1}END{print s+0}' my.seqs`
  if ($w == 0) then
    echo "Error: Sequence '$i' in pairs file is not in sequences file.<br>"
    exit 1
  endif
end

#Move refseq to head of sequences file.
echo $refseq > temp1
awk '{if ($1!="'"$refseq"'") print $1}' my.seqs >> temp1
mv temp1 my.seqs

#Make fasta files, rename sequences and make key for old <-> new names.
#  temp1: line number and name of every header line in allseqs.txt.
#  temp2: last line of every record (line before the next header, or the
#         last line of the file for the final record).
#  temp3: one start:nlines:name entry per sequence, reference sequence first.
grep -n ">" allseqs.txt | sed 's/>/ /' | sed 's/:/ /' | awk '{print $1,$2}' \
  > temp1
tail -n +2 temp1 | awk '{print $1-1}' > temp2
wc -l allseqs.txt | awk '{print $1}' >> temp2
paste temp1 temp2 > temp4 
awk '{if ($2=="'"$refseq"'") printf "%s:%s:%s\n",$1,1+$3-$1,$2}' temp4 > temp3
awk '{if ($2!="'"$refseq"'") printf "%s:%s:%s\n",$1,1+$3-$1,$2}' temp4 >> temp3
rm -f temp4
touch seqs.key
@ count = 0
foreach i (`cat temp3`)
  @ count += 1
  set id = `echo $count | awk '{printf "%03i\n",$1}'`
  set l1 = `echo $i | awk -F: '{print $1}'`
  set l2 = `echo $i | awk -F: '{print $2}'`
  #Take everything after the second colon, so that a name which itself
  #  contains a colon is kept whole and still matches my.seqs and refseq.
  set name = `echo $i | cut -d: -f3-`
  #Cut the record out of allseqs.txt and give it the new header seq.NNN.
  tail -n +$l1 allseqs.txt | head -n $l2 | sed 's/>.*/>seq.'$id'/' > temp4
  seqret temp4 seq.$id.fasta -auto -osformat fasta
  echo $name seq.$id >> seqs.key
  @ w = `infoseq seq.$id.fasta -auto -only -length | tail -n 1`
  if ($w > $maxlength) then
    echo "Error: sequence '$name' exceeds maximum length (including alignment gaps) of $maxlength nt.<br>"
    exit 1
  endif
  rm -f temp4
end
rm -f temp[123]
#From here on my.seqs lists the new names (seq.NNN).
awk '{print $2}' seqs.key > my.seqs

#Check for and remove identical sequences.
#  For each sequence a degapped copy without its header line is written to
#  rfge8345dsf.<id>, with the listed nucleotide codes lower-cased and U/u
#  mapped to t.  The file temp lists these copies in my.seqs order.
rm -f temp; touch temp
foreach i (`cat my.seqs`)
  degapseq $i.fasta temp1 -auto
  #tail -n +2 drops the header line (the bare +2 form is obsolete).
  tail -n +2 temp1 | sed 'y/ACGTUuMRWVHDBSYKNX/acgtttmrwvhdbsyknx/' \
    > rfge8345dsf.$i
  echo rfge8345dsf.$i >> temp
end
rm -f temp[12]
rm -f my2.seqs
#my2.seqs collects the sequences that are kept; the first is always kept.
#  Every later sequence is compared with all sequences before it (count of
#  them), and on the first exact match it is dropped and its original name
#  is replaced by the matching name in allpairs.txt.
@ count = 1
head -n 1 my.seqs > my2.seqs
foreach i (`tail -n +2 my.seqs`)
  @ t2 = 1
  foreach j (`head -n $count temp`)
    @ t1 = `diff rfge8345dsf.$i $j | wc -l | awk '{print $1}'`
    if ($t1 == 0) then
      @ t2 = 0
      set old = `echo $j | sed 's/rfge8345dsf\.//'`
      #i2 and old2 are the original (user) names looked up in seqs.key.
      set i2 = `awk '{if ($2=="'"$i"'") print $1}' seqs.key`
      set old2 = `awk '{if ($2=="'"$old"'") print $1}' seqs.key`
      echo "Sequence '$i2' identical to '$old2'. Omitting '$i2'.<br>"
      awk '{if ($1=="'"$i2"'") {print "'"$old2"'",$2} else {print $0}}' \
        allpairs.txt | \
        awk '{if ($2=="'"$i2"'") {print $1,"'"$old2"'"} else {print $0}}' \
        > temp1
      mv temp1 allpairs.txt
      break
    endif 
  end
  if ($t2) then
    echo $i >> my2.seqs
  endif
  @ count += 1
end
@ w = `wc -l my2.seqs | awk '{print $1}'`
if ($w <= 1) then
  echo "Error: After removing duplicated identical sequences, only $w sequence left.<br>Need at least two sequences to proceed."
  exit 1
endif
#Pairs that now name the same sequence twice are dropped.
awk '{if ($1!=$2) print $0}' allpairs.txt > temp
mv temp allpairs.txt
@ w = `wc -l allpairs.txt | awk '{print $1}'`
if ($w == 0) then
  echo "Error: After removing duplicated identical sequences, user-input pairs file is empty."
  exit 1
endif
rm -f rfge8345dsf.*

#Renumber sequences if any removed (added 22/02/06).
#  my.seqs is the list before duplicate removal, my2.seqs the list after.
set nbefore = `wc -l < my.seqs`
set nafter = `wc -l < my2.seqs`
if ($nbefore != $nafter) then
  #Names that diff reports only on the my.seqs side were dropped: delete
  #  their fasta files and their seqs.key entries.
  foreach gone (`diff my.seqs my2.seqs | awk '/</ {print $2}'`)
    rm -f $gone.fasta
    awk '$2 != "'"$gone"'"' seqs.key > temp1
    mv temp1 seqs.key
  end
  #Walk the kept names in order and close the gaps in the numbering, so
  #  that the name on line N becomes seq.NNN.
  @ pos = 1
  while ($pos <= $nafter)
    set new = `printf "seq.%03d" $pos`
    set old = `sed -n ${pos}p my2.seqs`
    if ($old != $new) then
      mv $old.fasta $new.fasta
      sed 's/'$old'/'$new'/' my2.seqs > temp1
      mv temp1 my2.seqs
      awk '{if ($2=="'"$old"'") {print $1,"'"$new"'"} else {print $0}}' \
        seqs.key > temp1
      mv temp1 seqs.key
    endif
    @ pos++
  end
endif
cp my2.seqs my.seqs

#------------------------------------------------------------------------------

#Make the seqs.dat (with new names and refseq first).
#  refseq2 is the new name (seq.NNN) that seqs.key gives for the reference.
set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key`
echo $refseq2.fasta > seqs.dat
echo $refseq2.fasta > seqs2.dat
awk '{if ($1!="'"$refseq2"'") printf "%s.fasta\n",$1}' my.seqs >> seqs.dat
awk '{if ($1!="'"$refseq2"'") printf "%s.fasta\n",$1}' my2.seqs >> seqs2.dat

#Make the tree and ref-nonref pairs.dat files (with new names).
#  pairs.ref.dat pairs the reference with every other sequence, using the
#  bare numbers.  tail -n +2 skips the reference line itself (the bare +2
#  form is obsolete).
tail -n +2 seqs2.dat | awk '{printf "%s %s\n","'"$refseq2"'",$1}' \
  | sed 's/seq\.//g' | sed 's/\.fasta//' > pairs.ref.dat
rm -f my2.seqs seqs2.dat
#  pairs.tree.dat starts as the user pairs with @ appended to each name, so
#  names not yet translated are told apart from the numbers replacing them.
awk '{printf "%s@ %s@\n",$1,$2}' allpairs.txt > pairs.tree.dat
foreach i (`awk '{printf "%s:%s\n",$1,$2}' seqs.key`)
  set old = `echo $i | awk -F: '{printf "%s@\n",$1}'`
  set new = `echo $i | awk -F: '{print $2}' | sed 's/seq\.//'`
  awk '{if ($1=="'"$old"'") {print "'"$new"'",$2} else {print $0}}' \
    pairs.tree.dat | \
    awk '{if ($2=="'"$old"'") {print $1,"'"$new"'"} else {print $0}}' \
    > temp1
    mv temp1 pairs.tree.dat
end

#Make orfs.1, orfs.2 with proper formatting.
#  Every non-digit becomes a blank; each line is prefixed by its field count.
sed 's/[^0-9]/ /g' orfs.1.txt | awk '{print NF,$0}' > orfs.1
sed 's/[^0-9]/ /g' orfs.2.txt | awk '{print NF,$0}' > orfs.2

#------------------------------------------------------------------------------

#Length of the reference sequence once alignment gaps are removed.
degapseq ${refseq2}.fasta temp.fasta -auto
set length = `infoseq temp.fasta -auto -only -length | tail -1`
rm -f temp.fasta

#wholeseq 1: the range is the whole reference sequence.
if ($wholeseq == "1") then 
  @ range1 = 1
  @ range2 = $length
endif

#Evaluate both bounds as numbers and make sure range1 is the smaller one.
@ range1 = $range1
@ range2 = $range2
if ($range2 < $range1) then
  @ swap = $range2
  @ range2 = $range1
  @ range1 = $swap
endif

#wholeseq 2: the range runs from the smallest to the largest number found
#  in the original query CDSs file.
if ($wholeseq == "2") then 
  @ range1 = `sed -e 's/[^0-9]/x/g' -e 'y/x/\n/' orfs.2.org | grep "[0-9]" | sort -n | head -1`
  @ range2 = `sed -e 's/[^0-9]/x/g' -e 'y/x/\n/' orfs.2.org | grep "[0-9]" | sort -n | tail -1`
endif

#Clamp the range to the sequence boundaries.
if ($range1 < 1) @ range1 = 1
if ($range2 > $length) @ range2 = $length

#Write setup.mlogd and append the final range and length to mlogd.param.
echo $wholeseq $range1 $range2 " " >  setup.mlogd
echo $circular " " >> setup.mlogd
echo "set range1 = $range1 " >> mlogd.param
echo "set range2 = $range2 " >> mlogd.param
echo "set qlength = $length " >> mlogd.param

#------------------------------------------------------------------------------

echo "...Done<br><br>"