#!/bin/csh
# Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin,
# New Zealand, aef(at)sanger.otago.ac.nz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#Checks and prepares user input before running domlogd, domcsims, doallorfs
# or dosixframe scripts.
limit stacksize unlimited
@ argc = `echo $argv | awk '{print NF}'`
if ($argc != 2) then
echo "Usage: prepmlogd newdir mode
"
exit 1
endif
set newdir = $1
cd $newdir
source mlogd.param
set mode = $2
if (! $download) then
setenv PATH ${PATH}:/home/aef/bin
else
setenv PATH ${PATH}:${HOME}/bin
endif
cp ../../SCRIPTS/aa2codon.dat .
cp ../../SCRIPTS/aa.dat .
cp ../../SCRIPTS/codon.dat .
cp ../../SCRIPTS/nuc.dat .
if ($download) then
touch email
endif
@ maxlength = 35000
@ maxnseqs = 50
@ maxpairs = 50
cp allpairs.txt allpairs.org
cp allseqs.txt allseqs.org
cp orfs.1.txt orfs.1.org
cp orfs.2.txt orfs.2.org
cp email email.org
cp mlogd.param mlogd.param.org
echo -n "Preparing input data...
"
echo Started `date` > errorlog.txt
#------------------------------------------------------------------------------
#Check all required programmes and files are present.
set test = 0
foreach i (seqret infoseq noreturn getorf degapseq calcprob mcsim minmax mlogd ntadjust runmean2 R)
if (! -X $i) then
echo "Error: Can't find software '$i'.
"
set test = 1
endif
end
if ($test) then
exit 1
endif
foreach i (allseqs.txt allpairs.txt orfs.1.txt orfs.2.txt email mlogd.param aa2codon.dat aa.dat codon.dat nuc.dat)
if (! -r $i) then
echo "Error: Can't find file '$i'.
"
set test = 1
endif
end
if ($test) then
exit 1
endif
#------------------------------------------------------------------------------
#Clean up input data files.
foreach i (allseqs.txt allpairs.txt orfs.1.txt orfs.2.txt email mlogd.param)
noreturn $i temp -auto
mv temp $i
end
awk '{print $1,$2,$3,$4}' mlogd.param | grep "set" > temp
mv temp mlogd.param
awk '{print $1}' orfs.1.txt | grep "[0-9]" > temp
mv temp orfs.1.txt
awk '{print $1}' orfs.2.txt | grep "[0-9]" > temp
mv temp orfs.2.txt
@ w5 = `wc -l orfs.1.txt | awk '{print $1}'`
if ($w5 == 0) then
echo "Warning: Known CDSs file is empty. Therefore null model is non-coding.
"
endif
@ w6 = `wc -l orfs.2.txt | awk '{print $1}'`
if ("0" != $mode) then
if ($w6 != 0) then
echo "Warning: You entered 'Query CDSs'; however these will not be used because you didn't select 'Test input query CDSs' for the operating mode.
"
endif
else
if ($w6 == 0) then
echo "Error: Query CDSs file is empty. Therefore there is no alternative model to test.
"
exit 1
endif
endif
head -1 email | awk '{print $1}' > temp
mv temp email
sed 's/^[[:blank:]]*//' allseqs.txt > temp
set w4 = `grep -n "." temp | head -1 | awk -F: '{print $1}'`
if ($w4) then
tail +$w4 temp > allseqs.txt
else
echo "Error: Sequences file is empty.
"
exit 1
endif
awk '{if (NF!=2) printf "Error: Pairs file line %s has %s fields, instead of two.
\n",NR,NF}' allpairs.txt
set w1 = `awk '{if (NF!=2) print 0}' allpairs.txt | wc -l | awk '{print $1}'`
if ($w1 != 0) then
exit 1
endif
awk '{print $1,$2}' allpairs.txt | grep "[a-zA-Z0-9_\-\.\+]" > temp
mv temp allpairs.txt
@ w2 = `wc -l allpairs.txt | awk '{print $1}'`
if ($w2 == 0) then
echo "Error: Pairs file is empty.
"
exit 1
endif
if ($w2 > $maxpairs) then
echo "Error: Maximum number of pairs in pairs file is $maxpairs.
"
echo "You entered $w2.
"
exit 1
endif
#------------------------------------------------------------------------------
#Make sequences files.
#Note seqret changes '.'s to '-'s, but doesn't strip gaps. Preserves
# non-ACGTU if they are valid ambiguous nt codes, but non valid codes,
# e.g. O, are replaced with '-'.
seqret allseqs.txt temp -osformat fasta -auto || exit 1
mv temp allseqs.txt
grep ">" allseqs.txt | sed 's/^>//' | awk '{print $1}' > my.seqs
@ nseqs = `wc -l my.seqs | awk '{print $1}'`
echo "You entered $nseqs sequences.
"
echo "Reference sequence = $refseq.
"
if ($nseqs == 0) then
echo "Error: No input sequences.
"
exit 1
endif
if ($nseqs == 1) then
echo "Error: Need at least two input sequences.
"
exit 1
endif
if ($nseqs > $maxnseqs) then
echo "Error: Maximum number of input sequences is $maxnseqs. You entered $nseqs.
"
exit 1
endif
#Check reference sequence name is in the sequences file.
@ w = `awk '{if ($1=="'"$refseq"'") s+=1}END{print s}' my.seqs`
if ($w == 0) then
echo "Error: Reference sequence name '$refseq' is not in sequences file.
"
exit 1
endif
#Check no identical names in sequences file.
@ count = 1
foreach i (`tail +2 my.seqs`)
foreach j (`head -$count my.seqs`)
if ($i == $j) then
echo "Error: Sequence name '$i' is used for more than one sequence in sequences file.
"
echo "Sequence names must be unique.
"
exit 1
endif
end
@ count += 1
end
#Check all names in pairs file are in sequences file.
foreach i (`cat allpairs.txt`)
@ w = `awk '{if ($1=="'"$i"'") s+=1}END{print s}' my.seqs`
if ($w == 0) then
echo "Error: Sequence '$i' in pairs file is not in sequences file.
"
exit 1
endif
end
#Move refseq to head of sequences file.
echo $refseq > temp1
awk '{if ($1!="'"$refseq"'") print $1}' my.seqs >> temp1
mv temp1 my.seqs
#Make fasta files, rename sequences and make key for old <-> new names.
grep -n ">" allseqs.txt | sed 's/>/ /' | sed 's/:/ /' | awk '{print $1,$2}' \
> temp1
tail +2 temp1 | awk '{print $1-1}' > temp2
wc -l allseqs.txt | awk '{print $1}' >> temp2
paste temp1 temp2 > temp4
awk '{if ($2=="'"$refseq"'") printf "%s:%s:%s\n",$1,1+$3-$1,$2}' temp4 > temp3
awk '{if ($2!="'"$refseq"'") printf "%s:%s:%s\n",$1,1+$3-$1,$2}' temp4 >> temp3
rm -f temp4
touch seqs.key
@ count = 0
foreach i (`cat temp3`)
@ count += 1
set id = `echo $count | awk '{printf "%03i\n",$1}'`
set l1 = `echo $i | awk -F: '{print $1}'`
set l2 = `echo $i | awk -F: '{print $2}'`
set name = `echo $i | awk -F: '{print $3}'`
tail +$l1 allseqs.txt | head -$l2 | sed 's/>.*/>seq.'$id'/' > temp4
seqret temp4 seq.$id.fasta -auto -osformat fasta
echo $name seq.$id >> seqs.key
@ w = `infoseq seq.$id.fasta -auto -only -length | tail -1`
if ($w > $maxlength) then
echo "Error: sequence '$name' exceeds maximum length (including alignment gaps) of $maxlength nt.
"
exit 1
endif
rm -f temp4
end
rm -f temp[123]
awk '{print $2}' seqs.key > my.seqs
#Check for and remove identical sequences.
rm -f temp; touch temp
foreach i (`cat my.seqs`)
degapseq $i.fasta temp1 -auto
tail +2 temp1 | sed 'y/ACGTUuMRWVHDBSYKNX/acgtttmrwvhdbsyknx/' \
> rfge8345dsf.$i
echo rfge8345dsf.$i >> temp
end
rm -f temp[12]
rm -f my2.seqs
@ count = 1
head -1 my.seqs > my2.seqs
foreach i (`tail +2 my.seqs`)
@ t2 = 1
foreach j (`head -$count temp`)
@ t1 = `diff rfge8345dsf.$i $j | wc -l | awk '{print $1}'`
if ($t1 == 0) then
@ t2 = 0
set old = `echo $j | sed 's/rfge8345dsf\.//'`
set i2 = `awk '{if ($2=="'"$i"'") print $1}' seqs.key`
set old2 = `awk '{if ($2=="'"$old"'") print $1}' seqs.key`
echo "Sequence '$i2' identical to '$old2'. Omitting '$i2'.
"
awk '{if ($1=="'"$i2"'") {print "'"$old2"'",$2} else {print $0}}' \
allpairs.txt | \
awk '{if ($2=="'"$i2"'") {print $1,"'"$old2"'"} else {print $0}}' \
> temp1
mv temp1 allpairs.txt
break
endif
end
if ($t2) then
echo $i >> my2.seqs
endif
@ count += 1
end
@ w = `wc -l my2.seqs | awk '{print $1}'`
if ($w <= 1) then
echo "Error: After removing duplicated identical sequences, only $w sequence left.
Need at least two sequences to proceed."
exit 1
endif
awk '{if ($1!=$2) print $0}' allpairs.txt > temp
mv temp allpairs.txt
@ w = `wc -l allpairs.txt | awk '{print $1}'`
if ($w == 0) then
echo "Error: After removing duplicated identical sequences, user-input pairs file is empty."
exit 1
endif
rm -f rfge8345dsf.*
#Renumber sequences if any removed (added 22/02/06).
@ w7 = `wc -l my.seqs | awk '{print $1}'`
@ w8 = `wc -l my2.seqs | awk '{print $1}'`
if ($w7 != $w8) then
diff my.seqs my2.seqs | grep "<" | awk '{print $2}' > temp2
foreach i (`cat temp2`)
rm -f $i.fasta
awk '{if ($2!="'"$i"'") print $0}' seqs.key > temp1
mv temp1 seqs.key
end
rm -f temp2
@ count = 0
while ($count < $w8)
@ count += 1
set new = `echo $count | awk '{printf "seq.%03i\n",$1}'`
set old = `head -$count my2.seqs | tail -1`
if ($old != $new) then
mv $old.fasta $new.fasta
sed 's/'$old'/'$new'/' my2.seqs > temp1
mv temp1 my2.seqs
cat seqs.key | \
awk '{if ($2=="'"$old"'") {print $1,"'"$new"'"} else {print $0}}' \
> temp1
mv temp1 seqs.key
endif
end
endif
cp my2.seqs my.seqs
#------------------------------------------------------------------------------
#Make the seqs.dat (with new names and refseq first).
set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key`
echo $refseq2.fasta > seqs.dat
echo $refseq2.fasta > seqs2.dat
awk '{if ($1!="'"$refseq2"'") printf "%s.fasta\n",$1}' my.seqs >> seqs.dat
awk '{if ($1!="'"$refseq2"'") printf "%s.fasta\n",$1}' my2.seqs >> seqs2.dat
#Make the tree and ref-nonref pairs.dat files (with new names).
tail +2 seqs2.dat | awk '{printf "%s %s\n","'"$refseq2"'",$1}' \
| sed 's/seq\.//g' | sed 's/\.fasta//' > pairs.ref.dat
rm -f my2.seqs seqs2.dat
awk '{printf "%s@ %s@\n",$1,$2}' allpairs.txt > pairs.tree.dat
foreach i (`awk '{printf "%s:%s\n",$1,$2}' seqs.key`)
set old = `echo $i | awk -F: '{printf "%s@\n",$1}'`
set new = `echo $i | awk -F: '{print $2}' | sed 's/seq\.//'`
awk '{if ($1=="'"$old"'") {print "'"$new"'",$2} else {print $0}}' \
pairs.tree.dat | \
awk '{if ($2=="'"$old"'") {print $1,"'"$new"'"} else {print $0}}' \
> temp1
mv temp1 pairs.tree.dat
end
#Make orfs.1, orfs.2 with proper formatting.
sed 's/[^0-9]/ /g' orfs.1.txt | awk '{print NF,$0}' > orfs.1
sed 's/[^0-9]/ /g' orfs.2.txt | awk '{print NF,$0}' > orfs.2
#------------------------------------------------------------------------------
#Find reference sequence length.
degapseq $refseq2.fasta temp.fasta -auto
set length = `infoseq temp.fasta -auto -only -length | tail -1`
rm -f temp.fasta
#If 1 == wholeseq, set range1, range2 to sequence boundaries.
if ("1" == $wholeseq) then
@ range1 = 1
@ range2 = $length
endif
#If 0 == wholeseq, check range1 < range2.
@ range1 = $range1
@ range2 = $range2
if ($range1 > $range2) then
@ temp = $range1
@ range1 = $range2
@ range2 = $temp
endif
#If 2 == wholeseq, find minimum range encompassing query ORFs.
if ("2" == $wholeseq) then
@ range1 = `sed 's/[^0-9]/x/g' orfs.2.org | sed 'y/x/\n/' | grep "[0-9]" | sort -n | head -1`
@ range2 = `sed 's/[^0-9]/x/g' orfs.2.org | sed 'y/x/\n/' | grep "[0-9]" | sort -n | tail -1`
endif
#Check range1 and range2 are within sequence.
if ($range1 < 1) then
@ range1 = 1
endif
if ($range2 > $length) then
@ range2 = $length
endif
#Make setup.mlogd and update mlogd.param.
echo $wholeseq $range1 $range2 " " > setup.mlogd
echo $circular " " >> setup.mlogd
echo "set range1 = $range1 " >> mlogd.param
echo "set range2 = $range2 " >> mlogd.param
echo "set qlength = $length " >> mlogd.param
#------------------------------------------------------------------------------
echo "...Done
"