#!/bin/csh
# Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin,
# New Zealand, aef(at)sanger.otago.ac.nz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.

#Checks and prepares user input before running domlogd, domcsims, doallorfs
#  or dosixframe scripts.
#
#Usage: prepmlogd newdir mode
#  newdir  Working directory; the script changes into it and sources
#          mlogd.param from there.
#  mode    Operating mode; 0 is the mode in which the query CDSs (orfs.2.txt)
#          are required, for any other value they are ignored with a warning.
#
#The variables download, refseq, wholeseq, range1, range2 and circular are
#  used but never set in this script, so they are expected to come from the
#  sourced mlogd.param.
#
#NOTE(review): several messages contain an embedded line break, written here
#  as backslash-newline (the csh way to put a newline inside quotes).  These
#  breaks may stand in for markup lost from this copy of the script (e.g.
#  HTML <br>) - confirm against the deployed version.

limit stacksize unlimited

@ argc = `echo $argv | awk '{print NF}'`
if ($argc != 2) then
  echo "Usage: prepmlogd newdir mode\
"
  exit 1
endif

set newdir = $1
cd $newdir
source mlogd.param
set mode = $2

if (! $download) then
  setenv PATH ${PATH}:/home/aef/bin
else
  setenv PATH ${PATH}:${HOME}/bin
endif

cp ../../SCRIPTS/aa2codon.dat .
cp ../../SCRIPTS/aa.dat .
cp ../../SCRIPTS/codon.dat .
cp ../../SCRIPTS/nuc.dat .

if ($download) then
  touch email
endif

#Hard limits on the size of the user input.
@ maxlength = 35000
@ maxnseqs = 50
@ maxpairs = 50

#Keep untouched copies of the user input; the .txt files are rewritten below.
cp allpairs.txt allpairs.org
cp allseqs.txt allseqs.org
cp orfs.1.txt orfs.1.org
cp orfs.2.txt orfs.2.org
cp email email.org
cp mlogd.param mlogd.param.org

echo -n "Preparing input data...\
"
echo Started `date` > errorlog.txt

#------------------------------------------------------------------------------
#Check all required programmes and files are present.

set test = 0
foreach i (seqret infoseq noreturn getorf degapseq calcprob mcsim minmax mlogd ntadjust runmean2 R)
  if (! -X $i) then
    echo "Error: Can't find software '$i'.\
"
    set test = 1
  endif
end
if ($test) then
  exit 1
endif

foreach i (allseqs.txt allpairs.txt orfs.1.txt orfs.2.txt email mlogd.param aa2codon.dat aa.dat codon.dat nuc.dat)
  if (! -r $i) then
    echo "Error: Can't find file '$i'.\
"
    set test = 1
  endif
end
if ($test) then
  exit 1
endif

#------------------------------------------------------------------------------
#Clean up input data files.

foreach i (allseqs.txt allpairs.txt orfs.1.txt orfs.2.txt email mlogd.param)
  noreturn $i temp -auto
  mv temp $i
end

#Keep only the first four fields of the 'set' lines in mlogd.param.
awk '{print $1,$2,$3,$4}' mlogd.param | grep "set" > temp
mv temp mlogd.param

#Keep only the first field of ORF lines that contain a digit.
awk '{print $1}' orfs.1.txt | grep "[0-9]" > temp
mv temp orfs.1.txt
awk '{print $1}' orfs.2.txt | grep "[0-9]" > temp
mv temp orfs.2.txt

@ w5 = `wc -l orfs.1.txt | awk '{print $1}'`
if ($w5 == 0) then
  echo "Warning: Known CDSs file is empty. Therefore null model is non-coding.\
"
endif

@ w6 = `wc -l orfs.2.txt | awk '{print $1}'`
if ("0" != $mode) then
  if ($w6 != 0) then
    echo "Warning: You entered 'Query CDSs'; however these will not be used because you didn't select 'Test input query CDSs' for the operating mode.\
"
  endif
else
  if ($w6 == 0) then
    echo "Error: Query CDSs file is empty. Therefore there is no alternative model to test.\
"
    exit 1
  endif
endif

#Keep only the first word of the first line of the email file.
head -n 1 email | awk '{print $1}' > temp
mv temp email

#Strip leading blanks, then drop everything before the first non-empty line.
#  The awk END block prints 0 when there is no such line, so w4 is always a
#  number and the empty-file error below is actually reached.
sed 's/^[[:blank:]]*//' allseqs.txt > temp
set w4 = `grep -n "." temp | head -n 1 | awk -F: '{n=$1} END{print n+0}'`
if ($w4 != 0) then
  tail -n +$w4 temp > allseqs.txt
else
  echo "Error: Sequences file is empty.\
"
  exit 1
endif

#Every line of the pairs file must have exactly two fields.
awk '{if (NF!=2) printf "Error: Pairs file line %s has %s fields, instead of two.\n\n",NR,NF}' allpairs.txt
set w1 = `awk '{if (NF!=2) print 0}' allpairs.txt | wc -l | awk '{print $1}'`
if ($w1 != 0) then
  exit 1
endif
awk '{print $1,$2}' allpairs.txt | grep "[a-zA-Z0-9_\-\.\+]" > temp
mv temp allpairs.txt

@ w2 = `wc -l allpairs.txt | awk '{print $1}'`
if ($w2 == 0) then
  echo "Error: Pairs file is empty.\
"
  exit 1
endif
if ($w2 > $maxpairs) then
  echo "Error: Maximum number of pairs in pairs file is $maxpairs.\
"
  echo "You entered $w2.\
"
  exit 1
endif

#------------------------------------------------------------------------------
#Make sequences files.

#Note seqret changes '.'s to '-'s, but doesn't strip gaps.  Preserves
#  non-ACGTU if they are valid ambiguous nt codes, but non valid codes,
#  e.g. O, are replaced with '-'.
seqret allseqs.txt temp -osformat fasta -auto || exit 1
mv temp allseqs.txt

#my.seqs holds one sequence name per line (first word of each fasta header).
grep ">" allseqs.txt | sed 's/^>//' | awk '{print $1}' > my.seqs
@ nseqs = `wc -l my.seqs | awk '{print $1}'`
echo "You entered $nseqs sequences.\
"
echo "Reference sequence = $refseq.\
"
if ($nseqs == 0) then
  echo "Error: No input sequences.\
"
  exit 1
endif
if ($nseqs == 1) then
  echo "Error: Need at least two input sequences.\
"
  exit 1
endif
if ($nseqs > $maxnseqs) then
  echo "Error: Maximum number of input sequences is $maxnseqs. You entered $nseqs.\
"
  exit 1
endif

#Check reference sequence name is in the sequences file.
#  's+0' makes awk print 0 (not an empty line) when nothing matches, so the
#  '@' assignment always receives a number.
@ w = `awk '{if ($1=="'"$refseq"'") s+=1}END{print s+0}' my.seqs`
if ($w == 0) then
  echo "Error: Reference sequence name '$refseq' is not in sequences file.\
"
  exit 1
endif

#Check no identical names in sequences file: compare each name from the
#  second onwards with all the names that precede it.
@ count = 1
foreach i (`tail -n +2 my.seqs`)
  foreach j (`head -n $count my.seqs`)
    if ("$i" == "$j") then
      echo "Error: Sequence name '$i' is used for more than one sequence in sequences file.\
"
      echo "Sequence names must be unique.\
"
      exit 1
    endif
  end
  @ count += 1
end

#Check all names in pairs file are in sequences file.
foreach i (`cat allpairs.txt`)
  @ w = `awk '{if ($1=="'"$i"'") s+=1}END{print s+0}' my.seqs`
  if ($w == 0) then
    echo "Error: Sequence '$i' in pairs file is not in sequences file.\
"
    exit 1
  endif
end

#Move refseq to head of sequences file.
echo $refseq > temp1
awk '{if ($1!="'"$refseq"'") print $1}' my.seqs >> temp1
mv temp1 my.seqs

#Make fasta files, rename sequences and make key for old <-> new names.
#  temp1: header line number and name of each sequence.
#  temp2: last line number of each sequence.
#  temp3: start:nlines:name records, reference sequence first.
grep -n ">" allseqs.txt | sed 's/>/ /' | sed 's/:/ /' | awk '{print $1,$2}' \
  > temp1
tail -n +2 temp1 | awk '{print $1-1}' > temp2
wc -l allseqs.txt | awk '{print $1}' >> temp2
paste temp1 temp2 > temp4
awk '{if ($2=="'"$refseq"'") printf "%s:%s:%s\n",$1,1+$3-$1,$2}' temp4 > temp3
awk '{if ($2!="'"$refseq"'") printf "%s:%s:%s\n",$1,1+$3-$1,$2}' temp4 >> temp3
rm -f temp4
touch seqs.key
@ count = 0
foreach i (`cat temp3`)
  @ count += 1
  set id = `echo $count | awk '{printf "%03i\n",$1}'`
  set l1 = `echo $i | awk -F: '{print $1}'`
  set l2 = `echo $i | awk -F: '{print $2}'`
  set name = `echo $i | awk -F: '{print $3}'`
  tail -n +$l1 allseqs.txt | head -n $l2 | sed 's/>.*/>seq.'$id'/' > temp4
  seqret temp4 seq.$id.fasta -auto -osformat fasta
  echo $name seq.$id >> seqs.key
  @ w = `infoseq seq.$id.fasta -auto -only -length | tail -n 1`
  if ($w > $maxlength) then
    echo "Error: sequence '$name' exceeds maximum length (including alignment gaps) of $maxlength nt.\
"
    exit 1
  endif
  rm -f temp4
end
rm -f temp[123]
awk '{print $2}' seqs.key > my.seqs

#Check for and remove identical sequences.
#  Each sequence is degapped and case/U->T normalised into rfge8345dsf.<name>;
#  the file 'temp' lists those files in the same order as my.seqs.
rm -f temp; touch temp
foreach i (`cat my.seqs`)
  degapseq $i.fasta temp1 -auto
  tail -n +2 temp1 | sed 'y/ACGTUuMRWVHDBSYKNX/acgtttmrwvhdbsyknx/' \
    > rfge8345dsf.$i
  echo rfge8345dsf.$i >> temp
end
rm -f temp[12]
rm -f my2.seqs
@ count = 1
head -n 1 my.seqs > my2.seqs
foreach i (`tail -n +2 my.seqs`)
  #t2 stays 1 only if sequence i differs from every sequence before it.
  @ t2 = 1
  foreach j (`head -n $count temp`)
    @ t1 = `diff rfge8345dsf.$i $j | wc -l | awk '{print $1}'`
    if ($t1 == 0) then
      @ t2 = 0
      set old = `echo $j | sed 's/rfge8345dsf\.//'`
      set i2 = `awk '{if ($2=="'"$i"'") print $1}' seqs.key`
      set old2 = `awk '{if ($2=="'"$old"'") print $1}' seqs.key`
      echo "Sequence '$i2' identical to '$old2'. Omitting '$i2'.\
"
      #Point every pair that used the omitted name at the retained one.
      awk '{if ($1=="'"$i2"'") {print "'"$old2"'",$2} else {print $0}}' \
        allpairs.txt | \
        awk '{if ($2=="'"$i2"'") {print $1,"'"$old2"'"} else {print $0}}' \
        > temp1
      mv temp1 allpairs.txt
      break
    endif
  end
  if ($t2) then
    echo $i >> my2.seqs
  endif
  @ count += 1
end
@ w = `wc -l my2.seqs | awk '{print $1}'`
if ($w <= 1) then
  echo "Error: After removing duplicated identical sequences, only $w sequence left.\
Need at least two sequences to proceed."
  exit 1
endif
#Drop pairs that now compare a sequence with itself.
awk '{if ($1!=$2) print $0}' allpairs.txt > temp
mv temp allpairs.txt
@ w = `wc -l allpairs.txt | awk '{print $1}'`
if ($w == 0) then
  echo "Error: After removing duplicated identical sequences, user-input pairs file is empty."
  exit 1
endif
rm -f rfge8345dsf.*

#Renumber sequences if any removed (added 22/02/06).
@ w7 = `wc -l my.seqs | awk '{print $1}'`
@ w8 = `wc -l my2.seqs | awk '{print $1}'`
if ($w7 != $w8) then
  #Delete the fasta file and key entry of every omitted sequence.
  diff my.seqs my2.seqs | grep "<" | awk '{print $2}' > temp2
  foreach i (`cat temp2`)
    rm -f $i.fasta
    awk '{if ($2!="'"$i"'") print $0}' seqs.key > temp1
    mv temp1 seqs.key
  end
  rm -f temp2
  #Close the gaps so the remaining sequences are seq.001, seq.002, ...
  @ count = 0
  while ($count < $w8)
    @ count += 1
    set new = `echo $count | awk '{printf "seq.%03i\n",$1}'`
    set old = `head -n $count my2.seqs | tail -n 1`
    if ($old != $new) then
      mv $old.fasta $new.fasta
      sed 's/'$old'/'$new'/' my2.seqs > temp1
      mv temp1 my2.seqs
      cat seqs.key | \
        awk '{if ($2=="'"$old"'") {print $1,"'"$new"'"} else {print $0}}' \
        > temp1
      mv temp1 seqs.key
    endif
  end
endif
cp my2.seqs my.seqs

#------------------------------------------------------------------------------
#Make the seqs.dat (with new names and refseq first).
set refseq2 = `awk '{if ($1=="'"$refseq"'") print $2}' seqs.key`
echo $refseq2.fasta > seqs.dat
echo $refseq2.fasta > seqs2.dat
awk '{if ($1!="'"$refseq2"'") printf "%s.fasta\n",$1}' my.seqs >> seqs.dat
awk '{if ($1!="'"$refseq2"'") printf "%s.fasta\n",$1}' my2.seqs >> seqs2.dat

#Make the tree and ref-nonref pairs.dat files (with new names).
#pairs.ref.dat: one line per non-reference sequence, pairing the reference
#  with it (the 'seq.' prefix and '.fasta' suffix are stripped from the
#  seqs2.dat entries).
tail -n +2 seqs2.dat | awk '{printf "%s %s\n","'"$refseq2"'",$1}' \
  | sed 's/seq\.//g' | sed 's/\.fasta//' > pairs.ref.dat
rm -f my2.seqs seqs2.dat

#pairs.tree.dat: the user pairs with names translated through seqs.key.
#  A trailing '@' marks names not yet translated, so a name is only replaced
#  on an exact whole-field match.
awk '{printf "%s@ %s@\n",$1,$2}' allpairs.txt > pairs.tree.dat
foreach i (`awk '{printf "%s:%s\n",$1,$2}' seqs.key`)
  set old = `echo $i | awk -F: '{printf "%s@\n",$1}'`
  set new = `echo $i | awk -F: '{print $2}' | sed 's/seq\.//'`
  awk '{if ($1=="'"$old"'") {print "'"$new"'",$2} else {print $0}}' \
    pairs.tree.dat | \
    awk '{if ($2=="'"$old"'") {print $1,"'"$new"'"} else {print $0}}' \
    > temp1
  mv temp1 pairs.tree.dat
end

#Make orfs.1, orfs.2 with proper formatting: every non-digit becomes a blank
#  and each line is prefixed with its number of fields.
sed 's/[^0-9]/ /g' orfs.1.txt | awk '{print NF,$0}' > orfs.1
sed 's/[^0-9]/ /g' orfs.2.txt | awk '{print NF,$0}' > orfs.2

#------------------------------------------------------------------------------
#Find reference sequence length (after removing gaps).
degapseq $refseq2.fasta temp.fasta -auto
set length = `infoseq temp.fasta -auto -only -length | tail -n 1`
rm -f temp.fasta

#If wholeseq is 1, set range1, range2 to sequence boundaries.
if ("1" == $wholeseq) then
  @ range1 = 1
  @ range2 = $length
endif

#If wholeseq is 0, check range1 < range2 (swap them otherwise).
@ range1 = $range1
@ range2 = $range2
if ($range1 > $range2) then
  @ temp = $range1
  @ range1 = $range2
  @ range2 = $temp
endif

#If wholeseq is 2, find minimum range encompassing query ORFs: the smallest
#  and largest numbers found anywhere in the original query ORFs file.
if ("2" == $wholeseq) then
  @ range1 = `sed 's/[^0-9]/x/g' orfs.2.org | sed 'y/x/\n/' | grep "[0-9]" | sort -n | head -n 1`
  @ range2 = `sed 's/[^0-9]/x/g' orfs.2.org | sed 'y/x/\n/' | grep "[0-9]" | sort -n | tail -n 1`
endif

#Check range1 and range2 are within sequence.
if ($range1 < 1) then
  @ range1 = 1
endif
if ($range2 > $length) then
  @ range2 = $length
endif

#Make setup.mlogd and update mlogd.param.
#setup.mlogd gets two lines: "wholeseq range1 range2" and "circular".
( echo $wholeseq $range1 $range2 " " ; echo $circular " " ) > setup.mlogd

#Append the final range and the reference sequence length to mlogd.param as
#  csh 'set' statements.
( echo "set range1 = $range1 " ; echo "set range2 = $range2 " ; echo "set qlength = $length " ) >> mlogd.param

#------------------------------------------------------------------------------
#Report completion.  The two line breaks inside the message are written as
#  backslash-newline, which csh turns into real newlines inside quotes.
#NOTE(review): these breaks may stand in for markup lost from this copy of
#  the script - confirm against the deployed version.
echo "...Done\
\
"