/* # Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin, # New Zealand, aef(at)sanger.otago.ac.nz # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License (version 2) as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. */ #include #include #include #include #include #include using namespace std; #define maxlength 35000 #define maxnseqs 201 #define maxpairs 200 // Function declarations. double calc_prob_col(double weight[][64], double nuc[][4], int sequences[][maxnseqs], int base0[3], int codonpos[][2], int seqid0[2], int model, int expnmuts, int aa2aa[64], double Vfix); int readsequence(char seq[128], int gsequences[][maxnseqs], int seqid); int readorfs(char cpos[128], int codonpos[][2], int length, int circular, int posid); int checkgaps(int lookup[][maxnseqs], int codonpos[][2], int nseq0, int seqid, int pos); int test_stop(double weight[][64], int sequences[][maxnseqs], int base0[3], int codonpos[][2], int seqid0[2], int pos); int stopcodon(int aa2aa[64], int sequences[][maxnseqs], int base0[2], int codonpos[][2], int seqid, int pos); int calc_nuc(double nuc[][4], double ttt, double nucB1[][4], double nucB2[][4], double nucD[][4]); int main(int argc, char* argv[]) { if (2 != argc) { cerr << "Aborting: need setup file; use '" << argv[0] << " setupfile'.\n"; exit(EXIT_FAILURE); } //--------------------------------------------------------------------------- // Variable declarations. int i, j, k, lines, nseq, nseqs, nseq0, temp95, temp98, temp99; int aa2aa[64], j1[2], j2[2], j3[2], jcid[2], seqid, seqid0[2], basex[2]; int maxiter, niter, wholeseq, range1, range2, tttobs, model, circular; int base1, base2, base, base0[3], gsequences[maxlength+10][maxnseqs]; int sequences[maxlength+10][maxnseqs], codonpos[maxlength+10][2], pos2[7][7]; int lookup[maxlength+10][maxnseqs], degnseq[maxnseqs], N0NsNnsum, npairs, np; int ntotal[maxnseqs], nmuts[maxnseqs], nmuts2[maxnseqs], norfs, test; int nuccount[maxnseqs], stopcount[maxnseqs][2], gapcount[maxnseqs]; int N0synnon[3][maxnseqs], N123[3][maxnseqs][2], flag[maxnseqs][2]; int ntree[maxlength+10]; double score[20][20], weight[64][64], codon[64], nucB1[4][4], nucB2[4][4]; double nucA[4][4], nucD[4][4], nuc[4][4], nuc0[4][4], nuc1[4][4]; double testA[4][4], tempA[4][4], epsilon, logepsilon, temp96, temp97; double ttt, tttmin, tttmax, tttfit, fitmuts, expmuts, lambda, lambdasum; double ettt[2], enmuts[2], emtx[2][maxnseqs], tttx[2][maxnseqs], Vfix; double N123r[3][maxnseqs], N0NsNn[3][maxnseqs], treesum[maxlength+10][3]; char aa[20], aa2codon[64], seq[128], temp, temp2[128]; int verbose = 0; tttmin = 0.0; // pairwise sequence divergence fitting params tttmax = 10.0; // " " " " " tttfit = 0.01; // " " " " " maxiter = 40; // " " " " " tttobs = 1; // use observed number of mutations lambda = 0; // only used if tttobs = 0 Vfix = 1.; // nonsynonymous:synonymous scaling epsilon = 0.001; // writes log of all mut probs < epsilon if (verbose) { cout << "\n"; } for (i = 0; i < maxlength+10; ++i) { treesum[i][0] = treesum[i][1] = treesum[i][2] = 0.; ntree[i] = 0; } lambdasum = 0.; //--------------------------------------------------------------------------- // Setup parameters. // Open setup file. ifstream setupfile(argv[1]); if (!setupfile) { cerr << "Aborting: can't find setup file '" << argv[1] << "'.\n"; exit(EXIT_FAILURE); } // Read in setup parameters. setupfile >> wholeseq; if (0 == wholeseq || 2 == wholeseq) { setupfile >> range1; setupfile >> range2; } setupfile.ignore(1000, '\n'); setupfile >> circular; setupfile.ignore(1000, '\n'); setupfile.close(); //--------------------------------------------------------------------------- // Amino acid, nucleotide and codon weight matrices. // Number of lines in 20 x 20 amino acid weight matrix. ifstream matrixfile("aa.dat"); if (!matrixfile) { cerr << "Aborting: can't find file 'aa.dat'.\n"; exit(EXIT_FAILURE); } lines = 0; while (matrixfile.ignore(1000, '\n')) { ++lines; } matrixfile.clear(); matrixfile.seekg(0); // Read amino acid weight matrix. temp = '#'; while ('#' == temp) { matrixfile.get(temp); if ('#' == temp) { matrixfile.ignore(1000, '\n'); } } for (j = 0; j < 20; ++j) { matrixfile >> aa[j]; } matrixfile.ignore(1000, '\n'); for (i = 0; i < 20; ++i) { for (j = 0; j <= i; ++j) { matrixfile >> score[i][j]; } matrixfile.ignore(1000, '\n'); } matrixfile.close(); // Fill in other half of weight matrix. for (i = 0; i < 20; ++i) { for (j = i + 1; j < 20; ++j) { score[i][j] = score[j][i]; } } // Check no values greater than corresponding values on the diagonal. for (i = 0; i < 20; ++i) { for (k = 0; k < 20; ++k) { if (score[i][k] > score[i][i] || score[i][k] > score[k][k]) { cerr << "Aborting: scoring matrix value at " << i+1 << ", " << k+1 << " is greater than value on diagonal.\n"; exit(EXIT_FAILURE); } } } // Check no negative values. for (i = 0; i < 20; ++i) { for (k = 0; k < 20; ++k) { if (score[i][k] < 0.) { cerr << "Aborting: scoring matrix value at " << i+1 << ", " << k+1 << " is negative.\n"; exit(EXIT_FAILURE); } } } // Read codon to amino acid table. ifstream aa2codonfile("aa2codon.dat"); if (!aa2codonfile) { cerr << "Aborting: can't find file 'aa2codon.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 64; ++i) { aa2codonfile.get(aa2codon[i]); aa2codonfile.ignore(1000, '\n'); } aa2codonfile.close(); // Lookup table connecting aa2codon[64] and aa[20] (aa[20] is the // order in which the amino acids appear in the weight matrix). for (i = 0; i < 64; ++i) { aa2aa[i] = 99; // for STOP codons for (j = 0; j < 20; ++j) { if (aa2codon[i] == aa[j]) { aa2aa[i] = j; } } } // Paste weights into 64 x 64 matrix; set stop codons = 1, 0. Weight // is for row i -> column j. for (i = 0; i < 64; ++i) { for (j = 0; j < 64; ++j) { if (99 == aa2aa[i] && 99 == aa2aa[j]) { weight[i][j] = 1.; } else if (99 == aa2aa[i]) { weight[i][j] = 0.; } else if (99 == aa2aa[j]) { weight[i][j] = 0.; } else { weight[i][j] = score[aa2aa[i]][aa2aa[j]]; } } } // Read 4 x 4 diagonalized nucleotide mutation matrix. Order is UCAG; // row i -> column k. Four matrices in input file: A, B, D, B^{-1}, where // A = B.D.B^{-1} and D is diagonal. A itself isn't actually used. ifstream nucfile("nuc.dat"); if (!nucfile) { cerr << "Aborting: can't find file 'nuc.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucA[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucB1[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucD[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucB2[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); nucfile.close(); // Check B.D.B{^-1}. for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) { tempA[i][j] = nucB1[i][j]*nucD[j][j]; } } for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) { testA[i][j] = 0.; for (k = 0; k < 4; ++k) { testA[i][j] += tempA[i][k]*nucB2[k][j]; } } } if (verbose) { cout << "Product B D B^{-1}\n"; for (i = 0; i < 4; ++i) { cout << " "; for (j = 0; j < 4; ++j) { cout << testA[i][j] << " "; } cout << "\n"; } cout << "\n"; } for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { if (i == k) { if (testA[i][k] > 0.) { cerr << "Aborting: nucleotide matrix value at " << i+1 << ", " << k+1 << " is positive.\n"; exit(EXIT_FAILURE); } } else { if (testA[i][k] < 0.) { cerr << "Aborting: nucleotide matrix value at " << i+1 << ", " << k+1 << " is negative.\n"; exit(EXIT_FAILURE); } } } } // Read codon usage table (scaling unimportant; set all values equal if you // don't want to use codon usage weights). ifstream codonfile("codon.dat"); if (!codonfile) { cerr << "Aborting: can't find file 'codon.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 64; ++i) { codonfile.getline(temp2, 128, ' '); codonfile >> codon[i]; codonfile.ignore(1000, '\n'); } codonfile.close(); for (i = 0; i < 64; ++i) { if (codon[i] < 0.) { cerr << "Aborting: codon usage table value at " << i+1 << " is negative.\n"; exit(EXIT_FAILURE); } } // Apply codon usage weighting. for (i = 0; i < 64; ++i) { for (j = 0; j < 64; ++j) { weight[i][j] *= codon[j]; } } //--------------------------------------------------------------------------- // Miscellaneous stuff. // Fitting parameters for ttt (evolutionary time). if (tttmin < 0.) { tttmin = 0.; } if (tttmax < tttmin) { tttmax = tttmin; } if (tttfit <= 0 || tttfit > 1.) { tttfit = 1.; } // Check epsilon. if (epsilon <= 0) { cerr << "\nRequire epsilon > 0. Adjusting to 0.001.\n"; epsilon = 0.001; } logepsilon = log(epsilon); // MLE/STOPs/gaps/glitches logfiles. ofstream logfile1("mlogd_out.MLElog"); if (!logfile1) { cerr << "Failed to open log file 'mlogd_out.MLElog'.\n"; exit(EXIT_FAILURE); } ofstream logfile2("mlogd_out.stoplog"); if (!logfile2) { cerr << "Failed to open log file 'mlogd_out.stoplog'.\n"; exit(EXIT_FAILURE); } ofstream logfile3("mlogd_out.gaplog"); if (!logfile3) { cerr << "Failed to open log file 'mlogd_out.gaplog'.\n"; exit(EXIT_FAILURE); } ofstream logfile4("mlogd_out.glitches"); if (!logfile4) { cerr << "Failed to open log file 'mlogd_out.glitches'.\n"; exit(EXIT_FAILURE); } //--------------------------------------------------------------------------- // Input sequences (msa - with gaps). First sequence in seqs.dat is // reference sequence. // Number of sequences in sequences file. ifstream sequencesfile("seqs.dat"); if (!sequencesfile) { cerr << "Aborting: can't find file 'seqs.dat'.\n"; exit(EXIT_FAILURE); } nseqs = -1; while (sequencesfile.ignore(1000, '\n')) { ++nseqs; } sequencesfile.clear(); sequencesfile.seekg(0); if (nseqs > maxnseqs) { cerr << "Aborting: too many sequence in file 'seqs.dat'.\n" << "Maximum is " << maxnseqs << "\n"; exit(EXIT_FAILURE); } // Read sequences. for (seqid = 0; seqid < nseqs; ++seqid) { sequencesfile.getline(seq, 128, '\n'); nseq = readsequence(seq, gsequences, seqid); if (verbose) { cout << "Read " << seq << ", length = " << nseq << ".\n"; } if (0 == seqid) { nseq0 = nseq; } else if (nseq != nseq0) { cerr << "Aborting: gapped sequences of differing length.\n"; exit(EXIT_FAILURE); } } sequencesfile.close(); // Degap sequences and make lookup table connecting input alignment // sequence coords to degapped coords. for (seqid = 0; seqid < nseqs; ++seqid) { degnseq[seqid] = 0; // degapped sequence lengths for (i = 0; i < nseq0; ++i) { if (9 != gsequences[i][seqid]) { // not a gap sequences[degnseq[seqid]][seqid] = gsequences[i][seqid]; lookup[i][seqid] = degnseq[seqid]; ++degnseq[seqid]; } else { lookup[i][seqid] = -1; // gap } } if (verbose) { cout << "Degapped sequence " << seqid << "; length = " << degnseq[seqid] << ".\n"; } } //--------------------------------------------------------------------------- // Codon position files (for degapped reference sequence). For each // nucleotide position: 0 = non-coding; 1/2/3 = 1st/2nd/3rd positions in // a forward read direction codon; 4/5/6 = 1st/2nd/3rd positions in a // reverse (complementary strand) read direction codon. codonpos[][0] // gives null model. codonpos[][1] gives alternative model. // codonpos[][0] + codonpos[][1] may be non-coding, single-coding (in // null or alternative), double-coding or a mixture, depending on sequence // position. Programme calculates the likelihood ratio of null to // alternative model. norfs = readorfs("orfs.1", codonpos, degnseq[0], circular, 0); if (verbose) { cout << "Read 'orfs.1', containing " << norfs << " ORFs.\n"; } norfs = readorfs("orfs.2", codonpos, degnseq[0], circular, 1); if (verbose) { cout << "Read 'orfs.2', containing " << norfs << " ORFs.\n"; } //--------------------------------------------------------------------------- // Find range boundaries (in degapped reference sequence). if (0 == wholeseq || 2 == wholeseq) { range1 -= 1; range2 -= 1; // since indices start at 0 if (range1 > range2) { // make range1 < range2 temp95 = range1; range1 = range2; range2 = temp95; } if (range2 > degnseq[0] - 1) { range2 = degnseq[0] - 1; } if (range1 < 0) { range1 = 0; } } else { // whole sequence range1 = 0; range2 = degnseq[0] - 1; } // Find corresponding boundaries in alignment coords if (0 == wholeseq || 2 == wholeseq) { for (base = 0; base < nseq0; ++base) { if (lookup[base][0] == range1) { // base in degapped refseq or -1 if gap base1 = base; // note if whole refseq, still removes leading gaps } if (lookup[base][0] == range2) { // base in degapped refseq or -1 if gap base2 = base; // note if whole refseq, still removes leading gaps } } } else { // whole sequence base1 = 0; base2 = nseq0 - 1; } //--------------------------------------------------------------------------- // Check for gaps not in groups of 3 in coding sequences // Note - have to do this before changing ambiguous nt to -1. for (seqid = 0; seqid < nseqs; ++seqid) { test = checkgaps(lookup, codonpos, nseq0, seqid, 0); } for (seqid = 0; seqid < nseqs; ++seqid) { test = checkgaps(lookup, codonpos, nseq0, seqid, 1); } // Change lookup[][] entries for ambiguous nt to -1. for (seqid = 0; seqid < nseqs; ++seqid) { for (i = 0; i < nseq0; ++i) { if (5 == sequences[lookup[i][seqid]][seqid]) { lookup[i][seqid] = -1; // ambiguous nt } } } //--------------------------------------------------------------------------- //If (circular), then shift data in sequences[][seqid] and codonpos[][2] // by +2. Append the last two nt to the beginning and the first 2 nt to // the end. Then correct lookup[][seqid] by adding +2 to each entry (except // for '-1's). Note that base1, base2, lookup[][seqid] are in msa coords, // which are unchanged; range1 and range2 are in refseq coords; hereafter // they are used just for pos2 calculation; therefore need to adjust. // not used again so are left as they are. It is impossible for // lookup[][seqid] to reference the 4 nt added to each sequence in // sequences[][seqid]. They are only there to be used as neighbouring nt in // codons in the subroutines test_stop and calc_prob_col. //If (!circular), if the first nt of seq_i is aligned to a N2 or N3 nt in // refseq, or the last nt of seq_i is aligned to a N1 or N2 nt in // refseq, then you may end up references out-of-bounds in // sequences[][seqid]. Avoid this by padding all seqs with 2 nt at // either end. For non-circular sequences, use 'N's - which will result // in any codon incorporating them being skipped, but at least you don't // reference out-of-bounds. for (seqid = 0; seqid < nseqs; ++seqid) { // update values in lookup[][] for (i = 0; i < nseq0; ++i) { if (-1 != lookup[i][seqid]) { lookup[i][seqid] += 2; } } } for (seqid = 0; seqid < nseqs; ++seqid) { // shift sequences[][] by +2 for (i = degnseq[seqid]+1; i >= 2; --i) { sequences[i][seqid] = sequences[i-2][seqid]; } if (circular) { // circularize sequences[1][seqid] = sequences[degnseq[seqid]+1][seqid]; sequences[0][seqid] = sequences[degnseq[seqid]][seqid]; sequences[degnseq[seqid]+3][seqid] = sequences[3][seqid] ; sequences[degnseq[seqid]+2][seqid] = sequences[2][seqid]; } else { // pad ends with 'N's sequences[1][seqid] = sequences[0][seqid] = sequences[degnseq[seqid]+3][seqid] = sequences[degnseq[seqid]+2][seqid] = 5; } } for (k = 0; k < 2; ++k) { // shift codonpos[][] by +2 for (i = degnseq[0]+1; i >= 2; --i) { codonpos[i][k] = codonpos[i-2][k]; } if (circular) { // circularize codonpos[1][k] = codonpos[degnseq[0]+1][k]; codonpos[0][k] = codonpos[degnseq[0]][k]; codonpos[degnseq[0]+3][k] = codonpos[3][k] ; codonpos[degnseq[0]+2][k] = codonpos[2][k]; } else { // pad ends with '0's codonpos[1][k] = codonpos[0][k] = codonpos[degnseq[0]+3][k] = codonpos[degnseq[0]+2][k] = 0; } } range1 += 2; range2 += 2; //--------------------------------------------------------------------------- // Write out logs of all stops in null and alternative model ORFs in each // sequence. Also log of all gaps (and ambiguous nt). // Note that codons containing ambiguous nt are assumed not to be stops. ofstream posfile1("mlogd_out.stoppos1"); if (!posfile1) { cerr << "Failed to open log file 'mlogd_out.stoppos1'.\n"; exit(EXIT_FAILURE); } ofstream posfile2("mlogd_out.stoppos2"); if (!posfile2) { cerr << "Failed to open log file 'mlogd_out.stoppos2'.\n"; exit(EXIT_FAILURE); } ofstream posfile3("mlogd_out.gappos"); if (!posfile3) { cerr << "Failed to open log file 'mlogd_out.gappos'.\n"; exit(EXIT_FAILURE); } ofstream posfile4("mlogd_out.startpos1"); if (!posfile4) { cerr << "Failed to open log file 'mlogd_out.startpos1'.\n"; exit(EXIT_FAILURE); } ofstream posfile5("mlogd_out.startpos2"); if (!posfile5) { cerr << "Failed to open log file 'mlogd_out.startpos2'.\n"; exit(EXIT_FAILURE); } for (seqid = 0; seqid < nseqs; ++seqid) { for (base = 0; base < nseq0; ++base) { basex[0] = lookup[base][0]; // base in degapped reference sequence basex[1] = lookup[base][seqid]; // base in degapped sequence if (-1 != basex[1] && -1 != basex[0]) { // not gapped or ambiguous nt // Note stops/starts that overlap a reference sequence gap will not // be annotated. temp98 = stopcodon(aa2aa, sequences, basex, codonpos, seqid, 0); temp99 = stopcodon(aa2aa, sequences, basex, codonpos, seqid, 1); if (0 == temp98) { posfile1 << seqid+1 << " " << base+1 << "\n"; } if (0 == temp99) { posfile2 << seqid+1 << " " << base+1 << "\n"; } if (4 == temp98) { posfile4 << seqid+1 << " " << base+1 << "\n"; } if (4 == temp99) { posfile5 << seqid+1 << " " << base+1 << "\n"; } } else { // gap or ambiguous nt if (-1 == basex[1]) { // gap or ambiguous nt in the non-ref seq posfile3 << seqid+1 << " " << base+1 << "\n"; } } } } posfile1.close(); posfile2.close(); posfile3.close(); posfile4.close(); posfile5.close(); //--------------------------------------------------------------------------- // If using 'Only use nt within query CDS(s)', mask all nt that are not // annotated as CDSs in the alternate model. if (2 == wholeseq) { for (base = 0; base < nseq0; ++base) { base0[2] = lookup[base][0]; // base in degapped refseq if (-1 != base0[2]) { // not gapped or ambiguous nt if (0 == codonpos[base0[2]][1]) { // non-coding in alt model lookup[base][0] = -1; } } } } //--------------------------------------------------------------------------- // Number of pairs in pairs file. ifstream pairsfile("pairs.dat"); if (!pairsfile) { cerr << "Aborting: can't find file 'pairs.dat'.\n"; exit(EXIT_FAILURE); } npairs = -1; while (pairsfile.ignore(1000, '\n')) { ++npairs; } pairsfile.clear(); pairsfile.seekg(0); if (npairs > maxpairs) { cerr << "Aborting: too many sequence pairs in file 'pairs.dat'.\n" << "Maximum is " << maxpairs << "\n"; exit(EXIT_FAILURE); } //--------------------------------------------------------------------------- // Step through sequence pairs. if (verbose) { cout << "\nWorking..."; } for (np = 0; np < npairs; ++np) { pairsfile >> seqid0[0]; pairsfile >> seqid0[1]; pairsfile.ignore(1000, '\n'); seqid0[0] -= 1; // since sequence indices start from 0 seqid0[1] -= 1; // since sequence indices start from 0 flag[0][np] = 0; // for failure of ttt fitting to converge (null model) flag[1][np] = 0; // for failure of ttt fitting to converge (alt model) nmuts[np] = 0; // number of point differences, inc stop <-> nonstop nmuts2[np] = 0; // ditto but not including stop <-> nonstop nuccount[np] = 0; // non-gap, not stop <-> nonstop, in both sequences gapcount[np] = 0; // gaps in one or both sequences ntotal[np] = 0; // non-gap in both sequences stopcount[np][0] = 0; // stop <-> nonstop in null model stopcount[np][1] = 0; // stop <-> nonstop in alternative model N123[0][np][0] = N123[1][np][0] = N123[2][np][0] = 0; N123[0][np][1] = N123[1][np][1] = N123[2][np][1] = 0; // Any ambiguous nt codes are treated as gaps (skipped over for most // statistics, written to log and plot files as gaps, counted as part of // gapcount). However they are not stripped out in the gap-stripping, so // CDSs are not put out-of-frame. If an ambiguous nt code is part of a // codon being referenced for a neighbouring nt, then test_stop will // return a 0 (zero-probability transition) and the nt will be logged // along with the zero-probability transitions. The subroutine // calc_prob_col is never referenced unless test_stop != 0 so shouldn't // cause any problems. for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][seqid0[0]]; // base in degapped seq1 base0[1] = lookup[base][seqid0[1]]; // base in degapped seq2 base0[2] = lookup[base][0]; // base in degapped refseq if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { // not gapped ntotal[np] += 1; if (1 == codonpos[base0[2]][0] || 4 == codonpos[base0[2]][0]) { N123[0][np][1] += 1; } else if (2 == codonpos[base0[2]][0] || 5 == codonpos[base0[2]][0]) { N123[1][np][1] += 1; } else if (3 == codonpos[base0[2]][0] || 6 == codonpos[base0[2]][0]) { N123[2][np][1] += 1; } else if (0 != codonpos[base0[2]][0]) { cerr << "Can't get here! (016)\n"; exit(EXIT_FAILURE); } if (sequences[base0[0]][seqid0[0]] != sequences[base0[1]][seqid0[1]]) { nmuts[np] += 1; if (1 == codonpos[base0[2]][0] || 4 == codonpos[base0[2]][0]) { N123[0][np][0] += 1; } else if (2 == codonpos[base0[2]][0] || 5 == codonpos[base0[2]][0]) { N123[1][np][0] += 1; } else if (3 == codonpos[base0[2]][0] || 6 == codonpos[base0[2]][0]) { N123[2][np][0] += 1; } else if (0 != codonpos[base0[2]][0]) { cerr << "Can't get here! (016)\n"; exit(EXIT_FAILURE); } } temp98 = test_stop(weight, sequences, base0, codonpos, seqid0, 0); temp99 = test_stop(weight, sequences, base0, codonpos, seqid0, 1); if (temp98 && temp99) { // not stop <-> non-stop in both models nuccount[np] += 1; if (sequences[base0[0]][seqid0[0]] != sequences[base0[1]][seqid0[1]]) { nmuts2[np] += 1; } } else { if (!temp98) { stopcount[np][0] += 1; logfile2 << seqid0[0]+1 << " " << seqid0[1]+1 << " " << base+1 << " 0\n"; } if (!temp99) { stopcount[np][1] += 1; logfile2 << seqid0[0]+1 << " " << seqid0[1]+1 << " " << base+1 << " 1\n"; } } } else { // gaps in one or both sequences gapcount[np] += 1; logfile3 << seqid0[0]+1 << " " << seqid0[1]+1 << " " << base+1 << "\n"; } } for (k = 0; k < 3; ++k) { if (0 != N123[k][np][1]) { N123r[k][np] = float(N123[k][np][0])/float(N123[k][np][1]); } else { N123r[k][np] = 0.; } } if (0 != ntotal[np]) { lambdasum += float(nmuts[np])/float(ntotal[np]); } else { lambdasum += 0.; } if (tttobs) { fitmuts = float(nmuts2[np]); // fit exp#muts to obs#muts } else { fitmuts = float(nuccount[np])*lambda; // fit exp#muts to lambda*#nucs } // Find ttt value giving closest match between expected # muts and // fitmuts. Use bisection method -> since nmuts increases monotonically // with ttt. Stop when abs(exp#muts - fitmuts) < tttfit. Take // best guess and flag, if number of iterations exceeds maxiter or if // fitmuts is outside range allowed by tttmin--tttmax. Note fitmuts // skips muts in gaps or stops <-> non-stops, since these are skipped by // MLOGD. for (model = 0; model < 2; ++model) { // null and alternative models if (0. == fitmuts) { tttx[model][np] = 0.; } else { ettt[0] = tttmin; ettt[1] = tttmax; // Calculate exp#muts for ettt[]. for (k = 0; k < 2; ++k) { test = calc_nuc(nuc, ettt[k], nucB1, nucB2, nucD); enmuts[k] = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][seqid0[0]]; // base in degapped seq1 base0[1] = lookup[base][seqid0[1]]; // base in degapped seq2 base0[2] = lookup[base][0]; // base in degapped refseq if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { temp98 = test_stop(weight,sequences,base0,codonpos,seqid0,0); temp99 = test_stop(weight,sequences,base0,codonpos,seqid0,1); if (temp98 && temp99) { // only if not stop <-> non-stop enmuts[k] += calc_prob_col(weight, nuc, sequences, base0, codonpos, seqid0, model, 1, aa2aa, Vfix); } } } } // Check tttmin--tttmax range is sufficient. if ((enmuts[0] - tttfit) > fitmuts) { cerr << "Fitting number of mutations (" << fitmuts << ") outside " << "that allowed by given t range. Sequences " << seqid0[0]+1 << " and " << seqid0[1]+1 << ".\n"; tttx[model][np] = tttmin; flag[model][np] = 1; } else if (enmuts[1] < fitmuts) { cerr << "Fitting number of mutations (" << fitmuts << ") outside " << "that allowed by given t range. Sequences " << seqid0[0]+1 << " and " << seqid0[1]+1 << ".\n"; tttx[model][np] = tttmax; flag[model][np] = 1; } else { // Iterate on ttt. niter = 0; while (abs(enmuts[0] - fitmuts) > tttfit) { niter++; if (niter > maxiter) { cerr << "Iteration on t failed to converge in " << maxiter << " iterations. Sequences " << seqid0[0]+1 << " and " << seqid0[1]+1 << ".\n"; flag[model][np] = 1; break; } // Bisector. ttt = 0.5 * (ettt[0] + ettt[1]); // Calculate expected number of mutations at bisector. test = calc_nuc(nuc, ttt, nucB1, nucB2, nucD); expmuts = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][seqid0[0]]; // base in degapped seq1 base0[1] = lookup[base][seqid0[1]]; // base in degapped seq2 base0[2] = lookup[base][0]; // base in degapped refseq if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { temp98 = test_stop(weight,sequences,base0,codonpos,seqid0,0); temp99 = test_stop(weight,sequences,base0,codonpos,seqid0,1); if (temp98 && temp99) { // only if not stop <-> non-stop expmuts += calc_prob_col(weight, nuc, sequences, base0, codonpos, seqid0, model, 1, aa2aa, Vfix); } } } // Reassign ettt[] and enmuts[]. if (fitmuts > expmuts) { ettt[0] = ttt; enmuts[0] = expmuts; } else { ettt[1] = ttt; enmuts[1] = expmuts; } } // exit while loop when converged tttx[model][np] = ettt[0]; } // close if/else for fitmuts outside/within allowed ttt range } // close if/else for fitmuts ==/!= 0 } // close loop over model = 0/1 // Calculate nucleotide matrices for best ttt values. test = calc_nuc(nuc0, tttx[0][np], nucB1, nucB2, nucD); test = calc_nuc(nuc1, tttx[1][np], nucB1, nucB2, nucD); // Calculate likelihoods for best ttt values. emtx[0][np] = emtx[1][np] = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][seqid0[0]]; // base in degapped seq1 base0[1] = lookup[base][seqid0[1]]; // base in degapped seq2 base0[2] = lookup[base][0]; // base in degapped refseq if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { temp98 = test_stop(weight, sequences, base0, codonpos, seqid0, 0); temp99 = test_stop(weight, sequences, base0, codonpos, seqid0, 1); if (temp98 && temp99) { // only if not stop <-> non-stop temp96 = calc_prob_col(weight, nuc0, sequences, base0, codonpos, seqid0, 0, 0, aa2aa, Vfix); temp97 = calc_prob_col(weight, nuc1, sequences, base0, codonpos, seqid0, 1, 0, aa2aa, Vfix); emtx[0][np] += temp96; emtx[1][np] += temp97; logfile1 << seqid0[0]+1 << " " << seqid0[1]+1 << " " << base+1 << " " << temp96 << " " << temp97 << "\n"; if (0 != ntotal[np]) { treesum[base][0] += float(nmuts[np])/float(ntotal[np]); } else { treesum[base][0] += 0.; } treesum[base][1] += temp96; treesum[base][2] += temp97; ntree[base] += 1; if (temp96 < logepsilon) { logfile4 << seqid0[0]+1 << " " << seqid0[1]+1 << " " << base+1 << " 0 " << temp96 << "\n"; } if (temp97 < logepsilon) { logfile4 << seqid0[0]+1 << " " << seqid0[1]+1 << " " << base+1 << " 1 " << temp97 << "\n"; } } } } if (0 != nuccount[np]) { emtx[0][np] /= nuccount[np]; emtx[1][np] /= nuccount[np]; } else { emtx[0][np] = 0.; emtx[1][np] = 0.; } // Calculate N_identical, N_synonomous, N_nonsynonomous N0synnon[0][np] = 0; // N_identical N0synnon[1][np] = 0; // N_synonomous N0synnon[2][np] = 0; // N_nonsynonomous for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][seqid0[0]]; // base in degapped seq1 base0[1] = lookup[base][seqid0[1]]; // base in degapped seq2 base0[2] = lookup[base][0]; // base in degapped refseq if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { if (0 != codonpos[base0[2]][0]) { // coding in primary frame for (k = 0; k < 2; ++k) { if (1 == codonpos[base0[2]][0]) { j1[k] = sequences[base0[k]][seqid0[k]]; j2[k] = sequences[base0[k]+1][seqid0[k]]; j3[k] = sequences[base0[k]+2][seqid0[k]]; } else if (2 == codonpos[base0[2]][0]) { j1[k] = sequences[base0[k]-1][seqid0[k]]; j2[k] = sequences[base0[k]][seqid0[k]]; j3[k] = sequences[base0[k]+1][seqid0[k]]; } else if (3 == codonpos[base0[2]][0]) { j1[k] = sequences[base0[k]-2][seqid0[k]]; j2[k] = sequences[base0[k]-1][seqid0[k]]; j3[k] = sequences[base0[k]][seqid0[k]]; } else if (6 == codonpos[base0[2]][0]) { j3[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]+2][seqid0[k]] + 2) % 4; } else if (5 == codonpos[base0[2]][0]) { j3[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; } else if (4 == codonpos[base0[2]][0]) { j3[k] = (sequences[base0[k]-2][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; } else { cerr << "Can't get here! (004)\n"; exit(EXIT_FAILURE); } jcid[k] = 16 * j1[k] + 4 * j2[k] + j3[k]; } if (jcid[0] == jcid[1]) { N0synnon[0][np] += 1; } else if (aa2aa[jcid[0]] == aa2aa[jcid[1]]) { N0synnon[1][np] += 1; } else { N0synnon[2][np] += 1; } } } } N0NsNnsum = N0synnon[0][np]+N0synnon[1][np]+N0synnon[2][np]; for (k = 0; k < 3; ++k) { if (0 != N0NsNnsum) { N0NsNn[k][np] = float(N0synnon[k][np]) / float(N0NsNnsum); } else { N0NsNn[k][np] = 0.; } } } // finish loop over sequences pairs logfile1.close(); logfile2.close(); logfile3.close(); logfile4.close(); pairsfile.close(); //--------------------------------------------------------------------------- // Write statistics to file. ofstream outputfile("mlogd_out.dat"); if (!outputfile) { cerr << "Failed to open output file 'mlogd_out.dat'.\n"; exit(EXIT_FAILURE); } outputfile << setprecision(3); for (np = 0; np < npairs; ++np) { outputfile << setw(6) << tttx[0][np] << " " << setw(6) << tttx[1][np] << " " << setw(8) << emtx[0][np] << " " << setw(8) << emtx[1][np] << " " << setw(6) << nuccount[np] << " " << setw(5) << stopcount[np][0] << " " << setw(5) << stopcount[np][1] << " " << setw(6) << gapcount[np] << " " << setw(9) << N0NsNn[0][np] << " " << setw(9) << N0NsNn[1][np] << " " << setw(9) << N0NsNn[2][np] << " " << setw(6) << ntotal[np] << " " << setw(6) << nmuts[np] << " " << setw(6) << nmuts2[np] << " " << setw(9) << N123r[0][np] << " " << setw(9) << N123r[1][np] << " " << setw(9) << N123r[2][np] << " " << setw(4) << flag[0][np] << " " << setw(4) << flag[1][np] << "\n"; } outputfile.close(); ofstream treesumfile("mlogd_out.treesum"); if (!treesumfile) { cerr << "Failed to open output file 'mlogd_out.treesum'.\n"; exit(EXIT_FAILURE); } for (base = 0; base < nseq0; ++base) { treesumfile << base+1 << " " << ntree[base] << " " << treesum[base][0] << " " << treesum[base][1] << " " << treesum[base][2] << " "; if (-1 == lookup[base][0]) { treesumfile << "-\n"; } else { if (0 == sequences[lookup[base][0]][0]) { treesumfile << "U\n"; } else if (1 == sequences[lookup[base][0]][0]) { treesumfile << "C\n"; } else if (2 == sequences[lookup[base][0]][0]) { treesumfile << "A\n"; } else if (3 == sequences[lookup[base][0]][0]) { treesumfile << "G\n"; } else if (9 == sequences[lookup[base][0]][0]) { treesumfile << "-\n"; } else if (5 == sequences[lookup[base][0]][0]) { treesumfile << "N\n"; } else { cerr << "Can't get here! (006)\n"; exit(EXIT_FAILURE); } } } treesumfile.close(); //--------------------------------------------------------------------------- // Write out some stats. ofstream infofile("mlogd_out.info"); if (!infofile) { cerr << "Failed to open log file 'mlogd_out.info'.\n"; exit(EXIT_FAILURE); } infofile << nseq0 << " " << base1+1 << " " << base2+1 << "\n"; infofile << lambdasum << "\n"; // Find frame from codonpos[][], within range. for (j = 0; j < 7; ++j) { for (k = 0; k < 7; ++k) { pos2[j][k] = 0; } } // for (i = range1; i <= range2; ++i) { // pos2[codonpos[i][0]][codonpos[i][1]] += 1; // } for (base = base1; base <= base2; ++base) { base0[2] = lookup[base][0]; // base in degapped refseq if (-1 != base0[2]) { // not gapped pos2[codonpos[base0[2]][0]][codonpos[base0[2]][1]] += 1; } } for (j = 0; j < 7; ++j) { for (k = 0; k < 7; ++k) { if (pos2[j][k] > 0) { infofile << j << " " << k << " " << pos2[j][k] << "\n"; } } } infofile.close(); if (verbose) { cout << "\n\nFinished.\n"; } return(0); } //----------------------------------------------------------------------------- // expnmuts = 0: returns log(prob) for a given nucleotide mutation in a given // sequence pair. // expnmuts = 1: returns expected number of mutations (in the range 0-1) for a // given nucleotide site in a given sequence pair at a given divergence ttt. double calc_prob_col(double weight[][64], double nuc[][4], int sequences[][maxnseqs], int base0[3], int codonpos[][2], int seqid0[2], int model, int expnmuts, int aa2aa[64], double Vfix) { int jn[2], j1[2], j2[2], j3[2], jcid[2], k, kn, k1, k2, k3, kcid; int c1[2], c2[2], c3[2], ccid[2]; double sum, prob_col[4]; // base0[0,1,2] are locations of the nucleotide in // sequences[][seqid1,seqid2,0]; codonpos[][0,1] correspond to the refseq // and should be referenced with base0[2]. Assumed that in general this // will be the correct codonpos for the non-refseqs also. Gaps have been // stripped out in codonpos[][] and sequences[][]. jn[0] = sequences[base0[0]][seqid0[0]]; // nucleotide in seq1 jn[1] = sequences[base0[1]][seqid0[1]]; // nucleotide in seq2 if ((0 == codonpos[base0[2]][0] && 0 == model) || (0 == codonpos[base0[2]][0] && 0 == codonpos[base0[2]][1] && 1 == model)) { // use non-coding model sum = 0.; for (kn = 0; kn < 4; ++kn) { prob_col[kn] = nuc[jn[0]][kn]; sum += prob_col[kn]; } } else { // single or double coding model if (0 != codonpos[base0[2]][0]) { // coding in pos.1 // Find primary frame codon containing the nucleotide in seq1 and seq2 for (k = 0; k < 2; ++k) { if (1 == codonpos[base0[2]][0]) { j1[k] = sequences[base0[k]][seqid0[k]]; j2[k] = sequences[base0[k]+1][seqid0[k]]; j3[k] = sequences[base0[k]+2][seqid0[k]]; } else if (2 == codonpos[base0[2]][0]) { j1[k] = sequences[base0[k]-1][seqid0[k]]; j2[k] = sequences[base0[k]][seqid0[k]]; j3[k] = sequences[base0[k]+1][seqid0[k]]; } else if (3 == codonpos[base0[2]][0]) { j1[k] = sequences[base0[k]-2][seqid0[k]]; j2[k] = sequences[base0[k]-1][seqid0[k]]; j3[k] = sequences[base0[k]][seqid0[k]]; } else if (6 == codonpos[base0[2]][0]) { j3[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]+2][seqid0[k]] + 2) % 4; } else if (5 == codonpos[base0[2]][0]) { j3[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; } else if (4 == codonpos[base0[2]][0]) { j3[k] = (sequences[base0[k]-2][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; } else { cerr << "Can't get here! (006)\n"; exit(EXIT_FAILURE); } jcid[k] = 16 * j1[k] + 4 * j2[k] + j3[k]; } } if (0 != codonpos[base0[2]][1] && 1 == model) { // coding in pos.2 // Find secondary frame codon containing the nucleotide in seq1 and seq2 for (k = 0; k < 2; ++k) { if (1 == codonpos[base0[2]][1]) { c1[k] = sequences[base0[k]][seqid0[k]]; c2[k] = sequences[base0[k]+1][seqid0[k]]; c3[k] = sequences[base0[k]+2][seqid0[k]]; } else if (2 == codonpos[base0[2]][1]) { c1[k] = sequences[base0[k]-1][seqid0[k]]; c2[k] = sequences[base0[k]][seqid0[k]]; c3[k] = sequences[base0[k]+1][seqid0[k]]; } else if (3 == codonpos[base0[2]][1]) { c1[k] = sequences[base0[k]-2][seqid0[k]]; c2[k] = sequences[base0[k]-1][seqid0[k]]; c3[k] = sequences[base0[k]][seqid0[k]]; } else if (6 == codonpos[base0[2]][1]) { c3[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; c2[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; c1[k] = (sequences[base0[k]+2][seqid0[k]] + 2) % 4; } else if (5 == codonpos[base0[2]][1]) { c3[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; c2[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; c1[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; } else if (4 == codonpos[base0[2]][1]) { c3[k] = (sequences[base0[k]-2][seqid0[k]] + 2) % 4; c2[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; c1[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; } else { cerr << "Can't get here! (008)\n"; exit(EXIT_FAILURE); } ccid[k] = 16 * c1[k] + 4 * c2[k] + c3[k]; } } sum = 0.; for (kn = 0; kn < 4; ++kn) { // possible point mutations at that position // Weight for nucleotide mutation. prob_col[kn] = nuc[jn[0]][kn]; if (0 != codonpos[base0[2]][0]) { // coding in pos.1 k1 = j1[1]; k2 = j2[1]; k3 = j3[1]; // codon in seq2 if (1 == codonpos[base0[2]][0]) { // new codon in seq2 k1 = kn; } else if (2 == codonpos[base0[2]][0]) { k2 = kn; } else if (3 == codonpos[base0[2]][0]) { k3 = kn; } else if (6 == codonpos[base0[2]][0]) { k3 = (kn + 2) % 4; } else if (5 == codonpos[base0[2]][0]) { k2 = (kn + 2) % 4; } else if (4 == codonpos[base0[2]][0]) { k1 = (kn + 2) % 4; } else { cerr << "Can't get here! (009)\n"; exit(EXIT_FAILURE); } kcid = 16 * k1 + 4 * k2 + k3; // Amino acid acceptability (new nucleotide kn in seq2 context, // compared with old nucleotide in seq1 context). prob_col[kn] *= weight[jcid[0]][kcid]; if (aa2aa[kcid] != aa2aa[jcid[0]]) { prob_col[kn] *= Vfix; // nonsynonymous:synonymous scaling } } if (0 != codonpos[base0[2]][1] && 1 == model) { // coding in pos.2 k1 = c1[1]; k2 = c2[1]; k3 = c3[1]; // codon in seq2 if (1 == codonpos[base0[2]][1]) { // new codon in seq2 k1 = kn; } else if (2 == codonpos[base0[2]][1]) { k2 = kn; } else if (3 == codonpos[base0[2]][1]) { k3 = kn; } else if (6 == codonpos[base0[2]][1]) { k3 = (kn + 2) % 4; } else if (5 == codonpos[base0[2]][1]) { k2 = (kn + 2) % 4; } else if (4 == codonpos[base0[2]][1]) { k1 = (kn + 2) % 4; } else { cerr << "Can't get here! (010)\n"; exit(EXIT_FAILURE); } kcid = 16 * k1 + 4 * k2 + k3; // Amino acid acceptability (new nucleotide kn in seq2 context, // compared with old nucleotide in seq1 context). prob_col[kn] *= weight[ccid[0]][kcid]; if (aa2aa[kcid] != aa2aa[ccid[0]]) { prob_col[kn] *= Vfix; // nonsynonymous:synonymous scaling } } sum += prob_col[kn]; } } if (0. == sum) { // This can occassionally occur, e.g. seq1 codon CAA, seq2 codon UAC, // mutating N3; for ttt ~ 0, prob_col[U] = prob_col[C] = prob_col[G] = 0, // and prob_col[A] = 0 since UAG is a stop codon. Once ttt has converged // on the best fit ttt, this probelm shouldn't occur. if (expnmuts) { // return exp # muts at this site for pairwise distance return(0); } else { // return log(prob) for nuc mutation from seq1 to seq2 return(-100); } } // Normalize probs so that sum = 1. for (kn = 0; kn < 4; ++kn) { prob_col[kn] /= sum; } if (prob_col[0] > 1.01 || prob_col[0] < -0.01|| prob_col[1] > 1.01 || prob_col[1] < -0.01|| prob_col[2] > 1.01 || prob_col[2] < -0.01|| prob_col[3] > 1.01 || prob_col[3] < -0.01) { cerr << "Error: invalid probability value; possible rounding error:\n " << prob_col[0] << " " << prob_col[1] << " " << prob_col[2] << " " << prob_col[3] << " " << sum << "\n"; } if (expnmuts) { // return exp # muts at this site for pairwise distance return(1. - prob_col[jn[0]]); } else { // return log(prob) for nuc mutation from seq1 to seq2 if (prob_col[jn[1]] > 0) { return(log(prob_col[jn[1]])); } else { cerr << "Error: unexpected zero-probability mutation (e.g. stop codon " << "to non-stop). (002)\n"; exit(EXIT_FAILURE); } } } //----------------------------------------------------------------------------- int readsequence(char seq[128], int gsequences[][maxnseqs], int seqid) { int i, k, nlines, nseq; char seqline[1000], inputseq[maxlength+10]; ifstream sequence(seq); if (!sequence) { cerr << "Aborting: can't find file '" << seq << "'.\n"; exit(EXIT_FAILURE); } nlines = -1; while (sequence.ignore(1000, '\n')) { ++nlines; } sequence.clear(); sequence.seekg(0); sequence.ignore(1000, '\n'); for (i = 0; i < nlines; ++i) { sequence.getline(seqline, 1000, '\n'); if (0 == i) { strcpy(inputseq, seqline); } else { strcat(inputseq, seqline); } } sequence.close(); for (i = 0; ; ++i) { if (inputseq[i] == 'U' || inputseq[i] == 'u') { gsequences[i][seqid] = 0; } else if (inputseq[i] == 'T' || inputseq[i] == 't') { gsequences[i][seqid] = 0; } else if (inputseq[i] == 'C' || inputseq[i] == 'c') { gsequences[i][seqid] = 1; } else if (inputseq[i] == 'A' || inputseq[i] == 'a') { gsequences[i][seqid] = 2; } else if (inputseq[i] == 'G' || inputseq[i] == 'g') { gsequences[i][seqid] = 3; } else if (inputseq[i] == '-' || inputseq[i] == '.') { gsequences[i][seqid] = 9; } else if (inputseq[i] == '\0') { nseq = i; break; } else { if (inputseq[i] == 'M' || inputseq[i] == 'm' || inputseq[i] == 'R' || inputseq[i] == 'r' || inputseq[i] == 'W' || inputseq[i] == 'w' || inputseq[i] == 'V' || inputseq[i] == 'v' || inputseq[i] == 'H' || inputseq[i] == 'h' || inputseq[i] == 'D' || inputseq[i] == 'd' || inputseq[i] == 'B' || inputseq[i] == 'b' || inputseq[i] == 'S' || inputseq[i] == 's' || inputseq[i] == 'Y' || inputseq[i] == 'y' || inputseq[i] == 'K' || inputseq[i] == 'k' || inputseq[i] == 'N' || inputseq[i] == 'n' || inputseq[i] == 'X' || inputseq[i] == 'x') { gsequences[i][seqid] = 5; // ambiguous nt cerr << "Warning: sequence " << seq << ", ambiguous nt '" << inputseq[i] << "', at " << i+1 << ".\n"; } else { cerr << "Aborting: sequence " << seq << ", unknown nucleotide '" << inputseq[i] << "', at " << i+1 << ".\n"; exit(EXIT_FAILURE); } } } return(nseq); } //----------------------------------------------------------------------------- // Note that if there are overlapping ORFs in the same frame (e.g. alternative // start points) then only one is counted at any nt. // Note that if there are overlapping ORFs in different frames, then only the // first one is counted in the overlap regions. // Note that the alternate model can repeat the null model CDS, in which case // you're comparing a CDS versus two copies of the same CDS. int readorfs(char cpos[128], int codonpos[][2], int length, int circular, int posid) { int i, j, k, orf1, orf2, nseg, count, norfs, test; test = 1; // for warning message for overlapping ORFs for (i = 0; i < length; ++i) { codonpos[i][posid] = 0; } // Work through ORF list. ifstream orffile(cpos); if (!orffile) { cerr << "Aborting: can't find file '" << cpos << "'.\n"; exit(EXIT_FAILURE); } norfs = -1; while (orffile.ignore(1000, '\n')) { ++norfs; } orffile.clear(); orffile.seekg(0); for (i = 0; i < norfs; ++i) { count = 0; orffile >> nseg; nseg /= 2; for (k = 0; k < nseg; ++k) { orffile >> orf1; orffile >> orf2; if (orf1<1||orf2<1||orf1>length||orf2>length) { cerr << "Aborting: ORF " << i+1 << " in " << cpos << ", endpoints outside sequence range.\n"; exit(EXIT_FAILURE); } // Since arrays start at 0 rather than 1. orf1--; orf2--; // Enter 1,2,3 (forward) or 4,5,6 (reverse) into codonpos[][] if (orf1 < orf2) { // forward frame for (j = orf1; j <= orf2; ++j) { // blank codons overlapping joins except for circular genomes if (!((j==orf2&&count+1!=3)||(j==orf2-1&&count+1==1)|| (j==orf1&&count+1!=1)||(j==orf1+1&&count+1==3))|| (circular&&(j==length-1||j==length-2||j==0||j==1))){ if (0 == codonpos[j][posid]) { codonpos[j][posid] = count+1; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping ORFs. " << "Only the first will be used in overlap regions.\n"; test = 0; } } } count = (count+1)%3; } } else { // reverse frame for (j = orf1; j >= orf2; --j) { // blank codons overlapping joins except for circular genomes if (!((j==orf2&&count+1!=3)||(j==orf2+1&&count+1==1)|| (j==orf1&&count+1!=1)||(j==orf1-1&&count+1==3))|| (circular&&(j==length-1||j==length-2||j==0||j==1))){ if (0 == codonpos[j][posid]) { codonpos[j][posid] = count+4; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping ORFs. " << "Only the first will be used in overlap regions.\n"; test = 0; } } } count = (count+1)%3; } } } if (0 != count%3) { cerr << "Warning: number of nucleotides in ORF " << i+1 << " in " << cpos << " not a multiple of 3.\n"; } orffile.ignore(1000, '\n'); } orffile.close(); return(norfs); } //----------------------------------------------------------------------------- // Check input aligned sequences for gaps occurring not in multiples of 3 // within coding regions (in null model if pos = 0 and in alternative model // if pos = 1). Output a warning if any problem. int checkgaps(int lookup[][maxnseqs], int codonpos[][2], int nseq0, int seqid, int pos) { int i, cdstest, gaptest, count; cdstest = 0; // 0 -> in a non-coding region; 1 -> in a coding region gaptest = 0; // 0 -> in a non-gapped region; 1 -> in a gapped region for (i = 0; i < nseq0; ++i) { if (!cdstest) { // non-coding region if (-1 != lookup[i][seqid]) { // not a gap in seq seqid if (0 != codonpos[lookup[i][0]][pos]) { cdstest = 1; // switch to coding region } } } else { // coding-region if (gaptest) { // gapped region if (-1 == lookup[i][seqid]) { // a gap in seq seqid count += 1; } else if (0 == codonpos[lookup[i][0]][pos]) { cdstest = 0; // switch to non-coding region gaptest = 0; // switch to non-gapped region } else { gaptest = 0; // switch to non-gapped region if (0 != count % 3) { if (0 == pos) { cerr << "Warning: gap within null model coding region not" << " a multiple of three nt;\n sequence " << seqid+1 << ", sequence coords " << lookup[i][seqid] << ", alignment coords " << i << ".\n"; } else { cerr << "Warning: gap within alternative model coding region not" << " a multiple of three nt;\n sequence " << seqid+1 << ", sequence coords " << lookup[i][seqid] << ", alignment coords " << i << ".\n"; } } } } else { // not a gapped region if (-1 == lookup[i][seqid]) { gaptest = 1; // switch to gapped region count = 1; } else if (0 == codonpos[lookup[i][0]][pos]) { cdstest = 0; // switch to non-coding region } } } } return(0); } //----------------------------------------------------------------------------- // Tests for stop <-> non-stop transitions between a sequence pair, in // null (pos = 0) or alternative (pos = 1) models. Actually checks for any // transitions which have zero probability in the amino acid x CUT // substitution matrix. These are skipped over when calculating MLOGD (sice // log(0) is undefined) and recorded in the stoplog file. int test_stop(double weight[][64], int sequences[][maxnseqs], int base0[3], int codonpos[][2], int seqid0[2], int pos) { int k, j1[2], j2[2], j3[2], jcid[2], k1, k2, k3; if (0 == codonpos[base0[2]][pos]) { return(2); // allowed transition, non-coding } for (k = 0; k < 2; ++k) { if (1 == codonpos[base0[2]][pos] || 6 == codonpos[base0[2]][pos]) { k1 = sequences[base0[k]][seqid0[k]]; k2 = sequences[base0[k]+1][seqid0[k]]; k3 = sequences[base0[k]+2][seqid0[k]]; } else if (2 == codonpos[base0[2]][pos] || 5 == codonpos[base0[2]][pos]) { k1 = sequences[base0[k]-1][seqid0[k]]; k2 = sequences[base0[k]][seqid0[k]]; k3 = sequences[base0[k]+1][seqid0[k]]; } else if (3 == codonpos[base0[2]][pos] || 4 == codonpos[base0[2]][pos]) { k1 = sequences[base0[k]-2][seqid0[k]]; k2 = sequences[base0[k]-1][seqid0[k]]; k3 = sequences[base0[k]][seqid0[k]]; } if (5 == k1 || 5 == k2 || 5 == k3) { return(0); // codon contains an ambiguous nt code } if (1 == codonpos[base0[2]][pos]) { j1[k] = sequences[base0[k]][seqid0[k]]; j2[k] = sequences[base0[k]+1][seqid0[k]]; j3[k] = sequences[base0[k]+2][seqid0[k]]; } else if (2 == codonpos[base0[2]][pos]) { j1[k] = sequences[base0[k]-1][seqid0[k]]; j2[k] = sequences[base0[k]][seqid0[k]]; j3[k] = sequences[base0[k]+1][seqid0[k]]; } else if (3 == codonpos[base0[2]][pos]) { j1[k] = sequences[base0[k]-2][seqid0[k]]; j2[k] = sequences[base0[k]-1][seqid0[k]]; j3[k] = sequences[base0[k]][seqid0[k]]; } else if (6 == codonpos[base0[2]][pos]) { j3[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]+2][seqid0[k]] + 2) % 4; } else if (5 == codonpos[base0[2]][pos]) { j3[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]+1][seqid0[k]] + 2) % 4; } else if (4 == codonpos[base0[2]][pos]) { j3[k] = (sequences[base0[k]-2][seqid0[k]] + 2) % 4; j2[k] = (sequences[base0[k]-1][seqid0[k]] + 2) % 4; j1[k] = (sequences[base0[k]][seqid0[k]] + 2) % 4; } else { cerr << "Can't get here! (012)\n"; exit(EXIT_FAILURE); } jcid[k] = 16 * j1[k] + 4 * j2[k] + j3[k]; } if (0 == weight[jcid[0]][jcid[1]]) { return(0); // zero probability transition } else { return(1); // allowed transition } } //----------------------------------------------------------------------------- // Makes 4x4 nucleotide P matrix for a given t. int calc_nuc(double nuc[][4], double ttt, double nucB1[][4], double nucB2[][4], double nucD[][4]) { int ni, nj, nk; double tempA[4][4]; for (ni = 0; ni < 4; ++ni) { for (nj = 0; nj < 4; ++nj) { tempA[ni][nj] = nucB1[ni][nj]*exp(ttt*nucD[nj][nj]); } } for (ni = 0; ni < 4; ++ni) { for (nj = 0; nj < 4; ++nj) { nuc[ni][nj] = 0.; for (nk = 0; nk < 4; ++nk) { nuc[ni][nj] += tempA[ni][nk]*nucB2[nk][nj]; } if (nuc[ni][nj] < 0.) { nuc[ni][nj] = 0.; // in case of rounding errors } } } return(0); } //----------------------------------------------------------------------------- // Tests for stop codons in a sequence, in null (pos = 0) or alternative // (pos = 1) models. base0[0] is the refseq base (to find codon position), // base0[1] is the sequence base. // Returns // 0 -> stop codon // 1 -> not a stop or start // 2 -> non-coding (therefore not a stop) // 3 -> codon contains an ambiguous nt code // 4 -> start codon int stopcodon(int aa2aa[64], int sequences[][maxnseqs], int base0[2], int codonpos[][2], int seqid, int pos) { int j1, j2, j3, jcid, k1, k2, k3; if (0 == codonpos[base0[0]][pos]) { return(2); // non-coding (therefore not a stop) } if (1 == codonpos[base0[0]][pos] || 6 == codonpos[base0[0]][pos]) { k1 = sequences[base0[1]][seqid]; k2 = sequences[base0[1]+1][seqid]; k3 = sequences[base0[1]+2][seqid]; } else if (2 == codonpos[base0[0]][pos] || 5 == codonpos[base0[0]][pos]) { k1 = sequences[base0[1]-1][seqid]; k2 = sequences[base0[1]][seqid]; k3 = sequences[base0[1]+1][seqid]; } else if (3 == codonpos[base0[0]][pos] || 4 == codonpos[base0[0]][pos]) { k1 = sequences[base0[1]-2][seqid]; k2 = sequences[base0[1]-1][seqid]; k3 = sequences[base0[1]][seqid]; } if (5 == k1 || 5 == k2 || 5 == k3) { return(3); // codon contains an ambiguous nt code } if (1 == codonpos[base0[0]][pos]) { j1 = sequences[base0[1]][seqid]; j2 = sequences[base0[1]+1][seqid]; j3 = sequences[base0[1]+2][seqid]; } else if (2 == codonpos[base0[0]][pos]) { j1 = sequences[base0[1]-1][seqid]; j2 = sequences[base0[1]][seqid]; j3 = sequences[base0[1]+1][seqid]; } else if (3 == codonpos[base0[0]][pos]) { j1 = sequences[base0[1]-2][seqid]; j2 = sequences[base0[1]-1][seqid]; j3 = sequences[base0[1]][seqid]; } else if (6 == codonpos[base0[0]][pos]) { j3 = (sequences[base0[1]][seqid] + 2) % 4; j2 = (sequences[base0[1]+1][seqid] + 2) % 4; j1 = (sequences[base0[1]+2][seqid] + 2) % 4; } else if (5 == codonpos[base0[0]][pos]) { j3 = (sequences[base0[1]-1][seqid] + 2) % 4; j2 = (sequences[base0[1]][seqid] + 2) % 4; j1 = (sequences[base0[1]+1][seqid] + 2) % 4; } else if (4 == codonpos[base0[0]][pos]) { j3 = (sequences[base0[1]-2][seqid] + 2) % 4; j2 = (sequences[base0[1]-1][seqid] + 2) % 4; j1 = (sequences[base0[1]][seqid] + 2) % 4; } else { cerr << "Can't get here! (012)\n"; exit(EXIT_FAILURE); } jcid = 16 * j1 + 4 * j2 + j3; if (99 == aa2aa[jcid]) { return(0); // stop codon } else if (35 == jcid) { return(4); // start codon } else { return(1); // not a stop } }