// This software is Copyright (C) 2005 by Andrew Firth, University of Otago, // Dunedin, New Zealand. All rights reserved. #include #include #include #include #include #include using namespace std; #define maxlength 50000 #define maxpairs 200 // Function declarations double calc_prob_col(double weight[][64], double nuc[][4], int sequences[][2], int base0[3], int codonpos[][3], int consv, int model, int aa2aa[64], double Vx); int readsequence(char seq[128], int gsequences[][2], int seqid, int skip); int readorfs(char cpos[128], int codonpos[][3], int length, int circular); int mut_type(double weight[][64], int sequences[][2], int base0[3], int codonpos[][3], int aa2aa[64], int model); int fourfold(int sequences[][2], int base0[3], int codonpos[][3], int aa2aa[64], int model); int calc_nuc(double nuc[][4], double ttt, double nucB1[][4], double nucB2[][4], double nucD[][4]); int checkgaps(int lookup[][3], int codonpos[][3], int nseq0, int np); int main(int argc, char* argv[]) { if (2 != argc) { cerr << "Aborting: need setup file; use '" << argv[0] << " setupfile'.\n"; exit(EXIT_FAILURE); } //--------------------------------------------------------------------------- // Variable declarations. int i, j, k, model, ts, wholeseq, range1, range2, base1, base2; int test, lines, nseq, nseq0, base, degnseq[3], maxiter, fitwhat, nmutsum; int base0[3], npairs, np, codonids[maxlength][5], refpos, circular, norfs; int aa2aa[64], codonpos[maxlength][3], lookup[maxlength][3], niter, temp95; int sequences[maxlength][2], gsequences[maxlength][2], obsmuts, ff, Vtype; int nuccount[maxpairs], nmuts[maxpairs], neutral[maxpairs], flag2[maxpairs]; int stopcount[maxpairs], flag[maxpairs], gapcount[maxpairs], skip; int degen4m[maxpairs], degen4s[maxpairs], Bcodonpos[maxlength][3][maxpairs]; int Blookup[maxlength][3][maxpairs], Bsequences[maxlength][2][maxpairs]; int reflookup[maxlength], refdegnseq, refcodonpos[maxlength][3]; int refsequence[maxlength]; double ttt, tttmin, tttmax, temp98, emt, ettt[2], enmuts[2], expmuts; double score[20][20], weight[64][64], plotdata[maxlength][10]; double nucA[4][4], nucB1[4][4], nucB2[4][4], nucD[4][4], nuc[4][4], temp97; double tempA[4][4], testA[4][4], codon[64], temp99, tVfit, Vx[maxpairs]; double lambdasum, lambda4sum, Vmin, Vmax, eV[2], V, Vfix, tttx[maxpairs]; char temp, temp2[128], cpos[128]; char matrix[128], aa[20], aa2codon[64], seqfile[128], seq[128], id[128]; int verbose = 0; if (verbose) { cout << "\n"; } for (i = 0; i < maxlength; ++i) { plotdata[i][0] = plotdata[i][1] = plotdata[i][2] = plotdata[i][3] = plotdata[i][4] = plotdata[i][5] = plotdata[i][6] = plotdata[i][7] = plotdata[i][8] = plotdata[i][9] = 0.; codonids[i][0] = codonids[i][1] = codonids[i][2] = codonids[i][3] = codonids[i][4] = 0; sequences[i][0] = sequences[i][1] = refsequence[i] = 0; codonpos[i][0] = codonpos[i][1] = codonpos[i][2] = 0; } nmutsum = 0; lambdasum = lambda4sum = 0.; //--------------------------------------------------------------------------- // Setup parameters. // Open setup file. ifstream setupfile(argv[1]); if (!setupfile) { cerr << "Aborting: can't find setup file '" << argv[1] << "'.\n"; exit(EXIT_FAILURE); } // Read setup parameters. setupfile.getline(seqfile, 128, ' '); setupfile.ignore(1000, '\n'); setupfile >> tttmin; setupfile >> tttmax; setupfile >> tVfit; setupfile >> maxiter; setupfile.ignore(1000, '\n'); setupfile >> fitwhat; setupfile.ignore(1000, '\n'); setupfile >> model; setupfile.ignore(1000, '\n'); setupfile >> refpos; setupfile.ignore(1000, '\n'); setupfile >> circular; setupfile.ignore(1000, '\n'); setupfile >> wholeseq; if (!wholeseq) { setupfile >> range1; setupfile >> range2; } setupfile.ignore(1000, '\n'); setupfile >> Vfix; if (3 == fitwhat) { setupfile >> Vtype; setupfile >> Vmin; setupfile >> Vmax; } setupfile.ignore(1000, '\n'); setupfile >> skip; setupfile.ignore(1000, '\n'); setupfile.close(); //--------------------------------------------------------------------------- // Amino acid, nucleotide and codon weight matrices. // Number of lines in 20 x 20 amino acid weight matrix. ifstream matrixfile("aa.dat"); if (!matrixfile) { cerr << "Aborting: can't find file 'aa.dat'.\n"; exit(EXIT_FAILURE); } lines = 0; while (matrixfile.ignore(1000, '\n')) { ++lines; } matrixfile.clear(); matrixfile.seekg(0); // Read amino acid weight matrix. temp = '#'; while ('#' == temp) { matrixfile.get(temp); if ('#' == temp) { matrixfile.ignore(1000, '\n'); } } for (j = 0; j < 20; ++j) { matrixfile >> aa[j]; } matrixfile.ignore(1000, '\n'); for (i = 0; i < 20; ++i) { for (j = 0; j <= i; ++j) { matrixfile >> score[i][j]; } matrixfile.ignore(1000, '\n'); } matrixfile.close(); // Fill in other half of weight matrix. for (i = 0; i < 20; ++i) { for (j = i + 1; j < 20; ++j) { score[i][j] = score[j][i]; } } // Check no values greater than corresponding values on the diagonal. for (i = 0; i < 20; ++i) { for (k = 0; k < 20; ++k) { if (score[i][k] > score[i][i] || score[i][k] > score[k][k]) { cerr << "Aborting: scoring matrix value at " << i+1 << ", " << k+1 << " is greater than value on diagonal.\n"; exit(EXIT_FAILURE); } } } // Check no negative values. for (i = 0; i < 20; ++i) { for (k = 0; k < 20; ++k) { if (score[i][k] < 0.) { cerr << "Aborting: scoring matrix value at " << i+1 << ", " << k+1 << " is negative.\n"; exit(EXIT_FAILURE); } } } // Read codon to amino acid table. ifstream aa2codonfile("aa2codon.dat"); if (!aa2codonfile) { cerr << "Aborting: can't find file 'aa2codon.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 64; ++i) { aa2codonfile.get(aa2codon[i]); aa2codonfile.ignore(1000, '\n'); } aa2codonfile.close(); // Lookup table connecting aa2codon[64] and aa[20] (aa[20] is the // order in which the amino acids appear in the weight matrix). for (i = 0; i < 64; ++i) { aa2aa[i] = 99; // for STOP codons for (j = 0; j < 20; ++j) { if (aa2codon[i] == aa[j]) { aa2aa[i] = j; } } } // Paste weights into 64 x 64 matrix; set stop codons = 1, 0. Weight // is for row i -> column j. for (i = 0; i < 64; ++i) { for (j = 0; j < 64; ++j) { if (aa2aa[i] == 99 && aa2aa[j] == 99) { weight[i][j] = 1.; } else if (aa2aa[i] == 99) { weight[i][j] = 0.; } else if (aa2aa[j] == 99) { weight[i][j] = 0.; } else { weight[i][j] = score[aa2aa[i]][aa2aa[j]]; } } } // Read 4 x 4 diagonalized nucleotide mutation matrix. Order is UCAG. // row i -> column k. Four matrices in input file: A, B, D, B^{-1}, where // A = B.D.B^{-1} and D is diagonal. A itself isn't actually used. ifstream nucfile("nuc.dat"); if (!nucfile) { cerr << "Aborting: can't find file 'nuc.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucA[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucB1[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucD[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nucB2[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.ignore(1000, '\n'); nucfile.close(); // Check B.D.B{^-1}. for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) { tempA[i][j] = nucB1[i][j]*nucD[j][j]; } } for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) { testA[i][j] = 0.; for (k = 0; k < 4; ++k) { testA[i][j] += tempA[i][k]*nucB2[k][j]; } } } if (verbose) { cout << "Product B D B^{-1}\n"; for (i = 0; i < 4; ++i) { cout << " "; for (j = 0; j < 4; ++j) { cout << testA[i][j] << " "; } cout << "\n"; } cout << "\n"; } for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { if (i == k) { if (testA[i][k] > 0.) { cerr << "Aborting: nucleotide matrix value at " << i+1 << ", " << k+1 << " is positive.\n"; exit(EXIT_FAILURE); } } else { if (testA[i][k] < 0.) { cerr << "Aborting: nucleotide matrix value at " << i+1 << ", " << k+1 << " is negative.\n"; exit(EXIT_FAILURE); } } } } // Read codon usage table (scaling unimportant; set all values equal if you // don't want to use codon usage weights). ifstream codonfile("codon.dat"); if (!codonfile) { cerr << "Aborting: can't find file 'codon.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 64; ++i) { codonfile.getline(temp2, 128, ' '); codonfile >> codon[i]; codonfile.ignore(1000, '\n'); } codonfile.close(); for (i = 0; i < 64; ++i) { if (codon[i] < 0.) { cerr << "Aborting: codon usage table value at " << i+1 << " is negative.\n"; exit(EXIT_FAILURE); } } // Apply codon usage weighting. for (i = 0; i < 64; ++i) { for (j = 0; j < 64; ++j) { weight[i][j] *= codon[j]; } } //--------------------------------------------------------------------------- // Miscellaneous stuff. // Fitting parameters for ttt (evolutionary time). if (tttmin < 0.) { tttmin = 0.; } if (tttmax < tttmin) { tttmax = tttmin; } if (tVfit <= 0 || tVfit > 1.) { tVfit = 1.; } if (maxiter < 1) { maxiter = 1; } if (3 == fitwhat) { if (Vmin <= 0.01) { Vmin = 0.01; } if (Vmax < Vmin) { Vmax = Vmin; } if (Vmax > 10.) { Vmax = 10.; } } // MLE/gaps logfile: // pair# base# log(prob(seq1_nt -> seq2_nt)) exp#muts exp#neutralmuts // obs#muts obs#neutralmuts // " " 9 0 0 0 0 if gap in both seqs // " " 8 0 0 0 0 if gap in seq 1 // " " 7 0 0 0 0 if gap in seq 2 // " " 6 0 0 0 0 zero-probability (from codon.dat or aa.dat) transition // e.g. stop <-> non-stop // " " 5 0 0 0 0 if gap only in reference sequence ofstream logfile("mlrgd_out.log"); if (!logfile) { cerr << "Failed to open log file 'mlrgd_out.log'.\n"; exit(EXIT_FAILURE); } logfile << fixed; logfile << showpoint; // Open output data file. ofstream outputfile("mlrgd_out.dat"); if (!outputfile) { cerr << "Failed to open output file 'mlrgd_out.dat'.\n"; exit(EXIT_FAILURE); } //--------------------------------------------------------------------------- // Input sequences. // Read reference sequence and ORF file (seq.001.aln, orfs.001) nseq0 = readsequence("seq.001.aln", gsequences, 0, skip); if (verbose) { cout << "Read 'seq.001.aln', length = " << nseq0 << ".\n"; } refdegnseq = 0; // degapped sequence length for (i = 0; i < nseq0; ++i) { if (9 != gsequences[i][0]) { // not a gap refsequence[refdegnseq] = gsequences[i][0]; reflookup[i] = refdegnseq; ++refdegnseq; } else { reflookup[i] = -1; // gap } } if (verbose) { cout << "Degapped reference sequence length = " << refdegnseq << ".\n"; } if (refpos) { norfs = readorfs("orfs.001", refcodonpos, refdegnseq, circular); if (verbose) { cout << "Read 'orfs.001', containing " << norfs << " ORFs.\n"; } } // Find range boundaries (in degapped reference sequence). if (!wholeseq) { range1 -= 1; range2 -= 1; // since indices start at 0 if (range1 > range2) { // make range1 < range2 temp95 = range1; range1 = range2; range2 = temp95; } if (range2 > refdegnseq - 1) { range2 = refdegnseq - 1; } if (range1 < 0) { range1 = 0; } } else { // whole sequence range1 = 0; range2 = refdegnseq - 1; } // Find corresponding range boundaries in alignment coords if (!wholeseq) { for (base = 0; base < nseq0; ++base) { if (reflookup[base] == range1) { // base in degapped refseq or -1 if gap base1 = base; // note if range1 = 0, still removes leading gaps } if (reflookup[base] == range2) { // base in degapped refseq or -1 if gap base2 = base; // note if range2 = refdegnseq-1, still removes end gaps } } } else { // whole sequence base1 = 0; base2 = nseq0 - 1; } // Number of sequence pairs in sequences file. ifstream sequencesfile(seqfile); if (!sequencesfile) { cerr << "Aborting: can't find file '" << seqfile << "'.\n"; exit(EXIT_FAILURE); } npairs = -1; while (sequencesfile.ignore(1000, '\n')) { ++npairs; } sequencesfile.clear(); sequencesfile.seekg(0); // Loop through sequence pairs. for (np = 0; np < npairs; ++np) { // Read and degap sequence pair. for (k = 0; k < 2; ++k) { if (0 == k) { sequencesfile.getline(id, 128, ' '); } else { sequencesfile.getline(id, 128, '\n'); } // Read sequence pair. strcpy(seq, "seq."); strcat(seq, id); strcat(seq, ".aln"); nseq = readsequence(seq, gsequences, k, skip); if (verbose) { cout << "Read '" << seq << "', length = " << nseq << ".\n"; } if (nseq != nseq0) { cerr << "Aborting: gapped sequences of differing length.\n"; exit(EXIT_FAILURE); } // Degap sequence and make lookup table connecting input alignment // sequence coords to degapped coords. degnseq[k] = 0; // degapped sequence length for (i = 0; i < nseq0; ++i) { if (9 != gsequences[i][k]) { // not a gap sequences[degnseq[k]][k] = gsequences[i][k]; lookup[i][k] = degnseq[k]; ++degnseq[k]; } else { lookup[i][k] = -1; // gap } } if (verbose) { cout << "Degapped sequence length = " << degnseq[k] << ".\n"; } // Read codon nucleotide position information for first sequence of pair // (unless using refseq ORF files for all). if (0 == k) { if (!refpos) { // not using reference sequence for (i = 0; i < nseq0; ++i) { lookup[i][2] = lookup[i][0]; } degnseq[2] = degnseq[0]; strcpy(cpos, "orfs."); strcat(cpos, id); norfs = readorfs(cpos, codonpos, degnseq[k], circular); if (verbose) { cout << "Read '" << cpos << "', containing " << norfs << " ORFs.\n"; } } else { // using reference sequence for (i = 0; i < nseq0; ++i) { lookup[i][2] = reflookup[i]; } degnseq[2] = refdegnseq; for (j = 0; j < 3; ++j) { for (i = 0; i < refdegnseq; ++i) { codonpos[i][j] = refcodonpos[i][j]; } } } } } // Close loop over the two sequences in a sequence pair // Check for gaps not in groups of 3 in coding sequences test = checkgaps(lookup, codonpos, nseq0, np); // Change lookup[][] entries for ambiguous nt to -1. for (k = 0; k < 2; ++k) { for (i = 0; i < nseq0; ++i) { if (5 == sequences[lookup[i][k]][k]) { lookup[i][k] = -1; // ambiguous nt } } } if (!refpos) { for (i = 0; i < nseq0; ++i) { if (5 == sequences[lookup[i][2]][0]) { lookup[i][2] = -1; // ambiguous nt } } } else { for (i = 0; i < nseq0; ++i) { if (5 == refsequence[lookup[i][2]]) { lookup[i][2] = -1; // ambiguous nt } } } //------------------------------------------------------------------------- //If circular genome, then shift data in sequences[][2] and codonpos[][3] // by +2. Append the last two nt to the beginning and the first 2 nt to // the end. Then correct lookup[][2] by adding +2 to each entry (except // for '-1's). Note that base1, base2, lookup[][] are in msa coords, // which are unchanged; range1 and range2 are in refseq coords but are // not used again so are left as they are. It is impossible for // lookup[][] to reference the 4 nt added to each sequence in // sequences[][2]. They are only there to be used as neighbouring nt in // codons in the subroutines fourfold, mut_type and calc_prob_col. //If (!circular), if the first nt of seq2 is aligned to a N2 or N3 nt in // seq1/refseq, or the last nt of seq2 is aligned to a N1 or N2 nt in // seq1/refseq, then you may end up references out-of-bounds in the // sequences[][] array. Avoid this by padding all seqs with 2 nt at // either end. For non-circular sequences, use 'N's - which will result // in any codon incorporating them being skipped, but at least you don't // reference out-of-bounds. for (k = 0; k < 3; ++k) { // update values in lookup[][] for (i = 0; i < nseq0; ++i) { if (-1 != lookup[i][k]) { lookup[i][k] += 2; } } } for (k = 0; k < 2; ++k) { // shift sequences[][] by +2 for (i = degnseq[k]+1; i >= 2; --i) { sequences[i][k] = sequences[i-2][k]; } if (circular) { // circularize sequences[1][k] = sequences[degnseq[k]+1][k]; sequences[0][k] = sequences[degnseq[k]][k]; sequences[degnseq[k]+3][k] = sequences[3][k] ; sequences[degnseq[k]+2][k] = sequences[2][k]; } else { // pad ends with 'N's sequences[1][k] = sequences[0][k] = sequences[degnseq[k]+3][k] = sequences[degnseq[k]+2][k] = 5; } } for (k = 0; k < 3; ++k) { // shift codonpos[][] by +2 for (i = degnseq[2]+1; i >= 2; --i) { codonpos[i][k] = codonpos[i-2][k]; } if (circular) { // circularize codonpos[1][k] = codonpos[degnseq[2]+1][k]; codonpos[0][k] = codonpos[degnseq[2]][k]; codonpos[degnseq[2]+3][k] = codonpos[3][k] ; codonpos[degnseq[2]+2][k] = codonpos[2][k]; } else { // pad ends with '0's codonpos[1][k] = codonpos[0][k] = codonpos[degnseq[2]+3][k] = codonpos[degnseq[2]+2][k] = 0; } } //------------------------------------------------------------------------- // Loop through nucleotides, calculating statistics. if (Vfix > 0) { Vx[np] = Vfix; // Will be recalculated later if fitwhat = 3 } else { Vx[np] = 1.; // Will be recalculated later if fitwhat = 3 } // Count number of observed point nucleotide differences in sequence pair. nuccount[np] = 0; // number of nucleotides used (non-gap or stop) nmuts[np] = 0; // number of mutations (non-gap or stop) neutral[np] = 0; // number of neutral/synonymous mutations (non-gap or // stop) (includes all mutations in non-coding regions) gapcount[np] = 0; // positions that are gaps in one or both input sequences stopcount[np] = 0;// zero-prob transitions according to codon.dat, aa.dat flag[np] = 0; // flag = 1 means problem with ttt fit convergence flag2[np] = 0; // flag2 = 1 means problem with V fit convergence degen4s[np] = 0; // counts number of 4-fold degenerate sites that are also // neutral (in principal would also count e.g. the 3rd // base in CUA (leu) -> UUG (leu) since the A in CUA is // 4-fold degenerate and the mutation in the context // of seq 2 is synonymous) degen4m[np] = 0; // counts mutations at 4-fold degenerate sites // Any ambiguous nt codes are treated as gaps (skipped over for most // statistics, written to log and plot files as gaps, counted as part of // gapcount). However they are not stripped out in the gap-stripping, so // CDSs are not put out-of-frame. If an ambiguous nt code is part of a // codon being referenced for a neighbouring nt, then mut_type will // return a 0 (zero-probability transition) and the nt will be logged // along with the zero-probability transitions. The subroutines fourfold // and calc_prob_col are never referenced unless mut_type != 0 so // shouldn't cause any problems. for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { // not gapped ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not stop <-> non-stop nuccount[np] += 1; if (sequences[base0[0]][0] != sequences[base0[1]][1]) { nmuts[np] += 1; if (1 == ts) { neutral[np] += 1; } } ff = fourfold(sequences, base0, codonpos, aa2aa, model); if (ff && 1 == ts) { // site with 4-fold degen and neutral/null mut degen4s[np] += 1; if (sequences[base0[0]][0] != sequences[base0[1]][1]) { degen4m[np] += 1; } } } else { stopcount[np] += 1; } } else { gapcount[np] += 1; } } nmutsum += nmuts[np]; lambdasum += float(nmuts[np])/float(nuccount[np]); lambda4sum += float(degen4m[np])/float(degen4s[np]); // Find ttt value giving closest match between expected # muts and observed // # muts. Use bisection method -> since nmuts increases monotonically // with ttt. Stop when abs(exp # muts - obs # muts) < tVfit. Exit if // number of iterations exceeds maxiter or if obs # muts is outside // range allowed by tttmin--tttmax. // // fitwhat = 0 or other -> fit to total # muts // 1 -> fit to # of neutral/synonymous muts, // 2 -> fit to # of neutral/synonymous muts at 4-fold degen sites // 3 -> as for 2, then adjust V to fit # of nonsyn muts // Note: 1 and 2 are not recommended if lots of double-coding ettt[0] = tttmin; ettt[1] = tttmax; // Calculate exp#muts for ettt[]. for (k = 0; k < 2; ++k) { test = calc_nuc(nuc, ettt[k], nucB1, nucB2, nucD); enmuts[k] = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { // not gapped ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not a stop if (1 == fitwhat) { enmuts[k] += calc_prob_col(weight, nuc, sequences, base0, codonpos, 2, model, aa2aa, Vx[np]); } else if (2 == fitwhat || 3 == fitwhat) { ff = fourfold(sequences, base0, codonpos, aa2aa, model); if (ff) { enmuts[k] += calc_prob_col(weight, nuc, sequences, base0, codonpos, 2, model, aa2aa, Vx[np]); } } else { enmuts[k] += calc_prob_col(weight, nuc, sequences, base0, codonpos, 1, model, aa2aa, Vx[np]); } } } } } if (1 == fitwhat) { obsmuts = neutral[np]; // fit to # of neutral/synonymous muts } else if (2 == fitwhat || 3 == fitwhat) { obsmuts = degen4m[np]; // fit to neutral/synonymous muts at 4-fold sites } else { obsmuts = nmuts[np]; // fit to total # muts } // Check tttmin--tttmax range is sufficient. if ((enmuts[0] - tVfit) > float(obsmuts)) { cerr << "Fitting number of mutations (" << obsmuts << ") outside that " << "allowed by given t range. Sequence pair " << np+1 << ".\n"; tttx[np] = tttmin; flag[np] = 1; } else if (enmuts[1] < float(obsmuts)) { cerr << "Fitting number of mutations (" << obsmuts << ") outside that " << "allowed by given t range. Sequence pair " << np+1 << ".\n"; tttx[np] = tttmax; flag[np] = 1; } else { // Iterate on ttt. niter = 0; while (abs(enmuts[0] - float(obsmuts)) > tVfit) { niter++; if (niter > maxiter) { cerr << "Iteration on t failed to converge in " << maxiter << " iterations. Sequence pair " << np+1 << ".\n"; flag[np] = 1; break; } // Bisector. ttt = 0.5 * (ettt[0] + ettt[1]); // Calculate expected number of mutations at bisector. test = calc_nuc(nuc, ttt, nucB1, nucB2, nucD); expmuts = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) {//not gapped ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not a stop if (1 == fitwhat) { expmuts += calc_prob_col(weight, nuc, sequences, base0, codonpos, 2, model, aa2aa, Vx[np]); } else if (2 == fitwhat || 3 == fitwhat) { ff = fourfold(sequences, base0, codonpos, aa2aa, model); if (ff) { expmuts += calc_prob_col(weight, nuc, sequences, base0, codonpos, 2, model, aa2aa, Vx[np]); } } else { expmuts += calc_prob_col(weight, nuc, sequences, base0, codonpos, 1, model, aa2aa, Vx[np]); } } } } // Reassign ettt[] and enmuts[]. if (float(obsmuts) > expmuts) { ettt[0] = ttt; enmuts[0] = expmuts; } else { ettt[1] = ttt; enmuts[1] = expmuts; } } tttx[np] = ettt[0]; } // close if/else for obsmuts outside/within allowed ttt range // Calculate nucleotide matrix for best ttt. test = calc_nuc(nuc, tttx[np], nucB1, nucB2, nucD); if (3 == fitwhat && !Vtype) { // Iterate to find best V for each seq pair // Find V value giving closest match between exp # muts and obs # muts // (all sites). Use bisection method -> since nmuts increases // monotonically with V. Stop when abs(exp # muts - obs # muts) < // tVfit. Exit if number of iterations exceeds maxiter or if obs # // muts is outside range allowed by Vmin--Vmax. eV[0] = Vmin; eV[1] = Vmax; for (k = 0; k < 2; ++k) { enmuts[k] = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) {// not gap ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not a stop enmuts[k] += calc_prob_col(weight, nuc, sequences, base0, codonpos, 1, model, aa2aa, eV[k]); } } } } obsmuts = nmuts[np]; // fit to total # muts // Check Vmin--Vmax range is sufficient. if ((enmuts[0] - tVfit) > float(obsmuts)) { cerr << "Fitting number of mutations (" << obsmuts << ") outside that " << "allowed by given V range. Sequence pair " << np+1 << ".\n"; cerr << " " << enmuts[0] << " " << enmuts[1] << "\n"; Vx[np] = Vmin; flag2[np] = 1; } else if ((enmuts[1] + tVfit) < float(obsmuts)) { cerr << "Fitting number of mutations (" << obsmuts << ") outside that " << "allowed by given V range. Sequence pair " << np+1 << ".\n"; cerr << " " << enmuts[0] << " " << enmuts[1] << "\n"; Vx[np] = Vmax; flag2[np] = 1; } else { // Iterate on V. niter = 0; while (abs(enmuts[0] - float(obsmuts)) > tVfit) { niter++; if (niter > maxiter) { cerr << "Iteration on V failed to converge in " << maxiter << " iterations. Sequence pair " << np+1 << ".\n"; flag2[np] = 1; break; } // Bisector. V = 0.5 * (eV[0] + eV[1]); // Calculate expected number of mutations at bisector. expmuts = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) {// not gap ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not a stop expmuts += calc_prob_col(weight, nuc, sequences, base0, codonpos, 1, model, aa2aa, V); } } } // Reassign eV[] and enmuts[]. if (float(obsmuts) > expmuts) { eV[0] = V; enmuts[0] = expmuts; } else { eV[1] = V; enmuts[1] = expmuts; } } Vx[np] = eV[0]; } // close if/else for obsmuts outside/within allowed V range } // Save data to Bcodonpos[][][], Blookup[][][], Bsequences[][][]. // (Although sequences[][] and codonpos[][] have different (degapped) // lengths, they are always <= nseq0 (+4 for circular mode), so // initialize the matrices to 0 (above), and read across everything up to // nseq0+4. Sometimes, when you read back into sequences[][] and // codonpos[][], you'll have residual from a longer previous sequence, // but lookup[][] should never reference these; lookup[][] is // always nseq0 long.) for (i = 0; i < nseq0; ++i) { for (k = 0; k < 3; ++k) { Blookup[i][k][np] = lookup[i][k]; } } for (i = 0; i <= nseq0+4; ++i) { for (k = 0; k < 2; ++k) { Bsequences[i][k][np] = sequences[i][k]; } for (k = 0; k < 3; ++k) { Bcodonpos[i][k][np] = codonpos[i][k]; } } } // close main loop over sequence pairs if (3 == fitwhat && Vtype) { // Iterate to find best V for whole alignment // Find V value giving closest match between exp # muts and obs # muts // (all sites). Use bisection method -> since nmuts increases // monotonically with V. Stop when abs(exp # muts - obs # muts) < // tVfit. Exit if number of iterations exceeds maxiter or if obs # // muts is outside range allowed by Vmin--Vmax. obsmuts = 0; for (np = 0; np < npairs; ++np) { obsmuts += nmuts[np]; // fit to total # muts summed over all pairs } eV[0] = Vmin; eV[1] = Vmax; for (k = 0; k < 2; ++k) { enmuts[k] = 0.; for (np = 0; np < npairs; ++np) { // cycle through pairs for (i = 0; i < nseq0; ++i) { // read back matrices for (j = 0; j < 3; ++j) { lookup[i][j] = Blookup[i][j][np]; } } for (i = 0; i <= nseq0+4; ++i) { for (j = 0; j < 2; ++j) { sequences[i][j] = Bsequences[i][j][np]; } for (j = 0; j < 3; ++j) { codonpos[i][j] = Bcodonpos[i][j][np]; } } test = calc_nuc(nuc, tttx[np], nucB1, nucB2, nucD); // recalculate nuc for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) {// not gap ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not a stop enmuts[k] += calc_prob_col(weight, nuc, sequences, base0, codonpos, 1, model, aa2aa, eV[k]); } } } } } // Check Vmin--Vmax range is sufficient. if ((enmuts[0] - tVfit) > float(obsmuts)) { cerr << "Fitting number of mutations (" << obsmuts << ") outside that " << "allowed by given V range.\n"; cerr << " " << enmuts[0] << " " << enmuts[1] << "\n"; for (np = 0; np < npairs; ++np) { Vx[np] = Vmin; flag2[np] = 1; } } else if ((enmuts[1] + tVfit) < float(obsmuts)) { cerr << "Fitting number of mutations (" << obsmuts << ") outside that " << "allowed by given V range.\n"; cerr << " " << enmuts[0] << " " << enmuts[1] << "\n"; for (np = 0; np < npairs; ++np) { Vx[np] = Vmax; flag2[np] = 1; } } else { // Iterate on V. niter = 0; while (abs(enmuts[0] - float(obsmuts)) > tVfit) { niter++; if (niter > maxiter) { cerr << "Iteration on V failed to converge in " << maxiter << " iterations.\n"; for (np = 0; np < npairs; ++np) { flag2[np] = 1; } break; } // Bisector. V = 0.5 * (eV[0] + eV[1]); // Calculate expected number of mutations at bisector. expmuts = 0.; for (np = 0; np < npairs; ++np) { // cycle through pairs for (i = 0; i < nseq0; ++i) { // read back matrices for (k = 0; k < 3; ++k) { lookup[i][k] = Blookup[i][k][np]; } } for (i = 0; i <= nseq0+4; ++i) { for (k = 0; k < 2; ++k) { sequences[i][k] = Bsequences[i][k][np]; } for (k = 0; k < 3; ++k) { codonpos[i][k] = Bcodonpos[i][k][np]; } } test = calc_nuc(nuc, tttx[np], nucB1, nucB2, nucD); // recalc nuc for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) {// not gap ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not a stop expmuts += calc_prob_col(weight, nuc, sequences, base0, codonpos, 1, model, aa2aa, V); } } } } // Reassign eV[] and enmuts[]. if (float(obsmuts) > expmuts) { eV[0] = V; enmuts[0] = expmuts; } else { eV[1] = V; enmuts[1] = expmuts; } } for (np = 0; np < npairs; ++np) { Vx[np] = eV[0]; } } // close if/else for obsmuts outside/within allowed V range } // Restart main loop over sequence pairs for (np = 0; np < npairs; ++np) { // Read back lookup[][], sequences[][], codonpos[][] for (i = 0; i < nseq0; ++i) { for (k = 0; k < 3; ++k) { lookup[i][k] = Blookup[i][k][np]; } } for (i = 0; i <= nseq0+4; ++i) { for (k = 0; k < 2; ++k) { sequences[i][k] = Bsequences[i][k][np]; } for (k = 0; k < 3; ++k) { codonpos[i][k] = Bcodonpos[i][k][np]; } } // Recalculate nucleotide matrix test = calc_nuc(nuc, tttx[np], nucB1, nucB2, nucD); // Calculate likelihood for best ttt (and V) values. emt = 0.; for (base = base1; base <= base2; ++base) { base0[0] = lookup[base][0]; base0[1] = lookup[base][1]; base0[2] = lookup[base][2]; // == base0[0] if !refpos if (-1 != base0[0] && -1 != base0[1] && -1 != base0[2]) { // not gapped ts = mut_type(weight, sequences, base0, codonpos, aa2aa, model); if (ts) { // not zero-prob transition // Count number of pairs in which nucleotide is of each type of // codon position (codonpos[][0] only) (generally a nt should be // the same type in all pairs, except for gaps) if (0 == codonpos[base0[2]][0]) { codonids[base][0] += 1; } else if (1 == codonpos[base0[2]][0] || 4 == codonpos[base0[2]][0]) { codonids[base][1] += 1; } else if (2 == codonpos[base0[2]][0] || 5 == codonpos[base0[2]][0]) { codonids[base][2] += 1; } else if (3 == codonpos[base0[2]][0] || 6 == codonpos[base0[2]][0]) { codonids[base][3] += 1; } else { cerr << "Can't get here!\n"; exit(EXIT_FAILURE); } // Calculate statistics. temp98 = calc_prob_col(weight, nuc, sequences, base0, codonpos, 0, model, aa2aa, Vx[np]); temp99 = calc_prob_col(weight, nuc, sequences, base0, codonpos, 1, model, aa2aa, Vx[np]); temp97 = calc_prob_col(weight, nuc, sequences, base0, codonpos, 2, model, aa2aa, Vx[np]); logfile << setw(4) << np + 1 << " " << setw(7) << base + 1 << " " << setw(10) << exp(temp98) << " " << setw(10) << temp99 << " " << setw(10) << temp97; emt += temp98; plotdata[base][0] += temp99; plotdata[base][2] += temp99 * (1. - temp99); // This is the variance for binomial, n = 1, p = temp99 (i.e. // assuming the model expected probabilities). // Summed over a large column (n large, p not too close to 1 or 0) // approximates normal distribution, by CLT. // Otherwise you still get the stddev, but can't use it to determine // probabilities or confidence limits. plotdata[base][3] += float(nmuts[np])/float(nuccount[np]); plotdata[base][4] += temp97; if (sequences[base0[0]][0] != sequences[base0[1]][1]) { logfile << " 1"; // obs mutation plotdata[base][1] += 1.; if (1 == ts) { logfile << " 1\n"; plotdata[base][5] += 1.; } else { logfile << " 0\n"; } } else { logfile << " 0 0\n"; // obs identity } ff = fourfold(sequences, base0, codonpos, aa2aa, model); if (ff && 1 == ts) { // site with 4-fold degen and neutral/null mut plotdata[base][9] += 1.; // number of sites plotdata[base][6] += temp97; // exp # neutral non-null muts plotdata[base][8] += float(degen4m[np])/float(degen4s[np]); // lambda4sum for these sites if (sequences[base0[0]][0] != sequences[base0[1]][1]) { plotdata[base][7] += 1.; // obs # muts at these sites } } } else { // zero-prob transition (e.g. stop <-> non-stop) codonids[base][4] += 1; logfile << setw(4) << np + 1 << " " << setw(7) << base + 1 << " 6 0 0 0 0\n"; } } else { // gap in one or both sequences codonids[base][4] += 1; if (-1 == base0[0] && -1 == base0[1]) { logfile << setw(4) << np + 1 << " " << setw(7) << base + 1 << " 9 0 0 0 0\n"; // gap in both sequences } else if (-1 == base0[0]) { logfile << setw(4) << np + 1 << " " << setw(7) << base + 1 << " 8 0 0 0 0\n"; // gap in sequence 1 only } else if (-1 == base0[1]) { logfile << setw(4) << np + 1 << " " << setw(7) << base + 1 << " 7 0 0 0 0\n"; // gap in sequence 2 only } else if (-1 == base0[2] && refpos) { logfile << setw(4) << np + 1 << " " << setw(7) << base + 1 << " 5 0 0 0 0\n"; // gap only in reference sequence } else { cerr << "Can't get here!\n"; exit(EXIT_FAILURE); } } } emt /= nuccount[np]; // Write ttt, MLE/nucleotide, nnuc values to file. outputfile << setprecision(3); outputfile << setw(8) << tttx[np] << " "; outputfile << setw(8) << emt << " "; outputfile << setw(6) << nuccount[np] << " "; outputfile << setw(6) << gapcount[np] << " "; outputfile << setw(6) << stopcount[np] << " "; outputfile << setw(6) << nmuts[np] << " "; outputfile << setw(6) << neutral[np] << " "; outputfile << setw(6) << flag[np] << " "; outputfile << setw(6) << degen4s[np] << " "; outputfile << setw(6) << degen4m[np] << " "; outputfile << setw(6) << Vx[np] << " "; outputfile << setw(6) << flag2[np] << "\n"; if (verbose) { cout << "\n"; } } // Close loop through sequence pairs. logfile.close(); outputfile.close(); sequencesfile.close(); // Write out nucleotide-by-nucleotide expected number of non-null mutations // and observed number of non-null mutations (summed over all sequence // pairs) and number of sequence pairs discarded due to gaps. ofstream plotfile("mlrgd_out.plot"); if (!plotfile) { cerr << "Failed to open log file 'mlrgd_out.plot'.\n"; exit(EXIT_FAILURE); } // base# exp#muts #pairs_in_which_nt_is(N0 N1 N2 N3 gap) est_stddev #muts/nuc // all summed over all pairs for (i = 0; i < nseq0; ++i) { plotfile << i + 1 << " " << plotdata[i][0] << " " << plotdata[i][1] << " " << codonids[i][0] << " " << codonids[i][1] << " " << codonids[i][2] << " " << codonids[i][3] << " " << codonids[i][4] << " " << sqrt(plotdata[i][2]) << " " << plotdata[i][3] << " " << plotdata[i][4] << " " << plotdata[i][5] << " " << plotdata[i][6] << " " << plotdata[i][7] << " " << plotdata[i][8] << " " << plotdata[i][9] << "\n"; } plotfile.close(); ofstream infofile("mlrgd_out.info"); if (!infofile) { cerr << "Failed to open log file 'mlrgd_out.info'.\n"; exit(EXIT_FAILURE); } infofile << "number_pairs: " << npairs << "\n"; infofile << "alignment_length: " << nseq0 << " " << base1+1 << " " << base2+1 << "\n"; infofile << "sum_nmuts: " << nmutsum << "\n"; infofile << "sum_lambda: " << lambdasum << "\n"; infofile << "sum_4folddegen_lambda: " << lambda4sum << "\n"; infofile.close(); return(0); } //----------------------------------------------------------------------------- // Subroutine to calculate 4 x 1 normalized probability matrix for a given // nucleotide in sequence 1 to mutate to each of the four possible // nucleotides. // Not set up to deal with ambiguous nt code (sequences[][] = 5) so make sure // it is not called unless mut_type returns non-0. double calc_prob_col(double weight[][64], double nuc[][4], int sequences[][2], int base0[3], int codonpos[][3], int consv, int model, int aa2aa[64], double Vx) { int jn[2], j1[2], j2[2], j3[2], jcid[2], k, p, kn, k1, k2, k3, kcid, neut[4]; double sum, prob_col[4], sumneut; // base0[0] and base0[1] are locations of the nucleotide in sequences[][0/1] // and base0[0] is the location of the nucleotide in codonpos[][]; all // of which have gaps stripped out. jn[0] = sequences[base0[0]][0]; // nucleotide in sequence 1 jn[1] = sequences[base0[1]][1]; // nucleotide in sequence 2 sum = 0.; for (kn = 0; kn < 4; ++kn) { // possible point mutations at that position prob_col[kn] = nuc[jn[0]][kn]; // weight for nucleotide mutation neut[kn] = 1; // stays 1 if mut is neutral if (0 != model) { // not forcing non-coding (nucleotide weights only) model; // single/double/triple coding model // cycle through the three codonpos files; find the codon containing the // nucleotide in sequence 1 and sequence 2 for (p = 0; p < 3; ++p) { // cycle through the three codonpos files; find // the codon containing the nucleotide in sequence 1 and sequence 2 if (0 != codonpos[base0[2]][p]) { for (k = 0; k < 2; ++k) { // cycle through sequence 1 and sequence 2 if (1 == codonpos[base0[2]][p]) { j1[k] = sequences[base0[k]][k]; j2[k] = sequences[base0[k]+1][k]; j3[k] = sequences[base0[k]+2][k]; } else if (2 == codonpos[base0[2]][p]) { j1[k] = sequences[base0[k]-1][k]; j2[k] = sequences[base0[k]][k]; j3[k] = sequences[base0[k]+1][k]; } else if (3 == codonpos[base0[2]][p]) { j1[k] = sequences[base0[k]-2][k]; j2[k] = sequences[base0[k]-1][k]; j3[k] = sequences[base0[k]][k]; } else if (6 == codonpos[base0[2]][p]) { j3[k] = (sequences[base0[k]][k] + 2) % 4; j2[k] = (sequences[base0[k]+1][k] + 2) % 4; j1[k] = (sequences[base0[k]+2][k] + 2) % 4; } else if (5 == codonpos[base0[2]][p]) { j3[k] = (sequences[base0[k]-1][k] + 2) % 4; j2[k] = (sequences[base0[k]][k] + 2) % 4; j1[k] = (sequences[base0[k]+1][k] + 2) % 4; } else if (4 == codonpos[base0[2]][p]) { j3[k] = (sequences[base0[k]-2][k] + 2) % 4; j2[k] = (sequences[base0[k]-1][k] + 2) % 4; j1[k] = (sequences[base0[k]][k] + 2) % 4; } else { cerr << "Can't get here!\n"; exit(EXIT_FAILURE); } jcid[k] = 16 * j1[k] + 4 * j2[k] + j3[k]; } // close loop through sequence 1 and sequence 2 k1 = j1[1]; k2 = j2[1]; k3 = j3[1]; // codon in sequence 2 if (1 == codonpos[base0[2]][p]) { // new codon in sequence 2 k1 = kn; } else if (2 == codonpos[base0[2]][p]) { k2 = kn; } else if (3 == codonpos[base0[2]][p]) { k3 = kn; } else if (6 == codonpos[base0[2]][p]) { k3 = (kn + 2) % 4; } else if (5 == codonpos[base0[2]][p]) { k2 = (kn + 2) % 4; } else if (4 == codonpos[base0[2]][p]) { k1 = (kn + 2) % 4; } else { cerr << "Can't get here!\n"; exit(EXIT_FAILURE); } kcid = 16 * k1 + 4 * k2 + k3; // new codon in sequence 2 // amino acid acceptability (new nucleotide kn in sequence 2 context, // compared with old nucleotide in sequence 1 context). prob_col[kn] *= weight[jcid[0]][kcid]; if (aa2aa[kcid] != aa2aa[jcid[0]]) { neut[kn] = 0; // nonsynonymous prob_col[kn] *= Vx; // nonsynonymous:synonymous scaling } } // close "if (0 != codonpos[base0[2]][p])" } // close loop over three codonpos files } // if (0 != model) sum += prob_col[kn]; } // close loop over kn if (0. == sum) { // This can occassionally occur, e.g. seq1 codon CAA, seq2 codon UAC, // mutating N3; for ttt ~ 0, prob_col[U] = prob_col[C] = prob_col[G] = 0, // and prob_col[A] = 0 since UAG is a stop codon. Once ttt has converged // on the best fit ttt, this probelm shouldn't occur. if (0 == consv) { // return log(prob) for seq1_nt -> seq2_nt return(-100); } else if (2 == consv) { // return prob for non-null neutral mut in seq1 return(0); } else { // return prob for non-null mutation in seq1 return(0); } } // Normalize probs so that sum = 1. for (kn = 0; kn < 4; ++kn) { prob_col[kn] = prob_col[kn] / sum; } if (prob_col[0] > 1.01 || prob_col[0] < -0.01|| prob_col[1] > 1.01 || prob_col[1] < -0.01|| prob_col[2] > 1.01 || prob_col[2] < -0.01|| prob_col[3] > 1.01 || prob_col[3] < -0.01) { cerr << "Error: invalid probability value; possible rounding error:\n " << prob_col[0] << " " << prob_col[1] << " " << prob_col[2] << " " << prob_col[3] << " " << sum << "\n"; } if (0 == consv) { // return log(prob) for nucleotide observed in second sequence if (prob_col[jn[1]] > 0) { return(log(prob_col[jn[1]])); } else { return(-100); // log(prob) is not used for fitting or anything else // so no longer need to exit if this happens } } else if (2 == consv) { // return prob for non-null neutral mut in seq1 sumneut = 0.; for (kn = 0; kn < 4; ++kn) { if (1 == neut[kn] && kn != jn[0]) {// sum probs for non-null neutral muts sumneut += prob_col[kn]; } } return(sumneut); } else { // return prob for non-null mut in sequence 1 return(1. - prob_col[jn[0]]); } } //----------------------------------------------------------------------------- int readsequence(char seq[128], int gsequences[][2], int seqid, int skip) { int i, k, nlines, nseq; char seqline[1000], inputseq[maxlength]; ifstream sequence(seq); if (!sequence) { cerr << "Aborting: can't find file '" << seq << "'.\n"; exit(EXIT_FAILURE); } nlines = -1; while (sequence.ignore(1000, '\n')) { ++nlines; } sequence.clear(); sequence.seekg(0); sequence.ignore(1000, '\n'); for (i = 0; i < nlines; ++i) { sequence.getline(seqline, 1000, '\n'); if (0 == i) { strcpy(inputseq, seqline); } else { strcat(inputseq, seqline); } } sequence.close(); for (i = 0; ; ++i) { if (inputseq[i] == 'U' || inputseq[i] == 'u') { gsequences[i][seqid] = 0; } else if (inputseq[i] == 'T' || inputseq[i] == 't') { gsequences[i][seqid] = 0; } else if (inputseq[i] == 'C' || inputseq[i] == 'c') { gsequences[i][seqid] = 1; } else if (inputseq[i] == 'A' || inputseq[i] == 'a') { gsequences[i][seqid] = 2; } else if (inputseq[i] == 'G' || inputseq[i] == 'g') { gsequences[i][seqid] = 3; } else if (inputseq[i] == '-' || inputseq[i] == '.') { gsequences[i][seqid] = 9; } else if (inputseq[i] == '\0') { nseq = i; break; } else { if (!skip && (inputseq[i] == 'M' || inputseq[i] == 'm' || inputseq[i] == 'R' || inputseq[i] == 'r' || inputseq[i] == 'W' || inputseq[i] == 'w' || inputseq[i] == 'V' || inputseq[i] == 'v' || inputseq[i] == 'H' || inputseq[i] == 'h' || inputseq[i] == 'D' || inputseq[i] == 'd' || inputseq[i] == 'B' || inputseq[i] == 'b' || inputseq[i] == 'S' || inputseq[i] == 's' || inputseq[i] == 'Y' || inputseq[i] == 'y' || inputseq[i] == 'K' || inputseq[i] == 'k' || inputseq[i] == 'N' || inputseq[i] == 'n' || inputseq[i] == 'X' || inputseq[i] == 'x')) { // allow ambiguous nt codes gsequences[i][seqid] = 5; // ambiguous nt cerr << "Warning: sequence " << seq << ", ambiguous nt '" << inputseq[i] << "', at " << i+1 << ".\n"; } else { cerr << "Aborting: sequence " << seq << ", unknown nucleotide '" << inputseq[i] << "', at " << i+1 << ".\n"; exit(EXIT_FAILURE); } } } return(nseq); } //----------------------------------------------------------------------------- // Note that if there are overlapping ORFs in the same frame (e.g. alternative // start points) then only one is counted at any nt. // Does up to triple coding. Further overlapping ORFs are ignored. int readorfs(char cpos[128], int codonpos[][3], int length, int circular) { int i, j, k, orf1, orf2, nseg, count, norfs, test; test = 1; // for warning message about multiple same-frame overlapping ORFs for (i = 0; i < length; ++i) { codonpos[i][0] = codonpos[i][1] = codonpos[i][2] = 0; } // Work through ORF list. ifstream orffile(cpos); if (!orffile) { cerr << "Aborting: can't find file '" << cpos << "'.\n"; exit(EXIT_FAILURE); } norfs = -1; while (orffile.ignore(1000, '\n')) { ++norfs; } orffile.clear(); orffile.seekg(0); for (i = 0; i < norfs; ++i) { count = 0; orffile >> nseg; nseg /= 2; for (k = 0; k < nseg; ++k) { orffile >> orf1; orffile >> orf2; if (orf1<1||orf2<1||orf1>length||orf2>length) { cerr << "Aborting: ORF " << i+1 << " in " << cpos << ", endpoints outside sequence range.\n"; exit(EXIT_FAILURE); } // Since arrays start at 0 rather than 1. orf1--; orf2--; // Enter 1,2,3 (forward) or 4,5,6 (reverse) into codonpos[][] if (orf1 < orf2) { // forward frame for (j = orf1; j <= orf2; ++j) { // blank codons overlapping joins except for circular genomes if (!((j==orf2&&count+1!=3)||(j==orf2-1&&count+1==1)|| (j==orf1&&count+1!=1)||(j==orf1+1&&count+1==3))|| (circular&&(j==length-1||j==length-2||j==0||j==1))){ if (0 == codonpos[j][0]) { codonpos[j][0] = count+1; } else if (0 == codonpos[j][1]) { if (codonpos[j][0] != count+1) { // ignore if same frame as codonpos[0] codonpos[j][1] = count+1; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping same-frame ORFs.\n"; test = 0; } } } else if (0 == codonpos[j][2]) { if (codonpos[j][0] != count+1 && codonpos[j][1] != count+1) { // ignore if same frame as codonpos[0] or codonpos[1] codonpos[j][2] = count+1; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping same-frame ORFs.\n"; test = 0; } } } } count = (count+1)%3; } } else { // reverse frame for (j = orf1; j >= orf2; --j) { // blank codons overlapping joins except for circular genomes if (!((j==orf2&&count+1!=3)||(j==orf2+1&&count+1==1)|| (j==orf1&&count+1!=1)||(j==orf1-1&&count+1==3))|| (circular&&(j==length-1||j==length-2||j==0||j==1))){ if (0 == codonpos[j][0]) { codonpos[j][0] = count+4; } else if (0 == codonpos[j][1]) { if (codonpos[j][0] != count+4) { // ignore if same frame as codonpos[0] codonpos[j][1] = count+4; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping same-frame ORFs.\n"; test = 0; } } } else if (0 == codonpos[j][2]) { if (codonpos[j][0] != count+4 && codonpos[j][1] != count+4) { // ignore if same frame as codonpos[0] or codonpos[1] codonpos[j][2] = count+4; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping same-frame ORFs.\n"; test = 0; } } } } count = (count+1)%3; } } } if (0 != count%3) { cerr << "Warning: number of nucleotides in ORF " << i+1 << " in " << cpos << " not a multiple of 3.\n"; } orffile.ignore(1000, '\n'); } orffile.close(); return(norfs); } //----------------------------------------------------------------------------- // Tests for stop <-> non-stop transitions between a sequence pair, in up to // all three codonpos frames. Actually checks for any transitions which have // zero probability in the amino acid x CUT substitution matrix. These are // skipped over when calculating MLRGD (since log(0) is undefined) and // recorded in the stoplog file. Also checks whether a mutation is // neutral/synonymous or nonsynonymous. int mut_type(double weight[][64], int sequences[][2], int base0[3], int codonpos[][3], int aa2aa[64], int model) { int k, p, j1[2], j2[2], j3[2], jcid[2], ok, k1, k2, k3; if (0 == model) { return(1); } ok = 1; // stays = 1 if no non-zero probabality or nonsynonymous transitions for (p = 0; p < 3; ++p) { // cycle through the three codonpos files if (0 != codonpos[base0[2]][p]) { // coding for (k = 0; k < 2; ++k) { // cycle through sequence 1 and sequence 2 if (1 == codonpos[base0[2]][p] || 6 == codonpos[base0[2]][p]) { k1 = sequences[base0[k]][k]; k2 = sequences[base0[k]+1][k]; k3 = sequences[base0[k]+2][k]; } else if (2 == codonpos[base0[2]][p] || 5 == codonpos[base0[2]][p]) { k1 = sequences[base0[k]-1][k]; k2 = sequences[base0[k]][k]; k3 = sequences[base0[k]+1][k]; } else if (3 == codonpos[base0[2]][p] || 4 == codonpos[base0[2]][p]) { k1 = sequences[base0[k]-2][k]; k2 = sequences[base0[k]-1][k]; k3 = sequences[base0[k]][k]; } if (5 == k1 || 5 == k2 || 5 == k3) { return(0); // codon contains an ambiguous nt code } if (1 == codonpos[base0[2]][p]) { j1[k] = sequences[base0[k]][k]; j2[k] = sequences[base0[k]+1][k]; j3[k] = sequences[base0[k]+2][k]; } else if (2 == codonpos[base0[2]][p]) { j1[k] = sequences[base0[k]-1][k]; j2[k] = sequences[base0[k]][k]; j3[k] = sequences[base0[k]+1][k]; } else if (3 == codonpos[base0[2]][p]) { j1[k] = sequences[base0[k]-2][k]; j2[k] = sequences[base0[k]-1][k]; j3[k] = sequences[base0[k]][k]; } else if (6 == codonpos[base0[2]][p]) { j3[k] = (sequences[base0[k]][k] + 2) % 4; j2[k] = (sequences[base0[k]+1][k] + 2) % 4; j1[k] = (sequences[base0[k]+2][k] + 2) % 4; } else if (5 == codonpos[base0[2]][p]) { j3[k] = (sequences[base0[k]-1][k] + 2) % 4; j2[k] = (sequences[base0[k]][k] + 2) % 4; j1[k] = (sequences[base0[k]+1][k] + 2) % 4; } else if (4 == codonpos[base0[2]][p]) { j3[k] = (sequences[base0[k]-2][k] + 2) % 4; j2[k] = (sequences[base0[k]-1][k] + 2) % 4; j1[k] = (sequences[base0[k]][k] + 2) % 4; } else { cerr << "Can't get here!\n"; exit(EXIT_FAILURE); } jcid[k] = 16 * j1[k] + 4 * j2[k] + j3[k]; // codon ids } // close loop through sequence 1 and sequence 2 if (0 == weight[jcid[0]][jcid[1]]) { return(0); // zero probability transition } else if (aa2aa[jcid[0]] != aa2aa[jcid[1]]) { ok = 2; // nonsynonymous mutation } } } // close loop through codonpos files return(ok); //ok = // 0 -> zero-probability transition in one or more frames // 1 -> no mutation or neutral/synonymous mutation in all frames // 2 -> nonsynonymous mutation in at least one frame //Note that even if base0[0] = base0[1], this subroutine may return 2 if // adjacent bases in the same codon differ. } //----------------------------------------------------------------------------- // Tests whether a site is four-fold degenerate (including any non-coding nt). // Note - only looks at sequence 1, so a mutation at a four-fold degenerate // site isn't necessarily a synonymous mutation, if the other codon positions // in sequence 2 differ. // Not set up to deal with ambiguous nt code (sequences[][] = 5) so make sure // it is not called unless mut_type returns non-0. int fourfold(int sequences[][2], int base0[3], int codonpos[][3], int aa2aa[64], int model) { int kn, p, j1, j2, j3, jcid, k1, k2, k3, kcid, ok; if (0 == model) { return(1); } ok = 1; // stays = 1 if 4-fold degenerate for (p = 0; p < 3; ++p) { // cycle through the three codonpos files if (0 != codonpos[base0[2]][p]) { // coding if (1 == codonpos[base0[2]][p]) { j1 = sequences[base0[0]][0]; j2 = sequences[base0[0]+1][0]; j3 = sequences[base0[0]+2][0]; } else if (2 == codonpos[base0[2]][p]) { j1 = sequences[base0[0]-1][0]; j2 = sequences[base0[0]][0]; j3 = sequences[base0[0]+1][0]; } else if (3 == codonpos[base0[2]][p]) { j1 = sequences[base0[0]-2][0]; j2 = sequences[base0[0]-1][0]; j3 = sequences[base0[0]][0]; } else if (6 == codonpos[base0[2]][p]) { j3 = (sequences[base0[0]][0] + 2) % 4; j2 = (sequences[base0[0]+1][0] + 2) % 4; j1 = (sequences[base0[0]+2][0] + 2) % 4; } else if (5 == codonpos[base0[2]][p]) { j3 = (sequences[base0[0]-1][0] + 2) % 4; j2 = (sequences[base0[0]][0] + 2) % 4; j1 = (sequences[base0[0]+1][0] + 2) % 4; } else if (4 == codonpos[base0[2]][p]) { j3 = (sequences[base0[0]-2][0] + 2) % 4; j2 = (sequences[base0[0]-1][0] + 2) % 4; j1 = (sequences[base0[0]][0] + 2) % 4; } else { cerr << "Can't get here!\n"; exit(EXIT_FAILURE); } jcid = 16 * j1 + 4 * j2 + j3; // actual codon id for (kn = 0; kn < 4; ++kn) { // possible replacement nt k1 = j1; k2 = j2; k3 = j3; if (1 == codonpos[base0[2]][p]) { k1 = kn; } else if (2 == codonpos[base0[2]][p]) { k2 = kn; } else if (3 == codonpos[base0[2]][p]) { k3 = kn; } else if (6 == codonpos[base0[2]][p]) { k3 = (kn + 2) % 4; } else if (5 == codonpos[base0[2]][p]) { k2 = (kn + 2) % 4; } else if (4 == codonpos[base0[2]][p]) { k1 = (kn + 2) % 4; } else { cerr << "Can't get here!\n"; exit(EXIT_FAILURE); } kcid = 16 * k1 + 4 * k2 + k3; // replacement codon id if (aa2aa[kcid] != aa2aa[jcid]) { ok = 0; // replacement aa != initial aa i.e. nonsynonymous } } // close loop over kn } // close 'if (0 != codonpos[base0[2]][p])' } // close loop through codonpos files return(ok); //ok = // 0 -> not 4-fold degenerate // 1 -> 4-fold degenerate } //----------------------------------------------------------------------------- // Makes 4x4 nucleotide P matrix for a given t. int calc_nuc(double nuc[][4], double ttt, double nucB1[][4], double nucB2[][4], double nucD[][4]) { int ni, nj, nk; double tempA[4][4]; for (ni = 0; ni < 4; ++ni) { for (nj = 0; nj < 4; ++nj) { tempA[ni][nj] = nucB1[ni][nj]*exp(ttt*nucD[nj][nj]); } } for (ni = 0; ni < 4; ++ni) { for (nj = 0; nj < 4; ++nj) { nuc[ni][nj] = 0.; for (nk = 0; nk < 4; ++nk) { nuc[ni][nj] += tempA[ni][nk]*nucB2[nk][nj]; } if (nuc[ni][nj] < 0.) { nuc[ni][nj] = 0.; // in case of rounding errors } } } return(0); } //----------------------------------------------------------------------------- // Check input aligned sequence pair for gaps occurring not in multiples of 3 // within coding regions. Output a warning if any problem. int checkgaps(int lookup[][3], int codonpos[][3], int nseq0, int np) { int i, k, cdstest, gaptest, count; for (k = 0; k < 2; ++k) { // step through seq1 and seq2 cdstest = 0; // 0 -> in a non-coding region; 1 -> in a coding region gaptest = 0; // 0 -> in a non-gapped region; 1 -> in a gapped region for (i = 0; i < nseq0; ++i) { if (!cdstest) { // non-coding region if (-1 != lookup[i][k]) { // not a gap in seq k if (0 != codonpos[lookup[i][2]][0]) { cdstest = 1; // switch to coding region } } } else { // coding-region if (gaptest) { // gapped region if (-1 == lookup[i][k]) { // a gap in seq k count += 1; } else if (0 == codonpos[lookup[i][2]][0]) { cdstest = 0; // switch to non-coding region gaptest = 0; // switch to non-gapped region } else { gaptest = 0; // switch to non-gapped region if (0 != count % 3) { cerr << "Warning: gap within coding region not a multiple of " << "three nt;\n sequence pair " << np+1 << ", sequence " << k+1 << ", sequence coords " << lookup[i][k] << ", alignment coords " << i << ".\n"; } } } else { // not a gapped region if (-1 == lookup[i][k]) { gaptest = 1; // switch to gapped region count = 1; } else if (0 == codonpos[lookup[i][2]][0]) { cdstest = 0; // switch to non-coding region } } } } } return(0); }