/* # Copyright (C) 2005 Andrew E Firth, University of Otago, Dunedin, # New Zealand, aef(at)sanger.otago.ac.nz # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License (version 2) as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. */ // Input a sequence and nucleotide, amino acid and codon weight matrices. // Apply random mutations taking into account the weights until required // total number of observed or accepted mutations is reached. Includes // non-coding, single-coding or double-coding constraints depending on // input CDS files. // Input seq1, seq2, seq3 are aligned sequences (with gaps as appropriate). // Input CDS files and range coords are in seq1 coords (not alignment // coords). Seq1 is mutated. Seq2 and seq3 are only used to add gaps (and // ambig nt) to output sequence and to make mask file of where to count muts // towards #obsmuts. // Ambig nt not allowed in input seq1. // If wholeseq = 2, mutations are applied to seq1 within the entire input // range, but only counted towards #obsmuts if within the alt model CDSs. #include #include #include #include #include #include using namespace std; #define maxlength 35000 // Function declarations. int readsequence(char seq[128], int gsequences[][3], int seqid); int readorfs(char cpos[128], int codonpos[][2], int length, int circular, int posid); int main(int argc, char* argv[]) { if (2 != argc) { cerr << "Aborting: need setup file; use '" << argv[0] << " setupfile'.\n"; exit(EXIT_FAILURE); } //--------------------------------------------------------------------------- // Variable declarations. int i, j, k, temp95, lines, timeout, niter, test, norfs; int wholeseq, range1, range2, simcount, nsims, model, seed, nseq0; int j1[2], j2[2], j3[2], jcid[2], base, mut, test2, frac, observed, extn; int potmut, prev, obscount, acccount, aa2aa[64], degnseq, extn1, extn2; int sequences[maxlength+10][2], gsequences[maxlength+10][3], nseq2, nseq3; int codonpos[maxlength+10][2], mask[maxlength+10], circular; double lambda1, lambda2, max0, sand, x, x1, x2; double nuc[4][4], codon[64], score[20][20], weight[64][64], P[4], Q[4][3]; char aa[20], aa2codon[64], temp, temp2[128], seq_suffix[20]; char seq1file[128], seq2file[128], seq3file[128], seq_file[128]; int verbose = 0; timeout = 1000000; extn = 20; //--------------------------------------------------------------------------- // Setup parameters. // Open setup file. ifstream setupfile(argv[1]); if (!setupfile) { cerr << "Aborting: can't find setup file '" << argv[1] << "'.\n"; exit(EXIT_FAILURE); } // Read setup parameters. setupfile.getline(seq1file, 128, ' '); setupfile.ignore(1000, '\n'); setupfile.getline(seq2file, 128, ' '); setupfile.ignore(1000, '\n'); setupfile.getline(seq3file, 128, ' '); setupfile.ignore(1000, '\n'); setupfile >> wholeseq; if (0 == wholeseq || 2 == wholeseq) { setupfile >> range1; setupfile >> range2; } setupfile.ignore(1000, '\n'); setupfile >> model; setupfile.ignore(1000, '\n'); setupfile >> lambda1; setupfile >> lambda2; setupfile.ignore(1000, '\n'); setupfile >> frac; setupfile.ignore(1000, '\n'); setupfile >> observed; setupfile.ignore(1000, '\n'); setupfile >> nsims; setupfile.ignore(1000, '\n'); setupfile >> seed; setupfile.ignore(1000, '\n'); setupfile >> circular; setupfile.ignore(1000, '\n'); setupfile.close(); //--------------------------------------------------------------------------- // Amino acid, nucleotide and codon weight matrices. // Number of lines in 20 x 20 amino acid weight matrix. ifstream matrixfile("aa.dat"); if (!matrixfile) { cerr << "\nAborting: can't find file 'aa.dat'.\n"; exit(EXIT_FAILURE); } lines = 0; while (matrixfile.ignore(1000, '\n')) { ++lines; } matrixfile.clear(); matrixfile.seekg(0); // Read amino acid weight matrix. temp = '#'; while ('#' == temp) { matrixfile.get(temp); if ('#' == temp) { matrixfile.ignore(1000, '\n'); } } for (j = 0; j < 20; ++j) { matrixfile >> aa[j]; } matrixfile.ignore(1000, '\n'); for (i = 0; i < 20; ++i) { for (j = 0; j <= i; ++j) { matrixfile >> score[i][j]; } matrixfile.ignore(1000, '\n'); } matrixfile.close(); // Fill in other half of weight matrix. for (i = 0; i < 20; ++i) { for (j = i + 1; j < 20; ++j) { score[i][j] = score[j][i]; } } // Check no values greater than corresponding values on the diagonal. for (i = 0; i < 20; ++i) { for (k = 0; k < 20; ++k) { if (score[i][k] > score[i][i] || score[i][k] > score[k][k]) { cerr << "Aborting: scoring matrix value at " << i+1 << ", " << k+1 << " is greater than value on diagonal.\n"; exit(EXIT_FAILURE); } } } // Divide each _row_ i by the diagonal term. I.e. the weight for an amino // acid to be replaced by itself is unity. (Note that the matrix is no // longer symetric.) for (i = 0; i < 20; ++i) { max0 = score[i][i]; for (k = 0; k < 20; ++k) { score[i][k] /= max0; } } // Check no negative values. for (i = 0; i < 20; ++i) { for (k = 0; k < 20; ++k) { if (score[i][k] < 0.) { cerr << "Aborting: scoring matrix value at " << i+1 << ", " << k+1 << " is negative.\n"; exit(EXIT_FAILURE); } } } // Read codon to amino acid table. ifstream aa2codonfile("aa2codon.dat"); if (!aa2codonfile) { cerr << "\nAborting: can't find file 'aa2codon.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 64; ++i) { aa2codonfile.get(aa2codon[i]); aa2codonfile.ignore(1000, '\n'); } aa2codonfile.close(); // Lookup table connecting aa2codon[64] and aa[20] (aa[20] is the // order in which the amino acids appear in the weight matrix). for (i = 0; i < 64; ++i) { aa2aa[i] = 99; // for STOP codons for (j = 0; j < 20; ++j) { if (aa2codon[i] == aa[j]) { aa2aa[i] = j; } } } // Paste weights into 64 x 64 matrix; set stop codons = 1, 0. Weight // is for row i -> column k. for (i = 0; i < 64; ++i) { for (j = 0; j < 64; ++j) { if (99 == aa2aa[i] && 99 == aa2aa[j]) { weight[i][j] = 1.; } else if (99 == aa2aa[i]) { weight[i][j] = 0.; } else if (99 == aa2aa[j]) { weight[i][j] = 0.; } else { weight[i][j] = score[aa2aa[i]][aa2aa[j]]; } } } // Read 'instantaneous' 4 x 4 nucleotide mutation matrix (order is UCAG; // diagonal terms ignored, scaling unimportant; row i -> column k.) ifstream nucfile("nuc.dat"); if (!nucfile) { cerr << "\nAborting: can't find file 'nuc.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { nucfile >> nuc[i][k]; } nucfile.ignore(1000, '\n'); } nucfile.close(); for (i = 0; i < 4; ++i) { nuc[i][i] = 0.; } for (i = 0; i < 4; ++i) { for (k = 0; k < 4; ++k) { if (nuc[i][k] < 0.) { cerr << "Aborting: nucleotide matrix value at " << i+1 << ", " << k+1 << " is negative.\n"; exit(EXIT_FAILURE); } } } // Read codon usage table (scaling unimportant; set all values equal if you // don't want to use codon usage weights). ifstream codonfile("codon.dat"); if (!codonfile) { cerr << "\nAborting: can't find file 'codon.dat'.\n"; exit(EXIT_FAILURE); } for (i = 0; i < 64; ++i) { codonfile.getline(temp2, 128, ' '); codonfile >> codon[i]; codonfile.ignore(1000, '\n'); } codonfile.close(); for (i = 0; i < 64; ++i) { if (codon[i] < 0.) { cerr << "Aborting: codon usage table value at " << i+1 << " is negative.\n"; exit(EXIT_FAILURE); } } // Scale so that maximum codon usage value is unity. max0 = 0.; for (i = 0; i < 64; ++i) { if (codon[i] > max0) { max0 = codon[i]; } } for (i = 0; i < 64; ++i) { codon[i] /= max0; } // Apply codon usage weighting to weight matrix. for (i = 0; i < 64; ++i) { for (j = 0; j < 64; ++j) { weight[i][j] *= codon[j]; } } // Check no values > 1. for (i = 0; i < 64; ++i) { for (k = 0; k < 64; ++k) { if (weight[i][k] > 1.) { cerr << "Aborting: weight matrix value at " << i+1 << ", " << k+1 << " is greater than unity.\n"; exit(EXIT_FAILURE); } } } // P(a given ? mutates to a different nucleotide), where ? = A, C, G or T. P[0] = nuc[0][1] + nuc[0][2] + nuc[0][3]; P[1] = nuc[1][0] + nuc[1][2] + nuc[1][3]; P[2] = nuc[2][0] + nuc[2][1] + nuc[2][3]; P[3] = nuc[3][0] + nuc[3][1] + nuc[3][2]; // Normalization value (so that maximum value = 1). max0 = P[0]; if (P[1] > max0) {max0 = P[1];} if (P[2] > max0) {max0 = P[2];} if (P[3] > max0) {max0 = P[3];} // P(a given ? mutates to a given nucleotide) (limits). Q[0][1] = nuc[0][1] / max0; Q[0][2] = (nuc[0][1] + nuc[0][2]) / max0; Q[0][3] = P[0] / max0; Q[1][1] = nuc[1][0] / max0; Q[1][2] = (nuc[1][0] + nuc[1][2]) / max0; Q[1][3] = P[1] / max0; Q[2][1] = nuc[2][0] / max0; Q[2][2] = (nuc[2][0] + nuc[2][1]) / max0; Q[2][3] = P[2] / max0; Q[3][1] = nuc[3][0] / max0; Q[3][2] = (nuc[3][0] + nuc[3][1]) / max0; Q[3][3] = P[3] / max0; if (verbose) { cout << "Relative mutation frequencies:\n Ptm = " << Q[0][3] << ", Pcm = " << Q[1][3] << ", Pam = " << Q[2][3] << ", Pgm = " << Q[3][3] << "\n"; for (i = 0; i < 4; ++i) { cout << " " << Q[i][1] << " " << Q[i][2] << " " << Q[i][3] << "\n"; } } //--------------------------------------------------------------------------- // Miscellaneous stuff. // Check model. if (0 != model && 1 != model) { cerr << "\nRequire model = 0 or 1.\n"; exit(EXIT_FAILURE); } // Check required number of Monte Carlo simulations. if (nsims <= 0) { cerr << "\nRequire number of simulations > 0. Adjusting to 1.\n"; nsims = 1; } else { if (verbose) { cout << "Number of simulations = " << nsims << ".\n"; } } // Check random seed. if (seed < 0) { seed = -seed; } if (0 == seed) { seed = 1; } srand(seed); if (verbose) { cout << "Random seed = " << seed << ".\n"; } // Check timeout (maximum number of mutation attempts). if (timeout < 1) { timeout = 1; } // Check range extension (in case of cross-talk into neighbouring regions). if (extn < 0) { extn = 0; } //--------------------------------------------------------------------------- // Read initial sequence pair (aligned - i.e. with appropriate gaps). // Mutations are applied to seq1. Seq2 and seq3 are only used for // positions of extra gaps (and ambig nt). nseq0 = readsequence(seq1file, gsequences, 0); if (verbose) { cout << "Read " << seq1file << ", length = " << nseq0 << ".\n"; } nseq2 = readsequence(seq2file, gsequences, 1); if (verbose) { cout << "Read " << seq2file << ", length = " << nseq2 << ".\n"; } nseq3 = readsequence(seq3file, gsequences, 2); if (verbose) { cout << "Read " << seq3file << ", length = " << nseq3 << ".\n"; } if (nseq2 != nseq0 || nseq3 != nseq0) { cerr << "Aborting: gapped sequences of differing length.\n"; exit(EXIT_FAILURE); } //--------------------------------------------------------------------------- // Degap seq1 and make mask file of gaps and ambig nt in seq2 and seq3 // (in degapped seq1 coords). degnseq = 0; // degapped seq1 length for (i = 0; i < nseq0; ++i) { if (5 == gsequences[i][0]) { // ambig nt in seq1 cerr << "\nDetected an ambiguous nt code in the reference sequence.\n"; cerr << "Can't run Monte Carlo simulations in this situation.\n"; exit(EXIT_FAILURE); } if (9 != gsequences[i][0]) { // not a gap in seq1 sequences[degnseq][0] = gsequences[i][0]; if (9 != gsequences[i][1] && 9 != gsequences[i][2] && 5 != gsequences[i][1] && 5 != gsequences[i][2]) { mask[degnseq] = 1; // neither a gap nor ambig nt in both seq2 and seq3 } else { mask[degnseq] = 0; // a gap or ambig nt in either seq2 or seq3 } ++degnseq; } } if (verbose) { cout << "Degapped sequences. Seq1 length = " << degnseq << ".\n"; } //--------------------------------------------------------------------------- // Codon position files (for degapped seq1). For each nucleotide position: // 0 = non-coding; 1/2/3 = 1st/2nd/3rd positions in a forward read // direction codon; 4/5/6 = 1st/2nd/3rd positions in a reverse // (complementary strand) read direction codon. codonpos[][0] gives null // model. codonpos[][1] gives alternative model. codonpos[][0] + // codonpos[][1] may be non-coding, single-coding (in null or alternative), // double-coding or a mixture, depending on sequence position. Programme // does MC sims for null model if model = 0 and for alternative model if // model = 1. norfs = readorfs("orfs.1", codonpos, degnseq, circular, 0); if (verbose) { cout << "Read 'orfs.1', containing " << norfs << " ORFs.\n"; } norfs = readorfs("orfs.2", codonpos, degnseq, circular, 1); if (verbose) { cout << "Read 'orfs.2', containing " << norfs << " ORFs.\n"; } //--------------------------------------------------------------------------- // If using 'Only use nt within query CDS(s)', mask all nt that are not // annotated as CDSs in the alternate model. if (2 == wholeseq) { for (i = 0; i < degnseq; ++i) { if (0 == codonpos[i][1]) { mask[i] = 0; // non-coding in alt model } } } //--------------------------------------------------------------------------- // Find range boundaries (in degapped seq1). if (0 == wholeseq || 2 == wholeseq) { range1 -= 1; range2 -= 1; // since indices start at 0 if (range1 > range2) { // make range1 < range2 temp95 = range1; range1 = range2; range2 = temp95; } if (range2 > degnseq - 1) { range2 = degnseq - 1; } if (range1 < 0) { range1 = 0; } extn1 = range1 - extn; extn2 = range2 + extn; if (extn2 > degnseq - 1) { if (circular && 2 == wholeseq) { range1 = 0; range2 = degnseq - 1; extn1 = 0; extn2 = degnseq - 1; } else { extn2 = degnseq - 1; } } if (extn1 < 0) { if (circular && 2 == wholeseq) { range1 = 0; range2 = degnseq - 1; extn1 = 0; extn2 = degnseq - 1; } else { extn1 = 0; } } } else { // whole sequence range1 = 0; range2 = degnseq - 1; extn1 = 0; extn2 = degnseq - 1; } //--------------------------------------------------------------------------- // Find range x1-x2 for number of nucleotide mutations (set lambda1 = lambda2 // in the setupfile to fix the number of mutations). // Number of countable sites (within range1-range2 and not gapped in either // sequence). potmut = 0; for (base = range1; base <= range2; ++base) { if (mask[base]) { potmut += 1; } } if (verbose) { cout << "\nNumber of countable mutable sites = " << potmut << ".\n"; } if (lambda1 <= 0.0) { cerr << "\nRequire number of mutations >= 0. Adjusting lambda1 to 0.\n"; lambda1 = 0; } if (lambda2 <= 0.0) { cerr << "\nRequire number of mutations >= 0. Adjusting lambda2 to 0.\n"; lambda2 = 0; } if (frac) { // lambda1, lambda2 are mutations/nucleotide x1 = float(potmut) * lambda1; x2 = float(potmut) * lambda2; } else { // lambda1, lambda2 are total number of mutations x1 = lambda1; x2 = lambda2; } if (verbose) { cout << "\nx1 = " << x1 << ", x2 = " << x2 << "\n"; } //--------------------------------------------------------------------------- // Apply random mutations. if (verbose) { cout << "\nProducing Monte Carlo simulated sequences.\n\n"; } // For each simulation, select total number of accepted/observed point // nucleotide mutations (linear random number in x1-x2 range). // For each mutation, // Randomly pick a nucleotide in the sequence. // Randomly decide what it mutates to, weighting probabilities by 4 x 4 // nucleotide matrix. // Check acceptability of amino acid substitution(s) if applicable. // End. // End. // Note the number of observed mutations may be fewer than the number of // accepted mutations if you get more than one mutation in the same place. // Note mutations only applied within region range1-extn to range2+extn and // only counted towards the total number if in region range1 to range2 and // not gapped in either seq 1 or seq 2. for (simcount = 1; simcount <= nsims; ++simcount) { // Re-initialize simulated sequence. for (i = 0; i < degnseq; ++i) { sequences[i][1] = sequences[i][0]; } // Choose random number of mutations between x1 and x2. sand = float(rand())/float(RAND_MAX); x = x1 + sand * (x2 - x1); obscount = 0; // number of observable mutations acccount = 0; // number of accepted mutations niter = 0; // for timeout // Step through point mutations. for (i = 0; i < int(x); ) { test = 1; // always true, has to exit following loops via break while (test) { // Decide which nucleotide mutates. sand = float(rand())/float(RAND_MAX); base = extn1 + int(float(extn2 - extn1 + 1) * sand); // apply mutations // within extn1--extn2, but only count as observed if in range1--range2 niter += 1; // for timeout if (niter >= timeout) { break; } // Check whether it mutates and what to. The following allows for the // possibility that not all bases ACGT have equal probability of // mutating to a different base. Keeps going through the while loop // until it finds a successful mutation. sand = float(rand())/float(RAND_MAX); if (0 == sequences[base][1]) { if (sand > 0. && sand <= Q[0][1]) {mut = 1; break;} else if (sand > Q[0][1] && sand <= Q[0][2]) {mut = 2; break;} else if (sand > Q[0][2] && sand <= Q[0][3]) {mut = 3; break;} } else if (1 == sequences[base][1]) { if (sand > 0. && sand <= Q[1][1]) {mut = 0; break;} else if (sand > Q[1][1] && sand <= Q[1][2]) {mut = 2; break;} else if (sand > Q[1][2] && sand <= Q[1][3]) {mut = 3; break;} } else if (2 == sequences[base][1]) { if (sand > 0. && sand <= Q[2][1]) {mut = 0; break;} else if (sand > Q[2][1] && sand <= Q[2][2]) {mut = 1; break;} else if (sand > Q[2][2] && sand <= Q[2][3]) {mut = 3; break;} } else if (3 == sequences[base][1]) { if (sand > 0. && sand <= Q[3][1]) {mut = 0; break;} else if (sand > Q[3][1] && sand <= Q[3][2]) {mut = 1; break;} else if (sand > Q[3][2] && sand <= Q[3][3]) {mut = 2; break;} } else { cerr << "Can't get here! (003)\n"; exit(EXIT_FAILURE); } } // exits while loop with nuc at position 'base' mutates to 'mut'. if (niter >= timeout) { break; // don't apply any more mutations } // Check acceptability of amino acid substitution(s) relative to // initial seq 1. test2 = 1; // flag for accepted (if != 0) if ((0 == codonpos[base][0] && 0 == model) || (0 == codonpos[base][0] && 0 == codonpos[base][1] && 1 == model)) { test2 = 1; // use non-coding model - no further weights applied } else { // single or double coding model if (0 != codonpos[base][0]) { // coding in pos.1 // Find the original/previous frame 1 codons in seq1 and sim seq. // Note (x+degnseq)%degnseq, maps 0 <= x <= degnseq-1 -> x; // -1,-2 -> degnseq-1,degnseq-2; degnseq,degnseq+1 -> 0,1. // OK to use this for both circular and non-circular, since if // it is not circular then you shouldn't reference outside // 0 <= x <= degnseq-1, and if by mistake you do, then this stops // you seg-faulting. for (k = 0; k < 2; ++k) { if (1 == codonpos[base][0]) { j1[k] = sequences[(base+degnseq)%degnseq][k]; j2[k] = sequences[(base+1+degnseq)%degnseq][k]; j3[k] = sequences[(base+2+degnseq)%degnseq][k]; } else if (2 == codonpos[base][0]) { j1[k] = sequences[(base-1+degnseq)%degnseq][k]; j2[k] = sequences[(base+degnseq)%degnseq][k]; j3[k] = sequences[(base+1+degnseq)%degnseq][k]; } else if (3 == codonpos[base][0]) { j1[k] = sequences[(base-2+degnseq)%degnseq][k]; j2[k] = sequences[(base-1+degnseq)%degnseq][k]; j3[k] = sequences[(base+degnseq)%degnseq][k]; } else if (6 == codonpos[base][0]) { j3[k] = (sequences[(base+degnseq)%degnseq][k] + 2) % 4; j2[k] = (sequences[(base+1+degnseq)%degnseq][k] + 2) % 4; j1[k] = (sequences[(base+2+degnseq)%degnseq][k] + 2) % 4; } else if (5 == codonpos[base][0]) { j3[k] = (sequences[(base-1+degnseq)%degnseq][k] + 2) % 4; j2[k] = (sequences[(base+degnseq)%degnseq][k] + 2) % 4; j1[k] = (sequences[(base+1+degnseq)%degnseq][k] + 2) % 4; } else if (4 == codonpos[base][0]) { j3[k] = (sequences[(base-2+degnseq)%degnseq][k] + 2) % 4; j2[k] = (sequences[(base-1+degnseq)%degnseq][k] + 2) % 4; j1[k] = (sequences[(base+degnseq)%degnseq][k] + 2) % 4; } else { cerr << "Can't get here! (006)\n"; exit(EXIT_FAILURE); } } if (1 == codonpos[base][0]) { // find the new codon in sim seq j1[1] = mut; } else if (2 == codonpos[base][0]) { j2[1] = mut; } else if (3 == codonpos[base][0]) { j3[1] = mut; } else if (6 == codonpos[base][0]) { j3[1] = (mut + 2) % 4; } else if (5 == codonpos[base][0]) { j2[1] = (mut + 2) % 4; } else if (4 == codonpos[base][0]) { j1[1] = (mut + 2) % 4; } else { cerr << "Can't get here! (009)\n"; exit(EXIT_FAILURE); } for (k = 0; k < 2; ++k) { jcid[k] = 16 * j1[k] + 4 * j2[k] + j3[k]; } sand = float(rand())/float(RAND_MAX); if (sand >= weight[jcid[0]][jcid[1]]) { // rejected test2 = 0; } } if (0 != codonpos[base][1] && 1 == model) { // coding in pos.2 // Find the original/previous frame 2 codons in seq 1 and sim seq. for (k = 0; k < 2; ++k) { if (1 == codonpos[base][1]) { j1[k] = sequences[(base+degnseq)%degnseq][k]; j2[k] = sequences[(base+1+degnseq)%degnseq][k]; j3[k] = sequences[(base+2+degnseq)%degnseq][k]; } else if (2 == codonpos[base][1]) { j1[k] = sequences[(base-1+degnseq)%degnseq][k]; j2[k] = sequences[(base+degnseq)%degnseq][k]; j3[k] = sequences[(base+1+degnseq)%degnseq][k]; } else if (3 == codonpos[base][1]) { j1[k] = sequences[(base-2+degnseq)%degnseq][k]; j2[k] = sequences[(base-1+degnseq)%degnseq][k]; j3[k] = sequences[(base+degnseq)%degnseq][k]; } else if (6 == codonpos[base][1]) { j3[k] = (sequences[(base+degnseq)%degnseq][k] + 2) % 4; j2[k] = (sequences[(base+1+degnseq)%degnseq][k] + 2) % 4; j1[k] = (sequences[(base+2+degnseq)%degnseq][k] + 2) % 4; } else if (5 == codonpos[base][1]) { j3[k] = (sequences[(base-1+degnseq)%degnseq][k] + 2) % 4; j2[k] = (sequences[(base+degnseq)%degnseq][k] + 2) % 4; j1[k] = (sequences[(base+1+degnseq)%degnseq][k] + 2) % 4; } else if (4 == codonpos[base][1]) { j3[k] = (sequences[(base-2+degnseq)%degnseq][k] + 2) % 4; j2[k] = (sequences[(base-1+degnseq)%degnseq][k] + 2) % 4; j1[k] = (sequences[(base+degnseq)%degnseq][k] + 2) % 4; } else { cerr << "Can't get here! (007)\n"; exit(EXIT_FAILURE); } } if (1 == codonpos[base][1]) { j1[1] = mut; } else if (2 == codonpos[base][1]) { j2[1] = mut; } else if (3 == codonpos[base][1]) { j3[1] = mut; } else if (6 == codonpos[base][1]) { j3[1] = (mut + 2) % 4; } else if (5 == codonpos[base][1]) { j2[1] = (mut + 2) % 4; } else if (4 == codonpos[base][1]) { j1[1] = (mut + 2) % 4; } else { cerr << "Can't get here! (010)\n"; exit(EXIT_FAILURE); } for (k = 0; k < 2; ++k) { jcid[k] = 16 * j1[k] + 4 * j2[k] + j3[k]; } sand = float(rand())/float(RAND_MAX); if (sand >= weight[jcid[0]][jcid[1]]) { test2 = 0; // rejected } } } // close if/else for non-coding versus single/double-coding // Apply mutation before going to next point mutation. if (1 == test2) { prev = sequences[base][1]; sequences[base][1] = mut; // Count accepted and observed mutations (provided not gapped in // seq1 or seq2 and in range range1--range2). if (mask[base] && base >= range1 && base <= range2) { acccount += 1; if (sequences[base][1] != sequences[base][0] && prev == sequences[base][0]) { obscount += 1; } if (sequences[base][1] == sequences[base][0] && prev != sequences[base][0]) { obscount -= 1; } if (observed) { // count observed mutations i = obscount; } else { i = acccount; // count accepted mutations } } } // finish applying mutation } // finish loop over point mutations if (niter >= timeout) { cerr << "Warning: failed to reach requested number of mutations; " << "sequence: " << simcount << ".\n"; } // Write out simulated sequence. ofstream temp_fghjys("temp.fghjys"); temp_fghjys << "." << simcount << "\n"; temp_fghjys.close(); ifstream temp_fghjyt("temp.fghjys"); temp_fghjyt.getline(seq_suffix, 20, '\n'); temp_fghjyt.close(); strcpy(seq_file, "mcseq"); strcat(seq_file, seq_suffix); ofstream sequencen(seq_file); if (!sequencen) { cerr << "Aborting: failed to open output file '" << seq_file << "'.\n"; exit(EXIT_FAILURE); } sequencen << ">Simulation " << simcount << ": " << acccount << " " << obscount << " " << niter; j = 0; for (i = 0; i < nseq0; ++i) { if (0 == i % 60) { sequencen << "\n"; } if (9 == gsequences[i][0]) { // seq1 gap sequencen << "-"; } else if (9 == gsequences[i][1] || 9 == gsequences[i][2] || 5 == gsequences[i][1] || 5 == gsequences[i][2]) { // not a seq1 but seq2 or seq3 gap or ambig nt sequencen << "-"; j++; } else { // not gap or ambig nt if (0 == sequences[j][1]) { sequencen << "U"; } else if (1 == sequences[j][1]) { sequencen << "C"; } else if (2 == sequences[j][1]) { sequencen << "A"; } else if (3 == sequences[j][1]) { sequencen << "G"; } else { cerr << "Can't get here! (004)\n"; exit(EXIT_FAILURE); } j++; } } sequencen << "\n"; sequencen.close(); } // finish loop over simulated sequences } //----------------------------------------------------------------------------- int readsequence(char seq[128], int gsequences[][3], int seqid) { int i, nlines, nseq; char seqline[1000], inputseq[maxlength+10]; ifstream sequence(seq); if (!sequence) { cerr << "Aborting: can't find file '" << seq << "'.\n"; exit(EXIT_FAILURE); } nlines = -1; while (sequence.ignore(1000, '\n')) { ++nlines; } sequence.clear(); sequence.seekg(0); sequence.ignore(1000, '\n'); for (i = 0; i < nlines; ++i) { sequence.getline(seqline, 1000, '\n'); if (0 == i) { strcpy(inputseq, seqline); } else { strcat(inputseq, seqline); } } sequence.close(); for (i = 0; ; ++i) { if (inputseq[i] == 'U' || inputseq[i] == 'u') { gsequences[i][seqid] = 0; } else if (inputseq[i] == 'T' || inputseq[i] == 't') { gsequences[i][seqid] = 0; } else if (inputseq[i] == 'C' || inputseq[i] == 'c') { gsequences[i][seqid] = 1; } else if (inputseq[i] == 'A' || inputseq[i] == 'a') { gsequences[i][seqid] = 2; } else if (inputseq[i] == 'G' || inputseq[i] == 'g') { gsequences[i][seqid] = 3; } else if (inputseq[i] == '-' || inputseq[i] == '.') { gsequences[i][seqid] = 9; } else if (inputseq[i] == '\0') { nseq = i; break; } else { if (inputseq[i] == 'M' || inputseq[i] == 'm' || inputseq[i] == 'R' || inputseq[i] == 'r' || inputseq[i] == 'W' || inputseq[i] == 'w' || inputseq[i] == 'V' || inputseq[i] == 'v' || inputseq[i] == 'H' || inputseq[i] == 'h' || inputseq[i] == 'D' || inputseq[i] == 'd' || inputseq[i] == 'B' || inputseq[i] == 'b' || inputseq[i] == 'S' || inputseq[i] == 's' || inputseq[i] == 'Y' || inputseq[i] == 'y' || inputseq[i] == 'K' || inputseq[i] == 'k' || inputseq[i] == 'N' || inputseq[i] == 'n' || inputseq[i] == 'X' || inputseq[i] == 'x') { gsequences[i][seqid] = 5; // ambiguous nt cerr << "Warning: sequence " << seq << ", ambiguous nt '" << inputseq[i] << "', at " << i+1 << ".\n"; } else { cerr << "Aborting: sequence " << seq << ", unknown nucleotide '" << inputseq[i] << "', at " << i+1 << ".\n"; exit(EXIT_FAILURE); } } } return(nseq); } //----------------------------------------------------------------------------- // Note that if there are overlapping ORFs in the same frame (e.g. alternative // start points) then only one is counted at any nt. // Note that if there are overlapping ORFs in different frames, then only the // first one is counted in the overlap regions. int readorfs(char cpos[128], int codonpos[][2], int length, int circular, int posid) { int i, j, k, orf1, orf2, nseg, count, norfs, test; test = 1; // for warning message for overlapping ORFs for (i = 0; i < length; ++i) { codonpos[i][posid] = 0; } // Work through ORF list. ifstream orffile(cpos); if (!orffile) { cerr << "Aborting: can't find file '" << cpos << "'.\n"; exit(EXIT_FAILURE); } norfs = -1; while (orffile.ignore(1000, '\n')) { ++norfs; } orffile.clear(); orffile.seekg(0); for (i = 0; i < norfs; ++i) { count = 0; orffile >> nseg; nseg /= 2; for (k = 0; k < nseg; ++k) { orffile >> orf1; orffile >> orf2; if (orf1<1||orf2<1||orf1>length||orf2>length) { cerr << "Aborting: ORF " << i+1 << " in " << cpos << ", endpoints outside sequence range.\n"; exit(EXIT_FAILURE); } // Since arrays start at 0 rather than 1. orf1--; orf2--; // Enter 1,2,3 (forward) or 4,5,6 (reverse) into codonpos[][] if (orf1 < orf2) { // forward frame for (j = orf1; j <= orf2; ++j) { // blank codons overlapping joins except for circular genomes if (!((j==orf2&&count+1!=3)||(j==orf2-1&&count+1==1)|| (j==orf1&&count+1!=1)||(j==orf1+1&&count+1==3))|| (circular&&(j==length-1||j==length-2||j==0||j==1))){ if (0 == codonpos[j][posid]) { codonpos[j][posid] = count+1; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping ORFs. " << "Only the first will be used in overlap regions.\n"; test = 0; } } } count = (count+1)%3; } } else { // reverse frame for (j = orf1; j >= orf2; --j) { // blank codons overlapping joins except for circular genomes if (!((j==orf2&&count+1!=3)||(j==orf2+1&&count+1==1)|| (j==orf1&&count+1!=1)||(j==orf1-1&&count+1==3))|| (circular&&(j==length-1||j==length-2||j==0||j==1))){ if (0 == codonpos[j][posid]) { codonpos[j][posid] = count+4; } else { if (test) { cerr << "Warning: " << cpos << " has overlapping ORFs. " << "Only the first will be used in overlap regions.\n"; test = 0; } } } count = (count+1)%3; } } } if (0 != count%3) { cerr << "Warning: number of nucleotides in ORF " << i+1 << " in " << cpos << " not a multiple of 3.\n"; } orffile.ignore(1000, '\n'); } orffile.close(); return(norfs); }