#include #include #include #include #include #include using namespace std; // Programme to read pair frequency blosum matrix (.qij) and write matrix of // f(aa_i,aa_j)/f(aa_i)f(aa_j) values for use in mlogd.cxx etc. int main(int argc, char* argv[]) { int lines, i, j, k; double pairfreqs[20][20], aafreqs[20], total; double matrix1[20][20], matrix2[20][20]; char aa[20], temp; int runmode = 0; //runmode == 0: q_ij/e_ij values //runmode == 1: as for 0, except off-diagonals /= 2 - to counteract the // off-diagonals *= 2 in blosum.freqs. //else: q_ij/e_ij values scaled so that diagonals = 1 and averaged // with transpose so it's still symmetric if (3 != argc) { cerr << "Aborting: use '" << argv[0] << " input_blosumfile output_blosumfile'.\n"; exit(EXIT_FAILURE); } // Number of lines in 20 x 20 amino acid scoring matrix. ifstream matrixfile(argv[1]); if (!matrixfile) { cerr << "Aborting: can't find file '" << argv[1] << "'.\n"; exit(EXIT_FAILURE); } lines = 0; while (matrixfile.ignore(1000, '\n')) { ++lines; } matrixfile.clear(); matrixfile.seekg(0); // Read weight matrix. temp = '#'; while ('#' == temp) { matrixfile.get(temp); if ('#' == temp) { matrixfile.ignore(1000, '\n'); } } for (j = 0; j < 20; ++j) { matrixfile >> aa[j]; } total = 0.; matrixfile.ignore(1000, '\n'); for (i = 0; i < 20; ++i) { for (j = 0; j <= i; ++j) { matrixfile >> pairfreqs[i][j]; if (runmode == 1 && i != j) { pairfreqs[i][j] /= 2.; } total += pairfreqs[i][j]; } matrixfile.ignore(1000, '\n'); } matrixfile.close(); // Scale by grand sum. for (i = 0; i < 20; ++i) { for (j = 0; j <= i; ++j) { pairfreqs[i][j] /= total; } } // Calculate single amino acid frequencies. for (i = 0; i < 20; ++i) { aafreqs[i] = 0.; } for (i = 0; i < 20; ++i) { for (j = 0; j <= i; ++j) { aafreqs[i] += pairfreqs[i][j]; aafreqs[j] += pairfreqs[i][j]; } } for (i = 0; i < 20; ++i) { aafreqs[i] *= 0.5; } // Checked aafreqs[i] agree with those in blosum40.out and sum to 1. // Calculate qij/eij values. for (i = 0; i < 20; ++i) { for (j = 0; j < i; ++j) { pairfreqs[i][j] /= (2. * aafreqs[i] * aafreqs[j]); } } for (i = 0; i < 20; ++i) { pairfreqs[i][i] /= (aafreqs[i] * aafreqs[i]); } // Check no values greater than corresponding values on the diagonal. for (i = 0; i < 20; ++i) { for (j = 0; j < i; ++j) { if (pairfreqs[i][j] > pairfreqs[i][i] || pairfreqs[i][j] > pairfreqs[j][j]) { cerr << "Aborting: scoring matrix value at " << i+1 << ", " << j+1 << " is greater than value on diagonal.\n"; exit(EXIT_FAILURE); } } } if (runmode == 0 || runmode == 1) { // Output 20 x 20 matrix to output file. ofstream outfile(argv[2]); if (!outfile) { cerr << "Aborting: can't open file '" << argv[2] << "'.\n"; exit(EXIT_FAILURE); } outfile << "# created from file '" << argv[1] << "'\n"; outfile << "# q_ij/e_ij values (see make_blosum.cxx)\n"; outfile << "# \n"; for (i = 0; i < 20; ++i) { outfile << " " << aa[i]; } for (i = 0; i < 20; ++i) { outfile << "\n "; for (j = 0; j <= i; ++j) { outfile << pairfreqs[i][j] << " "; } } outfile << "\n "; outfile.close(); } else { // Fill in other half of matrix. // Scale rows so that q_ii = 1 -> Q1. // Scale columns so that q_ii = 1 -> Q2. // (Since the original matrix is symmetric Q1 and Q2 are transposes.) // Take the average of Q1 and Q2 -> Q. // Fill in other half of matrix. for (i = 0; i < 20; ++i) { for (j = i + 1; j < 20; ++j) { pairfreqs[i][j] = pairfreqs[j][i]; } } // Make two copies. for (i = 0; i < 20; ++i) { for (j = 0; j < 20; ++j) { matrix1[i][j] = pairfreqs[i][j]; matrix2[i][j] = pairfreqs[i][j]; } } // Divide each row i in matrix1 by pairfreqs[i][i] for (i = 0; i < 20; ++i) { for (j = 0; j < 20; ++j) { matrix1[i][j] /= pairfreqs[i][i]; } } // Divide each column j in matrix2 by pairfreqs[j][j] for (i = 0; i < 20; ++i) { for (j = 0; j < 20; ++j) { matrix2[i][j] /= pairfreqs[j][j]; } } // Average matrix1 and matrix2. for (i = 0; i < 20; ++i) { for (j = 0; j < 20; ++j) { pairfreqs[i][j] = 0.5 * (matrix1[i][j] + matrix2[i][j]); } } // Output 20 x 20 matrix to output file. ofstream outfile(argv[2]); if (!outfile) { cerr << "Aborting: can't open file '" << argv[2] << "'.\n"; exit(EXIT_FAILURE); } outfile << "# created from file '" << argv[1] << "'\n"; outfile << "# q_ij/e_ij values (see make_blosum.cxx)\n"; outfile << "# \n"; for (i = 0; i < 20; ++i) { outfile << " " << aa[i]; } for (i = 0; i < 20; ++i) { outfile << "\n "; for (j = 0; j < 20; ++j) { outfile << pairfreqs[i][j] << " "; } } outfile << "\n"; outfile.close(); } }