1 #include "SubstitutionMatrix.h"
2 #include "Util.h"
3 #include "Debug.h"
4 #include "lambda_calculator.h"
5 
6 
7 #include <cstring>
8 #include <algorithm>
9 #include <fstream>
10 #include <cmath>
11 #include <climits>
12 
SubstitutionMatrix(const char * filename,float bitFactor,float scoreBias)13 SubstitutionMatrix::SubstitutionMatrix(const char *filename, float bitFactor, float scoreBias) : bitFactor(bitFactor) {
14     std::pair<std::string, std::string> parsedMatrix = BaseMatrix::unserialize(filename);
15     if(parsedMatrix.second != "") {
16         // the filename can contain the substituion matrix
17         // SUBMATNAME.out:DATA
18         // this is used for index databases
19         matrixName = parsedMatrix.first;
20         matrixData = parsedMatrix.second;
21     } else {
22         // read amino acid substitution matrix from file
23         std::string fileName(parsedMatrix.first.c_str());
24         matrixName = Util::base_name(fileName, "/\\");
25         if (fileName.length() < 4 || fileName.substr(fileName.length() - 4, 4).compare(".out") != 0) {
26             Debug(Debug::ERROR) << "Invalid format of the substitution matrix input file! Only .out files are accepted.\n";
27             EXIT(EXIT_FAILURE);
28         }
29         std::ifstream in(fileName);
30         if (in.fail()) {
31             Debug(Debug::ERROR) << "Cannot read " << filename << "\n";
32             EXIT(EXIT_FAILURE);
33         }
34         matrixData = std::string((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
35         in.close();
36     }
37 
38     std::pair<int, bool> alphSizeAndX = setAaMappingDetectAlphSize(matrixData);
39     alphabetSize = alphSizeAndX.first;
40     if(alphabetSize == -1){
41         Debug(Debug::ERROR) << "Could not estimate alphabet size.\n";
42         EXIT(EXIT_FAILURE);
43     }
44     initMatrixMemory(alphabetSize);
45     readProbMatrix(matrixData, alphSizeAndX.second);
46 
47     setupLetterMapping();
48 
49     //print(probMatrix, num2aa, alphabetSize);
50     generateSubMatrix(probMatrix, subMatrixPseudoCounts, subMatrix, alphabetSize, true, bitFactor, scoreBias);
51 }
52 
53 
estimateLambdaAndBackground(const double ** scoreMatrix,int alphabetSize,double * pBack,double & lambda)54 bool SubstitutionMatrix::estimateLambdaAndBackground(const double **scoreMatrix,
55                                                      int alphabetSize, double *pBack, double &lambda) {
56     // We need to pass the parameters as 1-based pointers, hence the +1s and -1s.
57     std::vector<double> cells(alphabetSize * (alphabetSize + 1));
58     std::vector<const double *> pointers(alphabetSize + 1);
59 
60     for (int i = 0; i < alphabetSize; ++i) {
61         pointers[i + 1] = &cells[i * alphabetSize];
62         for (int j = 0; j < alphabetSize; ++j) {
63             cells[i * alphabetSize + j + 1] = scoreMatrix[i][j];
64         }
65     }
66 
67     std::vector<double> letterProbs1(alphabetSize, 0);
68     std::vector<double> letterProbs2(alphabetSize, 0);
69 
70     lambda = calculate_lambda(&pointers[0], alphabetSize,
71                               &letterProbs1[0] - 1,
72                               &letterProbs2[0] - 1);
73 
74     for (int i = 0; i < alphabetSize; i++) {
75         pBack[i] = letterProbs1[i];
76     }
77 
78     if (lambda < 0)
79         return false; //bad
80     else
81         return true; //good
82 }
83 
84 
calcLocalAaBiasCorrection(const BaseMatrix * m,const unsigned char * int_sequence,const int N,float * compositionBias)85 void SubstitutionMatrix::calcLocalAaBiasCorrection(const BaseMatrix *m,
86                                                    const unsigned char *int_sequence,
87                                                    const int N,
88                                                    float *compositionBias) {
89     const int windowSize = 40;
90     for (int i = 0; i < N; i++) {
91         const int minPos = std::max(0, (i - windowSize / 2));
92         const int maxPos = std::min(N, (i + windowSize / 2));
93         const int windowLength = maxPos - minPos;
94 
95         // negative score for the amino acids in the neighborhood of i
96         int sumSubScores = 0;
97         short *subMat = m->subMatrix[int_sequence[i]];
98         for (int j = minPos; j < maxPos; j++) {
99             sumSubScores += subMat[int_sequence[j]];
100         }
101 
102         // remove own amino acid
103         sumSubScores -= subMat[int_sequence[i]];
104         float deltaS_i = (float) sumSubScores;
105         // negative avg.
106         deltaS_i /= -1.0 * static_cast<float>(windowLength);
107         // positive score for the background score distribution for i
108         for (int a = 0; a < m->alphabetSize; a++) {
109             deltaS_i += m->pBack[a] * static_cast<float>(subMat[a]);
110         }
111         compositionBias[i] = deltaS_i;
112 //        std::cout << i << " " << compositionBias[i] << std::endl;
113     }
114 }
115 
116 
calcProfileProfileLocalAaBiasCorrection(short * profileScores,const size_t profileAASize,const int N,size_t alphabetSize)117 void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrection(short *profileScores,
118                                                                  const size_t profileAASize,
119                                                                  const int N, size_t alphabetSize) {
120 
121     const int windowSize = 40;
122 
123     float * pnul  = new float[alphabetSize];
124     float * aaSum = new float[alphabetSize];
125 
126     memset(pnul, 0, sizeof(float) * alphabetSize);
127 
128 
129     for (int pos = 0; pos < N; pos++) {
130         const short * subMat = profileScores + (pos * profileAASize);
131         for(size_t aa = 0; aa < alphabetSize; aa++) {
132             pnul[aa] += subMat[aa]  ;
133         }
134     }
135     for(size_t aa = 0; aa < alphabetSize; aa++)
136         pnul[aa] /= N;
137     for (int i = 0; i < N; i++){
138         const int minPos = std::max(0, (i - windowSize/2));
139         const int maxPos = std::min(N, (i + windowSize/2));
140         const int windowLength = maxPos - minPos;
141         // negative score for the amino acids in the neighborhood of i
142         memset(aaSum, 0, sizeof(float) * alphabetSize);
143 
144         for (int j = minPos; j < maxPos; j++){
145             const short * subMat = profileScores + (j * profileAASize);
146             if( i == j )
147                 continue;
148             for(size_t aa = 0; aa < alphabetSize; aa++){
149                 aaSum[aa] += subMat[aa] - pnul[aa];
150             }
151         }
152         for(size_t aa = 0; aa < alphabetSize; aa++) {
153             profileScores[i*profileAASize + aa] = static_cast<int>((profileScores + (i * profileAASize))[aa] - aaSum[aa]/windowLength);
154         }
155     }
156     delete [] aaSum;
157     delete [] pnul;
158 }
159 
calcProfileProfileLocalAaBiasCorrectionAln(int8_t * profileScores,int N,size_t alphabetSize,BaseMatrix * subMat)160 void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores,
161                                                                     int N, size_t alphabetSize, BaseMatrix *subMat) {
162 
163     const int windowSize = 40;
164 
165     float * pnul = new float[N]; // expected score of the prof ag. a random (blosum bg dist) seq
166     memset(pnul, 0, sizeof(float) * N);
167     float * aaSum = new float[alphabetSize];
168 
169     ProfileStates ps(alphabetSize,subMat->pBack);
170 
171     for (int pos = 0; pos < N; pos++) {
172         for(size_t aa = 0; aa < alphabetSize; aa++) {
173             pnul[pos] += *(profileScores + pos + N*aa) * ps.prior[aa];
174         }
175     }
176 
177     for (int i = 0; i < N; i++){
178         const int minPos = std::max(0, (i - windowSize/2));
179         const int maxPos = std::min(N, (i + windowSize/2));
180         const int windowLength = maxPos - minPos;
181         // negative score for the amino acids in the neighborhood of i
182         memset(aaSum, 0, sizeof(float) * alphabetSize);
183 
184         for (int j = minPos; j < maxPos; j++){
185             if( i == j )
186                 continue;
187             for(size_t aa = 0; aa < alphabetSize; aa++){
188                 aaSum[aa] += *(profileScores + aa*N + j) - pnul[j];
189             }
190         }
191         for(size_t aa = 0; aa < alphabetSize; aa++) {
192             profileScores[i + aa*N] = static_cast<int8_t>(*(profileScores + i + N*aa) - aaSum[aa]/windowLength);
193         }
194     }
195     delete [] aaSum;
196     delete [] pnul;
197 }
198 
199 
200 
201 /* Compute aa correction
202    => p(a) =  ( \prod_{i=1}^L pi(a) )^(1/L)
203    => p(a) = 2^[ (1/L) * \log2 ( \prod_{i=1}^L pi(a) )
204    => p(a) = 2^[ (1/L) * \sum_{i=1}^L  \log2 pi(a) ]
205    => p(a) = 2^[ (1/L) * \sum_{i=1}^L  \log2 ( pi(a) / f(a) )  + log2 f(a) ]
206    => p(a) = f(a) * 2^[ (1/L) * \sum_{i=1}^L  S(pi, a) ]
207  */
208 
calcGlobalAaBiasCorrection(const BaseMatrix * m,short * profileScores,float * pNullBuffer,const size_t profileAASize,const int N)209 void SubstitutionMatrix::calcGlobalAaBiasCorrection(const BaseMatrix *m,
210                                                     short *profileScores,
211                                                     float *pNullBuffer,
212                                                     const size_t profileAASize,
213                                                     const int N) {
214     memset(pNullBuffer, 0, sizeof(float) * N);
215     const int windowSize = 40;
216     for (int pos = 0; pos < N; pos++) {
217         const short * subMat = profileScores + (pos * profileAASize);
218         for(size_t aa = 0; aa < 20; aa++) {
219             pNullBuffer[pos] += m->pBack[aa] * static_cast<float>(subMat[aa]);
220         }
221     }
222 //    for(size_t aa = 0; aa < 20; aa++)
223 //        pNullBuffer[aa] /= N;
224     for (int i = 0; i < N; i++) {
225         const int minPos = std::max(0, (i - windowSize / 2));
226         const int maxPos = std::min(N, (i + windowSize / 2));
227         const int windowLength = maxPos - minPos;
228         // negative score for the amino acids in the neighborhood of i
229         float aaSum[20];
230         memset(aaSum, 0, sizeof(float) * 20);
231 
232         for (int j = minPos; j < maxPos; j++) {
233             const short *subMat = profileScores + (j * profileAASize);
234             if (i == j) {
235                 continue;
236             }
237             for (size_t aa = 0; aa < 20; aa++) {
238                 aaSum[aa] += subMat[aa] - pNullBuffer[j];
239             }
240         }
241         for (size_t aa = 0; aa < 20; aa++) {
242 //            printf("%d\t%d\t%2.3f\t%d\n", i, (profileScores + (i * profileAASize))[aa],
243 //                   aaSum[aa]/windowLength,
244 //                   static_cast<int>((profileScores + (i * profileAASize))[aa] -  aaSum[aa]/windowLength) );
245             //std::cout << i << "\t" << (profileScores + (i * profileAASize))[aa] << "\t" <<  aaSum[aa]/windowLength << "\t" <<  (profileScores + (i * profileAASize))[aa] -  aaSum[aa]/windowLength << std::endl;
246             profileScores[i * profileAASize + aa] = static_cast<int>((profileScores + (i * profileAASize))[aa] -
247                                                                      aaSum[aa] / windowLength);
248 //            avg += static_cast<int>((profileScores + (i * profileAASize))[aa] -  aaSum[aa]/windowLength);
249         }
250     }
251 //    std::cout << "avg=" << avg/(N*20) << std::endl;
252 }
253 
254 
~SubstitutionMatrix()255 SubstitutionMatrix::~SubstitutionMatrix() {
256 }
257 
setupLetterMapping()258 void SubstitutionMatrix::setupLetterMapping(){
259     for(int letter = 0; letter < UCHAR_MAX; letter++){
260         char upperLetter = toupper(static_cast<char>(letter));
261         switch(upperLetter){
262             case 'A':
263             case 'T':
264             case 'G':
265             case 'C':
266             case 'D':
267             case 'E':
268             case 'F':
269             case 'H':
270             case 'I':
271             case 'K':
272             case 'L':
273             case 'M':
274             case 'N':
275             case 'P':
276             case 'Q':
277             case 'R':
278             case 'S':
279             case 'V':
280             case 'W':
281             case 'Y':
282             case 'X':
283                 this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>(upperLetter)];
284                 break;
285             case 'J':
286                 this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'L'];
287                 break;
288             case 'U':
289             case 'O':
290                 this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'X'];
291                 break;
292             case 'Z': this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'E']; break;
293             case 'B': this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'D']; break;
294             default:
295                 this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'X'];
296                 break;
297         }
298     }
299 }
300 
parseAlphabet(char * word,char * num2aa,int * aa2num)301 int SubstitutionMatrix::parseAlphabet(char *word, char *num2aa, int *aa2num) {
302     char *charReader = word;
303     int minAAInt = INT_MAX;
304     // find amino acid with minimal int value
305     while (isalpha(*charReader)) {
306         const char aa = *charReader;
307         const int intAA = aa2num[static_cast<int>(aa)];
308         minAAInt = std::max(minAAInt, intAA);
309         charReader++;
310     }
311     if(minAAInt==-1){
312 
313     }
314     char minAAChar = num2aa[minAAInt];
315     // do alphabet reduction
316     charReader = word;
317     while (isalpha(*charReader)) {
318         const char aa = *charReader;
319         const int intAA = aa2num[static_cast<int>(aa)];
320         aa2num[static_cast<int>(aa)] = minAAInt;
321         num2aa[intAA] = minAAChar;
322         charReader++;
323     }
324     return minAAInt;
325 }
326 
readProbMatrix(const std::string & matrixData,const bool containsX)327 void SubstitutionMatrix::readProbMatrix(const std::string &matrixData, const bool containsX) {
328     std::stringstream in(matrixData);
329     std::string line;
330     bool probMatrixStart = false;
331     const char *words[256];
332     bool hasLambda = false;
333     bool hasBackground = false;
334     while (in.good()) {
335         getline(in, line);
336         size_t wordCnt = Util::getWordsOfLine((char *) line.c_str(), words, 256);
337         // skip comments
338         if (line[0] == '#') {
339             if (line.find("# Background (precomputed optional):") == 0) {
340                 for (size_t i = 4; i < wordCnt; i++) {
341                     double f = strtod(words[i], NULL);
342                     pBack[i-4] = f;
343                 }
344                 hasBackground = true;
345             }
346             if (line.find("# Lambda     (precomputed optional):") == 0) {
347                 double f = strtod(words[4], NULL);
348                 lambda = f;
349                 hasLambda = true;
350             }
351             continue;
352         }
353 
354         if (wordCnt > 1 && probMatrixStart == false) {
355             probMatrixStart = true;
356             continue;
357         }
358         if (wordCnt > 1 && probMatrixStart == true) {
359             if (isalpha(words[0][0]) == false) {
360                 Debug(Debug::ERROR) << "First element in probability line must be an alphabet letter.\n";
361                 EXIT(EXIT_FAILURE);
362             }
363             int aa = static_cast<int>(aa2num[toupper(words[0][0])]);
364             for (int i = 0; i < alphabetSize; i++) {
365                 double f = strtod(words[i + 1], NULL);
366                 probMatrix[aa][i] = f; // divided by 2 because we scale bit/2 ot bit
367             }
368         }
369     }
370     bool xIsPositive = false;
371     if( containsX == true ){
372         for (int j = 0; j < alphabetSize; j++) {
373             int xIndex = static_cast<int>(aa2num[static_cast<int>('X')]);
374             if ((probMatrix[xIndex][j] > 0) || (probMatrix[j][xIndex] > 0)) {
375                 xIsPositive = true;
376                 break;
377             }
378         }
379     }
380 
381 
382     if (containsX == false) {
383         Debug(Debug::ERROR) << "Please add X to your substitution matrix.\n";
384         EXIT(EXIT_FAILURE);
385     }
386 
387     if(hasLambda == false || hasBackground == false){
388         if (estimateLambdaAndBackground(const_cast<const double **>(probMatrix), alphabetSize - ((xIsPositive) ? 0 : 1),
389                                         pBack, lambda) == false) {
390             Debug(Debug::ERROR) << "Computing inverse of substitution matrix failed\n";
391             EXIT(EXIT_FAILURE);
392         }
393         pBack[static_cast<int>(aa2num[static_cast<int>('X')])]=ANY_BACK;
394     }
395     if(xIsPositive == false){
396         for (int i = 0; i < alphabetSize - 1; i++) {
397             pBack[i] = pBack[i] * (1.0 - pBack[static_cast<int>(aa2num[static_cast<int>('X')])]);
398         }
399     }
400     // Reconstruct Probability Sab=(Pab/Pa*Pb) -> Pab = exp(Sab) * Pa * Pb
401     for (int i = 0; i < alphabetSize; i++) {
402         //smat[i] = smatData+((subMat.alphabetSize-1)*i);
403         for (int j = 0; j < alphabetSize; j++) {
404             probMatrix[i][j] = std::exp(lambda * probMatrix[i][j]) * pBack[i] * pBack[j];
405         }
406     }
407 }
408 
setAaMappingDetectAlphSize(std::string & matrixData)409 std::pair<int, bool> SubstitutionMatrix::setAaMappingDetectAlphSize(std::string &matrixData){
410     std::stringstream in(matrixData);
411     std::string line;
412     const char *words[256];
413     int alphabetSize = 0;
414     bool containsX;
415     while (in.good()) {
416         getline(in, line);
417         size_t wordCnt = Util::getWordsOfLine((char *) line.c_str(), words, 256);
418 
419         if (line[0] == '#') {
420             continue;
421         }
422         if (wordCnt > 1) {
423             for (size_t i = 0; i < wordCnt; i++) {
424                 if (isalpha(words[i][0]) == false) {
425                     Debug(Debug::ERROR) << "Probability matrix must start with alphabet header.\n";
426                     EXIT(EXIT_FAILURE);
427                 }
428                 int aa = toupper(words[i][0]);
429                 aa2num[aa] = static_cast<unsigned char>(i);
430                 num2aa[i] = aa;
431                 if (aa == 'X') {
432                     containsX = true;
433                 }
434 //                column_aa[i] = parseAlphabet(words[i], num2aa, aa2num);
435             }
436             alphabetSize = wordCnt;
437             return std::make_pair(alphabetSize, containsX);
438         }
439     }
440     return std::make_pair(-1, false);
441 }
442 
443 
444 
445 
446