1 #include "SubstitutionMatrix.h"
2 #include "Util.h"
3 #include "Debug.h"
4 #include "lambda_calculator.h"
5
6
7 #include <cstring>
8 #include <algorithm>
9 #include <fstream>
10 #include <cmath>
11 #include <climits>
12
SubstitutionMatrix(const char * filename,float bitFactor,float scoreBias)13 SubstitutionMatrix::SubstitutionMatrix(const char *filename, float bitFactor, float scoreBias) : bitFactor(bitFactor) {
14 std::pair<std::string, std::string> parsedMatrix = BaseMatrix::unserialize(filename);
15 if(parsedMatrix.second != "") {
16 // the filename can contain the substituion matrix
17 // SUBMATNAME.out:DATA
18 // this is used for index databases
19 matrixName = parsedMatrix.first;
20 matrixData = parsedMatrix.second;
21 } else {
22 // read amino acid substitution matrix from file
23 std::string fileName(parsedMatrix.first.c_str());
24 matrixName = Util::base_name(fileName, "/\\");
25 if (fileName.length() < 4 || fileName.substr(fileName.length() - 4, 4).compare(".out") != 0) {
26 Debug(Debug::ERROR) << "Invalid format of the substitution matrix input file! Only .out files are accepted.\n";
27 EXIT(EXIT_FAILURE);
28 }
29 std::ifstream in(fileName);
30 if (in.fail()) {
31 Debug(Debug::ERROR) << "Cannot read " << filename << "\n";
32 EXIT(EXIT_FAILURE);
33 }
34 matrixData = std::string((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
35 in.close();
36 }
37
38 std::pair<int, bool> alphSizeAndX = setAaMappingDetectAlphSize(matrixData);
39 alphabetSize = alphSizeAndX.first;
40 if(alphabetSize == -1){
41 Debug(Debug::ERROR) << "Could not estimate alphabet size.\n";
42 EXIT(EXIT_FAILURE);
43 }
44 initMatrixMemory(alphabetSize);
45 readProbMatrix(matrixData, alphSizeAndX.second);
46
47 setupLetterMapping();
48
49 //print(probMatrix, num2aa, alphabetSize);
50 generateSubMatrix(probMatrix, subMatrixPseudoCounts, subMatrix, alphabetSize, true, bitFactor, scoreBias);
51 }
52
53
estimateLambdaAndBackground(const double ** scoreMatrix,int alphabetSize,double * pBack,double & lambda)54 bool SubstitutionMatrix::estimateLambdaAndBackground(const double **scoreMatrix,
55 int alphabetSize, double *pBack, double &lambda) {
56 // We need to pass the parameters as 1-based pointers, hence the +1s and -1s.
57 std::vector<double> cells(alphabetSize * (alphabetSize + 1));
58 std::vector<const double *> pointers(alphabetSize + 1);
59
60 for (int i = 0; i < alphabetSize; ++i) {
61 pointers[i + 1] = &cells[i * alphabetSize];
62 for (int j = 0; j < alphabetSize; ++j) {
63 cells[i * alphabetSize + j + 1] = scoreMatrix[i][j];
64 }
65 }
66
67 std::vector<double> letterProbs1(alphabetSize, 0);
68 std::vector<double> letterProbs2(alphabetSize, 0);
69
70 lambda = calculate_lambda(&pointers[0], alphabetSize,
71 &letterProbs1[0] - 1,
72 &letterProbs2[0] - 1);
73
74 for (int i = 0; i < alphabetSize; i++) {
75 pBack[i] = letterProbs1[i];
76 }
77
78 if (lambda < 0)
79 return false; //bad
80 else
81 return true; //good
82 }
83
84
calcLocalAaBiasCorrection(const BaseMatrix * m,const unsigned char * int_sequence,const int N,float * compositionBias)85 void SubstitutionMatrix::calcLocalAaBiasCorrection(const BaseMatrix *m,
86 const unsigned char *int_sequence,
87 const int N,
88 float *compositionBias) {
89 const int windowSize = 40;
90 for (int i = 0; i < N; i++) {
91 const int minPos = std::max(0, (i - windowSize / 2));
92 const int maxPos = std::min(N, (i + windowSize / 2));
93 const int windowLength = maxPos - minPos;
94
95 // negative score for the amino acids in the neighborhood of i
96 int sumSubScores = 0;
97 short *subMat = m->subMatrix[int_sequence[i]];
98 for (int j = minPos; j < maxPos; j++) {
99 sumSubScores += subMat[int_sequence[j]];
100 }
101
102 // remove own amino acid
103 sumSubScores -= subMat[int_sequence[i]];
104 float deltaS_i = (float) sumSubScores;
105 // negative avg.
106 deltaS_i /= -1.0 * static_cast<float>(windowLength);
107 // positive score for the background score distribution for i
108 for (int a = 0; a < m->alphabetSize; a++) {
109 deltaS_i += m->pBack[a] * static_cast<float>(subMat[a]);
110 }
111 compositionBias[i] = deltaS_i;
112 // std::cout << i << " " << compositionBias[i] << std::endl;
113 }
114 }
115
116
calcProfileProfileLocalAaBiasCorrection(short * profileScores,const size_t profileAASize,const int N,size_t alphabetSize)117 void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrection(short *profileScores,
118 const size_t profileAASize,
119 const int N, size_t alphabetSize) {
120
121 const int windowSize = 40;
122
123 float * pnul = new float[alphabetSize];
124 float * aaSum = new float[alphabetSize];
125
126 memset(pnul, 0, sizeof(float) * alphabetSize);
127
128
129 for (int pos = 0; pos < N; pos++) {
130 const short * subMat = profileScores + (pos * profileAASize);
131 for(size_t aa = 0; aa < alphabetSize; aa++) {
132 pnul[aa] += subMat[aa] ;
133 }
134 }
135 for(size_t aa = 0; aa < alphabetSize; aa++)
136 pnul[aa] /= N;
137 for (int i = 0; i < N; i++){
138 const int minPos = std::max(0, (i - windowSize/2));
139 const int maxPos = std::min(N, (i + windowSize/2));
140 const int windowLength = maxPos - minPos;
141 // negative score for the amino acids in the neighborhood of i
142 memset(aaSum, 0, sizeof(float) * alphabetSize);
143
144 for (int j = minPos; j < maxPos; j++){
145 const short * subMat = profileScores + (j * profileAASize);
146 if( i == j )
147 continue;
148 for(size_t aa = 0; aa < alphabetSize; aa++){
149 aaSum[aa] += subMat[aa] - pnul[aa];
150 }
151 }
152 for(size_t aa = 0; aa < alphabetSize; aa++) {
153 profileScores[i*profileAASize + aa] = static_cast<int>((profileScores + (i * profileAASize))[aa] - aaSum[aa]/windowLength);
154 }
155 }
156 delete [] aaSum;
157 delete [] pnul;
158 }
159
calcProfileProfileLocalAaBiasCorrectionAln(int8_t * profileScores,int N,size_t alphabetSize,BaseMatrix * subMat)160 void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores,
161 int N, size_t alphabetSize, BaseMatrix *subMat) {
162
163 const int windowSize = 40;
164
165 float * pnul = new float[N]; // expected score of the prof ag. a random (blosum bg dist) seq
166 memset(pnul, 0, sizeof(float) * N);
167 float * aaSum = new float[alphabetSize];
168
169 ProfileStates ps(alphabetSize,subMat->pBack);
170
171 for (int pos = 0; pos < N; pos++) {
172 for(size_t aa = 0; aa < alphabetSize; aa++) {
173 pnul[pos] += *(profileScores + pos + N*aa) * ps.prior[aa];
174 }
175 }
176
177 for (int i = 0; i < N; i++){
178 const int minPos = std::max(0, (i - windowSize/2));
179 const int maxPos = std::min(N, (i + windowSize/2));
180 const int windowLength = maxPos - minPos;
181 // negative score for the amino acids in the neighborhood of i
182 memset(aaSum, 0, sizeof(float) * alphabetSize);
183
184 for (int j = minPos; j < maxPos; j++){
185 if( i == j )
186 continue;
187 for(size_t aa = 0; aa < alphabetSize; aa++){
188 aaSum[aa] += *(profileScores + aa*N + j) - pnul[j];
189 }
190 }
191 for(size_t aa = 0; aa < alphabetSize; aa++) {
192 profileScores[i + aa*N] = static_cast<int8_t>(*(profileScores + i + N*aa) - aaSum[aa]/windowLength);
193 }
194 }
195 delete [] aaSum;
196 delete [] pnul;
197 }
198
199
200
201 /* Compute aa correction
202 => p(a) = ( \prod_{i=1}^L pi(a) )^(1/L)
203 => p(a) = 2^[ (1/L) * \log2 ( \prod_{i=1}^L pi(a) )
204 => p(a) = 2^[ (1/L) * \sum_{i=1}^L \log2 pi(a) ]
205 => p(a) = 2^[ (1/L) * \sum_{i=1}^L \log2 ( pi(a) / f(a) ) + log2 f(a) ]
206 => p(a) = f(a) * 2^[ (1/L) * \sum_{i=1}^L S(pi, a) ]
207 */
208
calcGlobalAaBiasCorrection(const BaseMatrix * m,short * profileScores,float * pNullBuffer,const size_t profileAASize,const int N)209 void SubstitutionMatrix::calcGlobalAaBiasCorrection(const BaseMatrix *m,
210 short *profileScores,
211 float *pNullBuffer,
212 const size_t profileAASize,
213 const int N) {
214 memset(pNullBuffer, 0, sizeof(float) * N);
215 const int windowSize = 40;
216 for (int pos = 0; pos < N; pos++) {
217 const short * subMat = profileScores + (pos * profileAASize);
218 for(size_t aa = 0; aa < 20; aa++) {
219 pNullBuffer[pos] += m->pBack[aa] * static_cast<float>(subMat[aa]);
220 }
221 }
222 // for(size_t aa = 0; aa < 20; aa++)
223 // pNullBuffer[aa] /= N;
224 for (int i = 0; i < N; i++) {
225 const int minPos = std::max(0, (i - windowSize / 2));
226 const int maxPos = std::min(N, (i + windowSize / 2));
227 const int windowLength = maxPos - minPos;
228 // negative score for the amino acids in the neighborhood of i
229 float aaSum[20];
230 memset(aaSum, 0, sizeof(float) * 20);
231
232 for (int j = minPos; j < maxPos; j++) {
233 const short *subMat = profileScores + (j * profileAASize);
234 if (i == j) {
235 continue;
236 }
237 for (size_t aa = 0; aa < 20; aa++) {
238 aaSum[aa] += subMat[aa] - pNullBuffer[j];
239 }
240 }
241 for (size_t aa = 0; aa < 20; aa++) {
242 // printf("%d\t%d\t%2.3f\t%d\n", i, (profileScores + (i * profileAASize))[aa],
243 // aaSum[aa]/windowLength,
244 // static_cast<int>((profileScores + (i * profileAASize))[aa] - aaSum[aa]/windowLength) );
245 //std::cout << i << "\t" << (profileScores + (i * profileAASize))[aa] << "\t" << aaSum[aa]/windowLength << "\t" << (profileScores + (i * profileAASize))[aa] - aaSum[aa]/windowLength << std::endl;
246 profileScores[i * profileAASize + aa] = static_cast<int>((profileScores + (i * profileAASize))[aa] -
247 aaSum[aa] / windowLength);
248 // avg += static_cast<int>((profileScores + (i * profileAASize))[aa] - aaSum[aa]/windowLength);
249 }
250 }
251 // std::cout << "avg=" << avg/(N*20) << std::endl;
252 }
253
254
~SubstitutionMatrix()255 SubstitutionMatrix::~SubstitutionMatrix() {
256 }
257
setupLetterMapping()258 void SubstitutionMatrix::setupLetterMapping(){
259 for(int letter = 0; letter < UCHAR_MAX; letter++){
260 char upperLetter = toupper(static_cast<char>(letter));
261 switch(upperLetter){
262 case 'A':
263 case 'T':
264 case 'G':
265 case 'C':
266 case 'D':
267 case 'E':
268 case 'F':
269 case 'H':
270 case 'I':
271 case 'K':
272 case 'L':
273 case 'M':
274 case 'N':
275 case 'P':
276 case 'Q':
277 case 'R':
278 case 'S':
279 case 'V':
280 case 'W':
281 case 'Y':
282 case 'X':
283 this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>(upperLetter)];
284 break;
285 case 'J':
286 this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'L'];
287 break;
288 case 'U':
289 case 'O':
290 this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'X'];
291 break;
292 case 'Z': this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'E']; break;
293 case 'B': this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'D']; break;
294 default:
295 this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'X'];
296 break;
297 }
298 }
299 }
300
parseAlphabet(char * word,char * num2aa,int * aa2num)301 int SubstitutionMatrix::parseAlphabet(char *word, char *num2aa, int *aa2num) {
302 char *charReader = word;
303 int minAAInt = INT_MAX;
304 // find amino acid with minimal int value
305 while (isalpha(*charReader)) {
306 const char aa = *charReader;
307 const int intAA = aa2num[static_cast<int>(aa)];
308 minAAInt = std::max(minAAInt, intAA);
309 charReader++;
310 }
311 if(minAAInt==-1){
312
313 }
314 char minAAChar = num2aa[minAAInt];
315 // do alphabet reduction
316 charReader = word;
317 while (isalpha(*charReader)) {
318 const char aa = *charReader;
319 const int intAA = aa2num[static_cast<int>(aa)];
320 aa2num[static_cast<int>(aa)] = minAAInt;
321 num2aa[intAA] = minAAChar;
322 charReader++;
323 }
324 return minAAInt;
325 }
326
readProbMatrix(const std::string & matrixData,const bool containsX)327 void SubstitutionMatrix::readProbMatrix(const std::string &matrixData, const bool containsX) {
328 std::stringstream in(matrixData);
329 std::string line;
330 bool probMatrixStart = false;
331 const char *words[256];
332 bool hasLambda = false;
333 bool hasBackground = false;
334 while (in.good()) {
335 getline(in, line);
336 size_t wordCnt = Util::getWordsOfLine((char *) line.c_str(), words, 256);
337 // skip comments
338 if (line[0] == '#') {
339 if (line.find("# Background (precomputed optional):") == 0) {
340 for (size_t i = 4; i < wordCnt; i++) {
341 double f = strtod(words[i], NULL);
342 pBack[i-4] = f;
343 }
344 hasBackground = true;
345 }
346 if (line.find("# Lambda (precomputed optional):") == 0) {
347 double f = strtod(words[4], NULL);
348 lambda = f;
349 hasLambda = true;
350 }
351 continue;
352 }
353
354 if (wordCnt > 1 && probMatrixStart == false) {
355 probMatrixStart = true;
356 continue;
357 }
358 if (wordCnt > 1 && probMatrixStart == true) {
359 if (isalpha(words[0][0]) == false) {
360 Debug(Debug::ERROR) << "First element in probability line must be an alphabet letter.\n";
361 EXIT(EXIT_FAILURE);
362 }
363 int aa = static_cast<int>(aa2num[toupper(words[0][0])]);
364 for (int i = 0; i < alphabetSize; i++) {
365 double f = strtod(words[i + 1], NULL);
366 probMatrix[aa][i] = f; // divided by 2 because we scale bit/2 ot bit
367 }
368 }
369 }
370 bool xIsPositive = false;
371 if( containsX == true ){
372 for (int j = 0; j < alphabetSize; j++) {
373 int xIndex = static_cast<int>(aa2num[static_cast<int>('X')]);
374 if ((probMatrix[xIndex][j] > 0) || (probMatrix[j][xIndex] > 0)) {
375 xIsPositive = true;
376 break;
377 }
378 }
379 }
380
381
382 if (containsX == false) {
383 Debug(Debug::ERROR) << "Please add X to your substitution matrix.\n";
384 EXIT(EXIT_FAILURE);
385 }
386
387 if(hasLambda == false || hasBackground == false){
388 if (estimateLambdaAndBackground(const_cast<const double **>(probMatrix), alphabetSize - ((xIsPositive) ? 0 : 1),
389 pBack, lambda) == false) {
390 Debug(Debug::ERROR) << "Computing inverse of substitution matrix failed\n";
391 EXIT(EXIT_FAILURE);
392 }
393 pBack[static_cast<int>(aa2num[static_cast<int>('X')])]=ANY_BACK;
394 }
395 if(xIsPositive == false){
396 for (int i = 0; i < alphabetSize - 1; i++) {
397 pBack[i] = pBack[i] * (1.0 - pBack[static_cast<int>(aa2num[static_cast<int>('X')])]);
398 }
399 }
400 // Reconstruct Probability Sab=(Pab/Pa*Pb) -> Pab = exp(Sab) * Pa * Pb
401 for (int i = 0; i < alphabetSize; i++) {
402 //smat[i] = smatData+((subMat.alphabetSize-1)*i);
403 for (int j = 0; j < alphabetSize; j++) {
404 probMatrix[i][j] = std::exp(lambda * probMatrix[i][j]) * pBack[i] * pBack[j];
405 }
406 }
407 }
408
setAaMappingDetectAlphSize(std::string & matrixData)409 std::pair<int, bool> SubstitutionMatrix::setAaMappingDetectAlphSize(std::string &matrixData){
410 std::stringstream in(matrixData);
411 std::string line;
412 const char *words[256];
413 int alphabetSize = 0;
414 bool containsX;
415 while (in.good()) {
416 getline(in, line);
417 size_t wordCnt = Util::getWordsOfLine((char *) line.c_str(), words, 256);
418
419 if (line[0] == '#') {
420 continue;
421 }
422 if (wordCnt > 1) {
423 for (size_t i = 0; i < wordCnt; i++) {
424 if (isalpha(words[i][0]) == false) {
425 Debug(Debug::ERROR) << "Probability matrix must start with alphabet header.\n";
426 EXIT(EXIT_FAILURE);
427 }
428 int aa = toupper(words[i][0]);
429 aa2num[aa] = static_cast<unsigned char>(i);
430 num2aa[i] = aa;
431 if (aa == 'X') {
432 containsX = true;
433 }
434 // column_aa[i] = parseAlphabet(words[i], num2aa, aa2num);
435 }
436 alphabetSize = wordCnt;
437 return std::make_pair(alphabetSize, containsX);
438 }
439 }
440 return std::make_pair(-1, false);
441 }
442
443
444
445
446