1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================*/
24
25 /** @file composition_adjustment.c
26 * Highest level functions to solve the optimization problem for
27 * compositional score matrix adjustment.
28 *
29 * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz
30 */
31
32 #include <limits.h>
33 #include <assert.h>
34 #include <algo/blast/core/ncbi_std.h>
35 #include <algo/blast/composition_adjustment/composition_constants.h>
36 #include <algo/blast/composition_adjustment/composition_adjustment.h>
37 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
38 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
39 #include <algo/blast/composition_adjustment/optimize_target_freq.h>
40 #include <algo/blast/composition_adjustment/unified_pvalues.h>
41
42
43 /**positions of true characters in protein alphabet*/
44 static int trueCharPositions[COMPO_NUM_TRUE_AA] =
45 {1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22};
46
47 /**
48 * conversion from 26 letter NCBIstdaa alphabet to 20 letter order
49 * for true amino acids: ARNDCQEGHILKMFPSTWYV. This order is
50 * alphabetical in the standard three-letter abbreviation of each
51 * amino acid */
52 static int alphaConvert[COMPO_LARGEST_ALPHABET] =
53 {(-1), 0, (-1), 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15,
54 16, 19, 17, (-1), 18, (-1), (-1), (-1), (-1), (-1)};
55
56
57 /**
58 * Desired margin between an end of region used for computing a
59 * composition, and the nearest StopChar; the desired margin may
60 * not be attained. */
61 static const int kCompositionMargin = 20;
62
63 /** iteration limit for Newton's method for computing Lambda */
64 static const int kLambdaIterationLimit = 100;
65 /** bound on error for estimating lambda */
66 static const double kLambdaErrorTolerance = 0.0000001;
67 /** bound on the difference between the expected value of
68 * exp(lambda x S) and 1 */
69 static const double kLambdaFunctionTolerance = 0.00001;
70
71
72 /** bound on error for Newton's method */
73 static const double kCompoAdjustErrTolerance = 0.00000001;
74 /** iteration limit for Newton's method */
75 static const int kCompoAdjustIterationLimit = 2000;
76 /** relative entropy of BLOSUM62 */
77 static const double kFixedReBlosum62 = 0.44;
78 /** largest permitted value of score(i,X), when scores are computed using
79 composition adjustment */
80 static const double kMaximumXscore = -1.0;
81
82 /** The BLOSUM62 matrix for the ARND... alphabet, using standard scale */
83 static double BLOS62[COMPO_NUM_TRUE_AA][COMPO_NUM_TRUE_AA]=
84 {{4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0},
85 {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3},
86 {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3},
87 {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3},
88 {0,-3,-3,-3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1},
89 {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2},
90 {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2},
91 {0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3},
92 {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3},
93 {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3},
94 {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1},
95 {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2},
96 {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1},
97 {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1},
98 {-1,-2,-2,-1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2},
99 {1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2},
100 {0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0},
101 {-3,-3,-4,-4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3},
102 {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1},
103 {0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4}};
104
105 /* Documented in composition_adjustment.h. */
106 void
Blast_ApplyPseudocounts(double * probs20,int number_of_observations,const double * background_probs20,int pseudocounts)107 Blast_ApplyPseudocounts(double * probs20,
108 int number_of_observations,
109 const double * background_probs20,
110 int pseudocounts)
111 {
112 int i; /* loop index */
113 double weight; /* weight assigned to pseudocounts */
114 double sum; /* sum of the observed frequencies */
115 /* pseudocounts as a double */
116 double dpseudocounts = pseudocounts;
117
118 /* Normalize probabilities */
119 sum = 0.0;
120 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
121 sum += probs20[i];
122 }
123 if (sum == 0.0) { /* Can't normalize a zero vector */
124 sum = 1.0;
125 }
126 weight = dpseudocounts / (number_of_observations + dpseudocounts);
127 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
128 probs20[i] = (1.0 - weight) * probs20[i] / sum
129 + weight * background_probs20[i];
130 }
131 }
132
133
134 /* Documented in composition_adjustment.h. */
135 double
Blast_GetRelativeEntropy(const double A[],const double B[])136 Blast_GetRelativeEntropy(const double A[], const double B[])
137 {
138 int i; /* loop index over letters */
139 double temp; /* intermediate term */
140 double value = 0.0; /* square of relative entropy */
141
142 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
143 temp = (A[i] + B[i]) / 2;
144 if (temp > 0) {
145 if (A[i] > 0) {
146 value += A[i] * log(A[i] / temp) / 2;
147 }
148 if (B[i] > 0) {
149 value += B[i] * log(B[i] / temp) / 2;
150 }
151 }
152 }
153 if (value < 0) { /* must be numerical rounding error */
154 value = 0;
155 }
156 return sqrt(value);
157 }
158
159
160
161 /* Blast_CalcLambdaFullPrecision -- interface documented in
162 * composition_adjustment.h.
163 *
164 * If the average score for a composition is negative, and the maximum
165 * score that occurs with nonzero probability is positive, then
166 * statistical parameter Lambda exists and is the unique, positive
167 * solution to
168 *
169 * phi(lambda) = sum_{i,j} P_1(i) P_2(j) exp(S_{ij} lambda) - 1 = 0,
170 *
171 * where S_{ij} is the matrix "score" and P_1 and P_2 are row_probs and
172 * col_probs respectively.
173 *
174 * It is simpler to solve this problem in x = exp(-lambda) than it is
175 * to solve it in lambda, because we know that for x, a solution lies
176 * in [0,1]. Furthermore, if M is the largest S_{ij} so that P_1(i)
177 * and P_2(j) are nonzero, then the function
178 *
179 * f(x) = -x^M - sum_{i,j} P_1(i) P_2(j) x^{M - S_{ij}},
180 *
181 * obtained by multiplying phi(lambda) by x^M, is well behaved in
182 * (0,1] -- if the scores are integers, it is a polynomial. Since x = 0
183 * is not a solution, x solves f(x) = 0 in [0,1), if and only if
184 * lambda = -ln(x) is a positive solution to phi(lambda). Therefore,
185 * we may define a safeguarded Newton iteration to find a solution of
186 * f(x) = 0.
187 *
188 * For the most part, this is a standard safeguarded Newton iteration:
189 * define an interval of uncertainty [a,b] with f(a) > 0 and f(b) < 0
190 * (except for the initial value b = 1, where f(b) = 0); evaluate the
191 * function and use the sign of that value to shrink the interval of
192 * uncertainty; compute a Newton step; and if the Newton step suggests
193 * a point outside the interval of uncertainty or fails to decrease
194 * the function sufficiently, then bisect. There are three further
195 * details needed to understand the algorithm:
196 *
197 * 1) If y the unique solution in [0,1], then f is positive to the left of
198 * y, and negative to the right. Therefore, we may determine whether
199 * the Newton step -f(x)/f'(x) is moving toward, or away from, y by
200 * examining the sign of f'(x). If f'(x) >= 0, we bisect, instead
201 * of taking the Newton step.
202 * 2) There is a neighborhood around x = 1 for which f'(x) >= 0,
203 * so (1) prevents convergence to x = 1.
204 * 3) Conditions like fabs(p) < lambda_tolerance * x * (1-x) are used in
205 * convergence criteria because these values translate to a bound
206 * on the relative error in lambda. This is proved in the
207 * "Blast Scoring Parameters" document that accompanies the BLAST
208 * code.
209 *
210 * We have observed that in typical cases the safeguarded Newton
211 * iteration on f(x) requires half the iterations of a Newton
212 * iteration on phi(lambda). More importantly, the iteration on f(x)
213 * is robust and doesn't overflow; defining a robust safeguarded
214 * Newton iteration on phi(lambda) that cannot converge to zero and
215 * that is protected against overflow is more difficult. So (despite
216 * the length of this comment) the Newton iteration on f(x) is the
217 * simpler solution.
218 */
219 void
Blast_CalcLambdaFullPrecision(double * plambda,int * piterations,double ** score,int alphsize,const double row_prob[],const double col_prob[],double lambda_tolerance,double function_tolerance,int max_iterations)220 Blast_CalcLambdaFullPrecision(double * plambda, int *piterations,
221 double **score, int alphsize,
222 const double row_prob[],
223 const double col_prob[],
224 double lambda_tolerance,
225 double function_tolerance,
226 int max_iterations)
227 {
228 double f = 4; /* The current function value; initially
229 set to a value greater than any possible
230 real value of f */
231 double left = 0, right = 1; /* (left, right) is an interval containing
232 a solution */
233 double x = 0.367879441171; /* The current iterate; initially exp(-1) */
234 int is_newton = 0; /* true if the last iteration was a Newton
235 step; initially false */
236 int i, j, k; /* iteration indices */
237 /* maximum score that occurs with nonzero probability */
238 double max_score = COMPO_SCORE_MIN;
239 /* average score */
240 double avg_score = 0.0;
241
242 /* Find the maximum score with nonzero probability */
243 for (i = 0; i < alphsize; i++) {
244 if (row_prob[i] == 0.0) {
245 continue;
246 }
247 for (j = 0; j < alphsize; j++) {
248 if (col_prob[j] == 0.0) {
249 continue;
250 }
251 if (max_score < score[i][j]) {
252 max_score = score[i][j];
253 }
254 avg_score += row_prob[i] * col_prob[j] * score[i][j];
255 }
256 }
257 if (max_score <= 0.0 || avg_score >= 0) {
258 /* The iteration cannot converge if max_score is nonpositive
259 * or the average score is nonnegative; lambda doesn't exist */
260 *piterations = max_iterations;
261 *plambda = -1.0;
262 return;
263 }
264 for (k = 0; k < max_iterations; k++) {
265 double slope; /* slope of f at x */
266 double fold = f; /* previous value of f */
267 double x_pow_max_score; /* x raised to the power max_score */
268 double lambda = -log(x); /* the iterate lambda, see above */
269 int was_newton = is_newton; /* true if the previous iteration
270 was a Newton step; instead of a
271 bisection step */
272 /* Evaluate the function and its derivative */
273 x_pow_max_score = exp(-max_score * lambda);
274 f = -x_pow_max_score;
275 slope = max_score * f / x;
276 for (i = 0; i < alphsize; i++) {
277 if (row_prob[i] == 0.0) {
278 continue;
279 }
280 for (j = 0; j < alphsize; j++) {
281 double ff; /* a term in the sum used to compute f */
282
283 if (col_prob[j] == 0.0) {
284 continue;
285 }
286 if (max_score != score[i][j]) {
287 double diff_score = max_score - score[i][j];
288
289 ff = row_prob[i] * col_prob[j] * exp(-lambda * diff_score);
290 slope += diff_score * ff / x;
291 } else {
292 ff = row_prob[i] * col_prob[j];
293 }
294 f += ff;
295 }
296 }
297 /* Finished evaluating the function and its derivative */
298 if (f > 0) {
299 left = x; /* move the left endpoint */
300 } else if (f < 0) {
301 right = x; /* move the right endpoint */
302 } else { /* f == 0 */
303 break; /* x is an exact solution */
304 }
305 if (right - left <= 2 * left * (1 - right) * lambda_tolerance &&
306 fabs(f/x_pow_max_score) <= function_tolerance) {
307 /* The midpoint of the interval converged */
308 x = (left + right) / 2;
309 break;
310 }
311 if ((was_newton && fabs(f) > .9 * fabs(fold))
312 /* if the previous iteration was a Newton step but didn't
313 * decrease f sufficiently; or */
314 || slope >= 0
315 /* if a Newton step will move us away from the desired solution */
316 ) {/* then */
317 x = (left + right)/2; /* bisect */
318 } else {
319 double p = -f/slope; /* The Newton step */
320 double y = x + p; /* The proposed next iterate */
321 if (y <= left || y >= right) { /* The proposed iterate is
322 not in (left,right) */
323 x = (left + right)/2; /* bisect */
324 } else {/* The proposed iterate is in (left,right). Accept it. */
325 is_newton = 1;
326 x = y;
327 if (fabs(p) <= lambda_tolerance * x * (1-x) &&
328 fabs(f/x_pow_max_score) <= function_tolerance) break;
329 }
330 }
331 } /* End for all iterations k */
332 *plambda = (k < max_iterations) ? -log(x) : -1.0;
333 *piterations = k;
334 }
335
336
337 /* Documented in composition_adjustment.h. */
338 double
Blast_MatrixEntropy(double ** matrix,int alphsize,const double row_prob[],const double col_prob[],double Lambda)339 Blast_MatrixEntropy(double ** matrix, int alphsize, const double row_prob[],
340 const double col_prob[], double Lambda)
341 {
342 int i, j;
343 double entropy = 0.0;
344 for (i = 0; i < alphsize; i++) {
345 for (j = 0; j < alphsize; j++) {
346 /* the score at (i,j), rescaled to nats */
347 double nat_score = Lambda * matrix[i][j];
348 entropy += nat_score * exp(nat_score) * row_prob[i] * col_prob[j];
349 }
350 }
351 return entropy;
352 }
353
354
355
356 /* Documented in composition_adjustment.h. */
357 double
Blast_TargetFreqEntropy(double ** target_freq)358 Blast_TargetFreqEntropy(double ** target_freq)
359 {
360 int i, j; /* iteration indices */
361 double entropy; /* the entropy to be returned */
362 /* Row probabilities consistent with the target frequencies */
363 double row_prob[COMPO_NUM_TRUE_AA] = {0,};
364 double col_prob[COMPO_NUM_TRUE_AA] = {0,};
365
366 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
367 for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
368 row_prob[i] += target_freq[i][j];
369 col_prob[j] += target_freq[i][j];
370 }
371 }
372 entropy = 0.0;
373 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
374 for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
375 double freq = target_freq[i][j];
376 entropy += freq * log(freq / row_prob[i] / col_prob[j]);
377 }
378 }
379 return entropy;
380 }
381
382
383 /* Documented in composition_adjustment.h. */
384 void
Blast_CalcFreqRatios(double ** ratios,int alphsize,double row_prob[],double col_prob[])385 Blast_CalcFreqRatios(double ** ratios, int alphsize,
386 double row_prob[], double col_prob[])
387 {
388 int i, j;
389 for (i = 0; i < alphsize; i++) {
390 if (row_prob[i] > 0) {
391 for (j = 0; j < alphsize; j++) {
392 if (col_prob[j] > 0) {
393 ratios[i][j] /= (row_prob[i] * col_prob[j]);
394 }
395 }
396 }
397 }
398 }
399
400
401 /* Documented in composition_adjustment.h. */
402 void
Blast_FreqRatioToScore(double ** matrix,int rows,int cols,double Lambda)403 Blast_FreqRatioToScore(double ** matrix, int rows, int cols, double Lambda)
404 {
405 int i;
406 for (i = 0; i < rows; i++) {
407 int j;
408 for (j = 0; j < cols; j++) {
409 if (0.0 == matrix[i][j]) {
410 matrix[i][j] = COMPO_SCORE_MIN;
411 } else {
412 matrix[i][j] = log(matrix[i][j])/Lambda;
413 }
414 }
415 }
416 }
417
418
419 /**
420 * Convert letter probabilities from the NCBIstdaa alphabet to
421 * a 20 letter ARND... amino acid alphabet. (@see alphaConvert)
422 *
423 * @param outputLetterProbs the ARND letter probabilities [out]
424 * @param inputLetterProbs the NCBIstdaa letter probabilities [in]
425 * @param alphsize the size of the alphabet
426 */
427 static void
s_GatherLetterProbs(double * outputLetterProbs,const double * inputLetterProbs,int alphsize)428 s_GatherLetterProbs(double * outputLetterProbs,
429 const double * inputLetterProbs, int alphsize)
430 {
431 int c; /*index over characters*/
432
433 for (c = 0; c < alphsize; c++) {
434 if ((-1) != alphaConvert[c]) {
435 outputLetterProbs[alphaConvert[c]] = inputLetterProbs[c];
436 }
437 }
438 }
439
440 /**
441 * Convert letter probabilities from a a 20 letter ARND... amino acid
442 * alphabet to the NCBIstdaa alphabet, (@see alphaConvert) setting
443 * probabilities for nonstandard characters to zero.
444 *
445 * @param std_probs the NCBIstdaa letter probabilities [out]
446 * @param alphsize size of the NCBIstdaa alphabet [in]
447 * @param probs the ARND letter probabilities [in]
448 */
449 static void
s_UnpackLetterProbs(double std_probs[],int alphsize,const double probs[])450 s_UnpackLetterProbs(double std_probs[], int alphsize, const double probs[])
451 {
452 int c; /*index over characters*/
453
454 for (c = 0; c < alphsize; c++) {
455 if ((-1) != alphaConvert[c]) {
456 std_probs[c] = probs[alphaConvert[c]];
457 } else {
458 std_probs[c] = 0.0;
459 }
460 }
461 }
462
463
464 /**
465 * Set the probabilities for the two-character ambiguity characters.
466 * @param probs the probabilities for the NCBIstdaa alphabet.
467 * On entry, values for the standard amino acids must
468 * be set; on exit, values for the two-character
469 * ambiguities will also be set.
470 * @param alphsize the size of the alphabet
471 */
472 static void
s_SetPairAmbigProbsToSum(double probs[],int alphsize)473 s_SetPairAmbigProbsToSum(double probs[], int alphsize)
474 {
475 probs[eBchar] = probs[eDchar] + probs[eNchar];
476 probs[eZchar] = probs[eEchar] + probs[eQchar];
477 if (alphsize > eJchar) {
478 probs[eJchar] = probs[eIchar] + probs[eLchar];
479 }
480 }
481
482
483 /* Documented in composition_adjustment.h. */
484 int
Blast_EntropyOldFreqNewContext(double * entropy,double * Lambda,int * iter_count,double ** target_freq,const double row_prob[],const double col_prob[])485 Blast_EntropyOldFreqNewContext(double * entropy,
486 double * Lambda,
487 int * iter_count,
488 double ** target_freq,
489 const double row_prob[],
490 const double col_prob[])
491 {
492 /* iteration indices */
493 int i, j;
494 /* Status flag; will be set to zero on success */
495 int status = 1;
496 /* A matrix of scores in the context consistent with the target
497 * frequencies */
498 double **scores;
499 /* Row and column probabilities consistent with the target
500 * frequencies; the old context */
501 double old_col_prob[COMPO_NUM_TRUE_AA] = {0.0,};
502 double old_row_prob[COMPO_NUM_TRUE_AA] = {0.0,};
503
504 *entropy = 0;
505 status = 1;
506
507 /* Calculate the matrix "scores" from the target frequencies */
508 scores = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA, COMPO_NUM_TRUE_AA);
509 if (scores == NULL) {
510 return -1;
511 }
512 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
513 for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
514 old_row_prob[i] += target_freq[i][j];
515 old_col_prob[j] += target_freq[i][j];
516 }
517 }
518 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
519 memcpy(scores[i], target_freq[i], COMPO_NUM_TRUE_AA * sizeof(double));
520 }
521 Blast_CalcFreqRatios(scores, COMPO_NUM_TRUE_AA,
522 old_row_prob, old_col_prob);
523 Blast_FreqRatioToScore(scores, COMPO_NUM_TRUE_AA, COMPO_NUM_TRUE_AA, 1.0);
524 /* Finished calculating the matrix "scores" */
525
526 Blast_CalcLambdaFullPrecision(Lambda, iter_count, scores,
527 COMPO_NUM_TRUE_AA, row_prob,
528 col_prob, kLambdaErrorTolerance,
529 kLambdaFunctionTolerance,
530 kLambdaIterationLimit);
531 if (*iter_count < kLambdaIterationLimit) {
532 *entropy = Blast_MatrixEntropy(scores, COMPO_NUM_TRUE_AA,
533 row_prob, col_prob, *Lambda);
534 status = 0;
535 }
536 Nlm_DenseMatrixFree(&scores);
537 return status;
538 }
539
540
541 /* Documented in composition_adjustment.h. */
542 void
Blast_TrueAaToStdTargetFreqs(double ** StdFreq,int StdAlphsize,double ** freq)543 Blast_TrueAaToStdTargetFreqs(double ** StdFreq, int StdAlphsize,
544 double ** freq)
545 {
546 /* Note I'm using a rough convention for this routine that uppercase
547 * letters refer to quantities in the standard (larger) alphabet
548 * and lowercase letters refer to the true amino acid (smaller)
549 * alphabet.
550 */
551 /* Shorter names for the sizes of the two alphabets */
552 const int small_alphsize = COMPO_NUM_TRUE_AA;
553 int A, B; /* characters in the std (big) alphabet */
554 int a, b; /* characters in the small alphabet */
555 double sum; /* sum of values in target_freq; used to normalize */
556 sum = 0.0;
557 for (a = 0; a < small_alphsize; a++) {
558 for (b = 0; b < small_alphsize; b++) {
559 sum += freq[a][b];
560 }
561 }
562 for (A = 0; A < StdAlphsize; A++) {
563 /* for all rows */
564 if (alphaConvert[A] < 0) {
565 /* the row corresponds to a nonstandard reside */
566 for (B = 0; B < StdAlphsize; B++) {
567 StdFreq[A][B] = 0.0;
568 }
569 } else {
570 /* the row corresponds to a standard reside */
571 a = alphaConvert[A];
572
573 for (B = 0; B < StdAlphsize; B++) {
574 /* for all columns */
575 if (alphaConvert[B] < 0) {
576 /* the column corresponds to a nonstandard reside */
577 StdFreq[A][B] = 0.0;
578 } else {
579 /* the column corresponds to a standard reside */
580 b = alphaConvert[B];
581 StdFreq[A][B] = freq[a][b] / sum;
582 }
583 }
584 /* Set values for two-character ambiguities */
585 StdFreq[A][eBchar] = StdFreq[A][eDchar] + StdFreq[A][eNchar];
586 StdFreq[A][eZchar] = StdFreq[A][eEchar] + StdFreq[A][eQchar];
587 if (StdAlphsize > eJchar) {
588 StdFreq[A][eJchar] = StdFreq[A][eIchar] + StdFreq[A][eLchar];
589 }
590 }
591 }
592 /* Add rows to set values for two-character ambiguities */
593 memcpy(StdFreq[eBchar], StdFreq[eDchar], StdAlphsize * sizeof(double));
594 Nlm_AddVectors(StdFreq[eBchar], StdAlphsize, 1.0, StdFreq[eNchar]);
595
596 memcpy(StdFreq[eZchar], StdFreq[eEchar], StdAlphsize * sizeof(double));
597 Nlm_AddVectors(StdFreq[eZchar], StdAlphsize, 1.0, StdFreq[eQchar]);
598
599 if (StdAlphsize > eJchar) {
600 memcpy(StdFreq[eJchar],StdFreq[eIchar], StdAlphsize * sizeof(double));
601 Nlm_AddVectors(StdFreq[eJchar], StdAlphsize, 1.0, StdFreq[eLchar]);
602 }
603 }
604
605
606 /**
607 * Calculate the average score of a row or column of a matrix.
608 *
609 * @param M a vector of scores
610 * @param alphsize the size of the alphabet
611 * @param incM stride between elements of M; used to pass a column
612 * of a score matrix to this routine by setting incM
613 * to the number of elements in a column.
614 * @param probs background probabilities for a sequence.
615 */
616 static double
s_CalcAvgScore(double * M,int alphsize,int incM,const double probs[])617 s_CalcAvgScore(double * M, int alphsize, int incM, const double probs[])
618 {
619 int j; /* iteration index */
620 double score_iX = 0.0; /* score of character i substituted by X */
621
622 for (j = 0; j < alphsize; j++) {
623 if (alphaConvert[j] >= 0) {
624 /* If the column corresponds to a true amino acid */
625 score_iX += M[j * incM] * probs[j];
626 }
627 }
628 return score_iX;
629 }
630
631 /**
632 * Calculate values for the X ambiguity score, give an vector of scores and
633 * a set of character probabilities.
634 * @param M a vector of scores
635 * @param alphsize the size of the alphabet
636 * @param incM stride between elements of M; used to pass a column
637 * of a score matrix to this routine by setting incM
638 * to the number of elements in a column.
639 * @param probs background probabilities for a sequence.
640 */
641 static double
s_CalcXScore(double * M,int alphsize,int incM,const double probs[])642 s_CalcXScore(double * M, int alphsize, int incM, const double probs[])
643 {
644 return MIN( s_CalcAvgScore(M, alphsize, incM, probs), kMaximumXscore);
645 }
646
647
648 /**
649 * Given a standard matrix with the scores for true amino acid, and
650 * pairwise ambiguity scores set, calculate and set the scores for the
651 * X, U (Selenocystiene) and O (pyrrolysine) characters.
652 *
653 * @param M a scoring matrix
654 * @param alphsize the size of the alphabet
655 * @param row_probs character frequencies for the sequence corresponding
656 * to the rows of M.
657 * @param col_probs character frequencies for the sequence corresponding
658 * to the columns of M.
659 */
660 static void
s_SetXUOScores(double ** M,int alphsize,const double row_probs[],const double col_probs[])661 s_SetXUOScores(double ** M, int alphsize,
662 const double row_probs[], const double col_probs[])
663 {
664 int i; /* iteration index */
665 double score_XX = 0.0; /* score of matching an X to an X */
666 /* the matrix has alphsize colums (this variable exists just to
667 make things easier to read) */
668 const int cols = alphsize;
669
670 for (i = 0; i < alphsize; i++) {
671 if (alphaConvert[i] >= 0) {
672 double avg_iX = s_CalcAvgScore(M[i], alphsize, 1, col_probs);
673 M[i][eXchar] = MIN(avg_iX, kMaximumXscore);
674 score_XX += avg_iX * row_probs[i];
675
676 M[eXchar][i] = s_CalcXScore(&M[0][i], alphsize, cols, row_probs);
677 }
678 }
679 M[eXchar][eXchar] = MIN(score_XX, kMaximumXscore);
680
681 /* Set X scores for pairwise ambiguity characters */
682 M[eBchar][eXchar] = s_CalcXScore(M[eBchar], alphsize, 1, col_probs);
683 M[eXchar][eBchar] = s_CalcXScore(&M[0][eBchar], alphsize, cols, row_probs);
684
685 M[eZchar][eXchar] = s_CalcXScore(M[eZchar], alphsize, 1, col_probs);
686 M[eXchar][eZchar] = s_CalcXScore(&M[0][eZchar], alphsize, cols, row_probs);
687 if( alphsize > eJchar) {
688 M[eJchar][eXchar] = s_CalcXScore(M[eJchar], alphsize, 1, col_probs);
689 M[eXchar][eJchar] =
690 s_CalcXScore(&M[0][eJchar], alphsize, cols, row_probs);
691 }
692 /* Copy C scores to U */
693 memcpy(M[eSelenocysteine], M[eCchar], alphsize * sizeof(double));
694 for (i = 0; i < alphsize; i++) {
695 M[i][eSelenocysteine] = M[i][eCchar];
696 }
697 /* Copy X scores to O */
698 if (alphsize > eOchar) {
699 memcpy(M[eOchar], M[eXchar], alphsize * sizeof(double));
700 for (i = 0; i < alphsize; i++) {
701 M[i][eOchar] = M[i][eXchar];
702 }
703 }
704 }
705
706
707 /** Return the nearest integer to x. */
Nint(double x)708 static long Nint(double x)
709 {
710 x += (x >= 0. ? 0.5 : -0.5);
711 return (long)x;
712 }
713
714
715 /**
716 * Round a floating point matrix of scores to produce an integer matrix of
717 * scores.
718 *
719 * @param matrix the matrix of integer valued scores [out]
720 * @param floatScoreMatrix the matrix of floating point valued
721 * scores [in]
722 * @param rows the number of rows of the matrices.
723 * @param cols the number of columns in matrix.
724 */
725 static void
s_RoundScoreMatrix(int ** matrix,int rows,int cols,double ** floatScoreMatrix)726 s_RoundScoreMatrix(int **matrix, int rows, int cols,
727 double **floatScoreMatrix)
728 {
729 int p, c; /*indices over positions and characters*/
730
731 for (p = 0; p < rows; p++) {
732 for (c = 0; c < cols; c++) {
733 if (floatScoreMatrix[p][c] < INT_MIN) {
734 matrix[p][c] = INT_MIN;
735 } else {
736 matrix[p][c] = Nint(floatScoreMatrix[p][c]);
737 }
738 }
739 }
740 }
741
742
743 /**
744 * Given a set of target frequencies and two sets of character
745 * probabilities for the true amino acids in the ARND alphabet,
746 * calculate a scoring matrix that has valid entries for all
747 * characters in the NCBIstdaa amino acid alphabet.
748 *
749 * @param Matrix the newly computed matrix
750 * @param Alphsize the size of the NCBIstdaa alphabet
751 * @param target_freq target frequencies for true amino acids (20x20)
752 * @param StartMatrix a matrix containing values for the stop character
753 * @param row_prob probabilities of true amino acids in the sequence
754 * corresponding to the rows of matrix (length = 20)
755 * @param col_prob probabilities of true amino acids in the sequence
756 * corresponding to the columns of matrix (length = 20)
757 * @param Lambda the desired scale of this matrix
758 */
759 static int
s_ScoresStdAlphabet(int ** Matrix,int Alphsize,double ** target_freq,int ** StartMatrix,const double row_prob[],const double col_prob[],double Lambda)760 s_ScoresStdAlphabet(int ** Matrix, int Alphsize,
761 double ** target_freq, int ** StartMatrix,
762 const double row_prob[], const double col_prob[],
763 double Lambda)
764 {
765 /* Note: I'm using a rough convention for this routine that uppercase
766 * letters refer to quantities in the standard (larger) alphabet
767 * and lowercase letters refer to the true amino acid (smaller)
768 * alphabet.
769 */
770 int i;
771 /* row and column probabilities in the NCBIstdaa alphabet */
772 double RowProb[COMPO_LARGEST_ALPHABET];
773 double ColProb[COMPO_LARGEST_ALPHABET];
774 /* A double precision score matrix */
775 double ** Scores = Nlm_DenseMatrixNew(Alphsize, Alphsize);
776 if (Scores == NULL) {
777 return -1;
778 }
779 s_UnpackLetterProbs(RowProb, Alphsize, row_prob);
780 s_SetPairAmbigProbsToSum(RowProb, Alphsize);
781
782 s_UnpackLetterProbs(ColProb, Alphsize, col_prob);
783 s_SetPairAmbigProbsToSum(ColProb, Alphsize);
784
785 Blast_TrueAaToStdTargetFreqs(Scores, Alphsize, target_freq);
786 Blast_CalcFreqRatios(Scores, Alphsize, RowProb, ColProb);
787 Blast_FreqRatioToScore(Scores, Alphsize, Alphsize, Lambda);
788 s_SetXUOScores(Scores, Alphsize, RowProb, ColProb);
789
790 s_RoundScoreMatrix(Matrix, Alphsize, Alphsize, Scores);
791 Nlm_DenseMatrixFree(&Scores);
792
793 for (i = 0; i < Alphsize; i++) {
794 Matrix[i][eStopChar] = StartMatrix[i][eStopChar];
795 Matrix[eStopChar][i] = StartMatrix[eStopChar][i];
796 }
797 return 0;
798 }
799
800
801 /**
802 * Find the range of scores contained in an scoring matrix.
803 * @param obs_min smallest value in the matrix
804 * @param obs_max largest value in the matrix
805 * @param matrix a scoring matrix for the ncbistdaa alphabet
806 * @param rows number of rows in the matrix
807 */
s_GetScoreRange(int * obs_min,int * obs_max,int ** matrix,int rows)808 static void s_GetScoreRange(int * obs_min, int * obs_max,
809 int ** matrix, int rows)
810 {
811 int aa; /* index of an amino-acid in the 20
812 letter alphabet */
813 int irow, jcol; /* matrix row and column indices */
814 int minScore, maxScore; /* largest and smallest observed scores */
815
816 minScore = maxScore = 0;
817 for (irow = 0; irow < rows; irow++) {
818 for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
819 jcol = trueCharPositions[aa];
820 if (matrix[irow][jcol] < minScore &&
821 matrix[irow][jcol] > COMPO_SCORE_MIN)
822 minScore = matrix[irow][jcol];
823 if (matrix[irow][jcol] > maxScore)
824 maxScore = matrix[irow][jcol];
825 }
826 }
827 *obs_min = minScore;
828 *obs_max = maxScore;
829 }
830
831
832 /**
833 * Compute the score probabilities for a given amino acid substitution matrix
834 * in the context of given query and subject amino acid frequencies.
835 *
836 * @param *obs_min the smallest score in the score matrix [out]
837 * @param *obs_max the largest score in the score matrix [out]
838 * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1),
839 * of score probabilities, where (*scoreProb)[0] is
840 * the probability for score *obs_min.
841 * @param matrix a amino-acid substitution matrix (not
842 * position-specific)
843 * @param alphsize the size of the alphabet
844 * @param subjectProbArray is an array containing the probability of
845 * occurrence of each residue in the subject
846 * @param queryProbArray is an array containing the probability of
847 * occurrence of each residue in the query
848 * @param scoreProb is an array of probabilities for each score
849 * that is to be used as a field in return_sfp
850 * @return 0 on success, -1 on out-of-memory
851 */
852 static int
s_GetMatrixScoreProbs(double ** scoreProb,int * obs_min,int * obs_max,int ** matrix,int alphsize,const double * subjectProbArray,const double * queryProbArray)853 s_GetMatrixScoreProbs(double **scoreProb, int * obs_min, int * obs_max,
854 int **matrix, int alphsize,
855 const double *subjectProbArray,
856 const double *queryProbArray)
857 {
858 int aa; /* index of an amino-acid in the 20 letter
859 alphabet */
860 int irow, jcol; /* matrix row and column indices */
861 double * sprob; /* a pointer to the element of the score
862 probabilities array that represents the
863 probability of the score 0*/
864 int minScore; /* smallest score in matrix; the same value as
865 (*obs_min). */
866 int range; /* the range of scores in the matrix */
867
868 s_GetScoreRange(obs_min, obs_max, matrix, alphsize);
869 minScore = *obs_min;
870 range = *obs_max - *obs_min + 1;
871 *scoreProb = calloc(range, sizeof(double));
872 if (*scoreProb == NULL) {
873 return -1;
874 }
875 sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
876 for (irow = 0; irow < alphsize; irow++) {
877 for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
878 jcol = trueCharPositions[aa];
879 if (matrix[irow][jcol] >= minScore) {
880 sprob[matrix[irow][jcol]] +=
881 (queryProbArray[irow] * subjectProbArray[jcol]);
882 }
883 }
884 }
885 return 0;
886 }
887
888
889 /**
890 * Compute the score probabilities for a given amino acid position-specific
891 * substitution matrix in the context of a given set of subject amino
892 * acid frequencies.
893 *
894 * @param *obs_min the smallest score in the score matrix [out]
895 * @param *obs_max the largest score in the score matrix [out]
896 * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1),
897 * of score probabilities, where (*scoreProb)[0] is
898 * the probability for score *obs_min.
899 * @param matrix a position-specific amino-acid substitution matrix.
900 * @param rows the number of rows in matrix.
901 * @param subjectProbArray is an array containing the probability of
902 * occurrence of each residue in the subject
903 * @return 0 on success, -1 on out-of-memory
904 */
905 static int
s_GetPssmScoreProbs(double ** scoreProb,int * obs_min,int * obs_max,int ** matrix,int rows,const double * subjectProbArray)906 s_GetPssmScoreProbs(double ** scoreProb, int * obs_min, int * obs_max,
907 int **matrix, int rows,
908 const double *subjectProbArray)
909 {
910 int aa; /* index of an amino-acid in the 20 letter
911 alphabet */
912 int irow, jcol; /* matrix row and column indices */
913 double onePosFrac; /* matrix length as a double*/
914 double * sprob; /* pointer to the element of the score
915 * probabilities array the represents the
916 * probability of zero */
917 int minScore; /* smallest score in matrix; the same value as
918 (*obs_min). */
919 int range; /* the range of scores in the matrix */
920
921 s_GetScoreRange(obs_min, obs_max, matrix, rows);
922 minScore = *obs_min;
923 range = *obs_max - *obs_min + 1;
924 *scoreProb = calloc(range, sizeof(double));
925 if (*scoreProb == NULL) {
926 return -1;
927 }
928 sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
929 onePosFrac = 1.0/ ((double) rows);
930 for (irow = 0; irow < rows; irow++) {
931 for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
932 jcol = trueCharPositions[aa];
933 if (matrix[irow][jcol] >= minScore) {
934 sprob[matrix[irow][jcol]] +=
935 onePosFrac * subjectProbArray[jcol];
936 }
937 }
938 }
939 return 0;
940 }
941
942
943 /* Documented in composition_adjustment.h. */
944 void
Blast_Int4MatrixFromFreq(int ** matrix,int alphsize,double ** freq,double Lambda)945 Blast_Int4MatrixFromFreq(int **matrix, int alphsize,
946 double ** freq, double Lambda)
947 {
948 /* A row of the matrix in double precision */
949 double dMatrixStore[COMPO_LARGEST_ALPHABET];
950 double * dMatrix[1];
951 int i;
952
953 dMatrix[0] = dMatrixStore;
954
955 for (i = 0; i < alphsize; i++) {
956 memcpy(dMatrix[0], freq[i], alphsize * sizeof(double));
957 Blast_FreqRatioToScore(dMatrix, 1, alphsize, Lambda);
958 s_RoundScoreMatrix(&matrix[i], 1, alphsize, dMatrix);
959 }
960 }
961
962
963 /* Documented in composition_adjustment.h. */
Blast_MatrixInfoFree(Blast_MatrixInfo ** ss)964 void Blast_MatrixInfoFree(Blast_MatrixInfo ** ss)
965 {
966 if (*ss != NULL) {
967 free((*ss)->matrixName);
968 Nlm_Int4MatrixFree(&(*ss)->startMatrix);
969 Nlm_DenseMatrixFree(&(*ss)->startFreqRatios);
970 free(*ss);
971 *ss = NULL;
972 }
973 }
974
975
976 /* Documented in composition_adjustment.h. */
977 Blast_MatrixInfo *
Blast_MatrixInfoNew(int rows,int cols,int positionBased)978 Blast_MatrixInfoNew(int rows, int cols, int positionBased)
979 {
980 int i; /* loop index */
981 Blast_MatrixInfo * ss = malloc(sizeof(Blast_MatrixInfo));
982 if (ss != NULL) {
983 ss->rows = rows;
984 ss->cols = cols;
985 ss->positionBased = positionBased;
986
987 ss->matrixName = NULL;
988 ss->startMatrix = NULL;
989 ss->startFreqRatios = NULL;
990
991 ss->startMatrix = Nlm_Int4MatrixNew(rows + 1, cols);
992 if (ss->startMatrix == NULL)
993 goto error_return;
994 ss->startFreqRatios = Nlm_DenseMatrixNew(rows + 1, cols);
995
996 if (ss->startFreqRatios == NULL)
997 goto error_return;
998 for (i = 0; i < cols; i++) {
999 ss->startMatrix[rows][i] = COMPO_SCORE_MIN;
1000 ss->startFreqRatios[rows][i] = (double) COMPO_SCORE_MIN;
1001 }
1002
1003 }
1004 goto normal_return;
1005 error_return:
1006 Blast_MatrixInfoFree(&ss);
1007 normal_return:
1008 return ss;
1009 }
1010
1011
1012 /**
1013 * Fill in all scores for a PSSM at a given scale Lambda.
1014 *
1015 * @param matrix the newly computed matrix [output]
1016 * @param rows number of positions (rows) in the PSSM
1017 * @param cols the number of columns in the PSSM; the alphabet size
1018 * @param freq_ratios frequency ratios defining the PSSM
1019 * @param start_matrix an existing PSSM; used to set values for the
1020 * stop character.
1021 * @param col_prob letter probabilities
1022 * @param Lambda scale of the new matrix
1023 */
1024 static void
s_ScalePSSM(int ** matrix,int rows,int cols,double ** freq_ratios,int ** start_matrix,const double col_prob[],double Lambda)1025 s_ScalePSSM(int **matrix, int rows, int cols, double ** freq_ratios,
1026 int ** start_matrix, const double col_prob[], double Lambda)
1027 {
1028 int p; /* index over matrix rows */
1029 /* A row of scores corresponding to one position in the PSSM */
1030 double row[COMPO_LARGEST_ALPHABET];
1031 /* A matrix with one row */
1032 double * row_matrix[1];
1033
1034 row_matrix[0] = row;
1035
1036 for (p = 0; p < rows; p++) {
1037 double Xscore;
1038 memcpy(row, freq_ratios[p], cols * sizeof(double));
1039
1040 Blast_FreqRatioToScore(row_matrix, 1, cols, Lambda);
1041 row[eXchar] = Xscore = s_CalcXScore(row, cols, 1, col_prob);
1042 /* use Cysteine score for Selenocysteine */
1043 row[eSelenocysteine] = row[eCchar];
1044 if (cols > eOchar) {
1045 row[eOchar] = Xscore;
1046 }
1047 s_RoundScoreMatrix(&matrix[p], 1, cols, row_matrix);
1048
1049 matrix[p][eStopChar] = start_matrix[p][eStopChar];
1050 }
1051 }
1052
1053
1054 /**
1055 * Fill in all scores for a scoring matrix for the NCBIstdaa alphabet
1056 * at a given scale Lambda.
1057 *
1058 * @param matrix the newly computed matrix [output]
1059 * @param alphsize the alphabet size
1060 * @param freq_ratios frequency ratios defining the PSSM
1061 * @param start_matrix an existing matrix; used to set values for the
1062 * stop character.
1063 * @param row_prob letter probabilities for the sequence
1064 * corresponding to the rows of the matrix.
1065 * @param col_prob letter probabilities for the sequence
1066 * corresponding to the columns of the matrix.
1067 * @param Lambda scale of the new matrix
1068 */
1069 static int
s_ScaleSquareMatrix(int ** matrix,int alphsize,double ** freq_ratios,int ** start_matrix,const double row_prob[],const double col_prob[],double Lambda)1070 s_ScaleSquareMatrix(int **matrix, int alphsize,
1071 double ** freq_ratios, int ** start_matrix,
1072 const double row_prob[], const double col_prob[],
1073 double Lambda)
1074 {
1075 double ** scores; /* a double precision matrix of scores */
1076 int i; /* iteration index */
1077
1078 scores = Nlm_DenseMatrixNew(alphsize, alphsize);
1079 if (scores == 0) return -1;
1080
1081 for (i = 0; i < alphsize; i++) {
1082 memcpy(scores[i], freq_ratios[i], alphsize * sizeof(double));
1083 }
1084 Blast_FreqRatioToScore(scores, alphsize, alphsize, Lambda);
1085 s_SetXUOScores(scores, alphsize, row_prob, col_prob);
1086 s_RoundScoreMatrix(matrix, alphsize, alphsize, scores);
1087 for (i = 0; i < alphsize; i++) {
1088 matrix[i][eStopChar] = start_matrix[i][eStopChar];
1089 matrix[eStopChar][i] = start_matrix[eStopChar][i];
1090 }
1091 Nlm_DenseMatrixFree(&scores);
1092
1093 return 0;
1094 }
1095
1096
1097 /** LambdaRatioLowerBound is used when the expected score is too large
1098 * causing impalaKarlinLambdaNR to give a Lambda estimate that
1099 * is too small, or to fail entirely returning -1*/
1100 #define LambdaRatioLowerBound 0.5
1101
1102
1103 /* Documented in composition_adjustment.h. */
1104 int
Blast_CompositionBasedStats(int ** matrix,double * LambdaRatio,const Blast_MatrixInfo * ss,const double queryProb[],const double resProb[],double (* calc_lambda)(double *,int,int,double),int pValueAdjustment)1105 Blast_CompositionBasedStats(int ** matrix, double * LambdaRatio,
1106 const Blast_MatrixInfo * ss,
1107 const double queryProb[], const double resProb[],
1108 double (*calc_lambda)(double*,int,int,double),
1109 int pValueAdjustment)
1110 {
1111 double correctUngappedLambda; /* new value of ungapped lambda */
1112 int obs_min, obs_max; /* smallest and largest score in the
1113 unscaled matrix */
1114 double *scoreArray; /* an array of score probabilities */
1115 int out_of_memory; /* status flag to indicate out of memory */
1116
1117 if (ss->positionBased) {
1118 out_of_memory =
1119 s_GetPssmScoreProbs(&scoreArray, &obs_min, &obs_max,
1120 ss->startMatrix, ss->rows, resProb);
1121 } else {
1122 out_of_memory =
1123 s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max,
1124 ss->startMatrix, ss->cols,
1125 resProb, queryProb);
1126 }
1127 if (out_of_memory)
1128 return -1;
1129 correctUngappedLambda =
1130 calc_lambda(scoreArray, obs_min, obs_max, ss->ungappedLambda);
1131
1132 /* calc_lambda will return -1 in the case where the
1133 * expected score is >=0; however, because of the MAX statement 3
1134 * lines below, LambdaRatio should always be > 0; the succeeding
1135 * test is retained as a vestige, in case one wishes to remove the
1136 * MAX statement and allow LambdaRatio to take on the error value
1137 * -1 */
1138 *LambdaRatio = correctUngappedLambda / ss->ungappedLambda;
1139 if (0 == pValueAdjustment)
1140 *LambdaRatio = MIN(1, *LambdaRatio);
1141 *LambdaRatio = MAX(*LambdaRatio, LambdaRatioLowerBound);
1142
1143 if (*LambdaRatio > 0) {
1144 double scaledLambda = ss->ungappedLambda/(*LambdaRatio);
1145 if (ss->positionBased) {
1146 s_ScalePSSM(matrix, ss->rows, ss->cols, ss->startFreqRatios,
1147 ss->startMatrix, resProb, scaledLambda);
1148 } else {
1149 s_ScaleSquareMatrix(matrix, ss->cols,
1150 ss->startFreqRatios, ss->startMatrix,
1151 queryProb, resProb, scaledLambda);
1152 }
1153 }
1154 free(scoreArray);
1155
1156 return 0;
1157 }
1158
1159
1160 /* Documented in composition_adjustment.h. */
1161 void
Blast_ReadAaComposition(Blast_AminoAcidComposition * composition,int alphsize,const Uint1 * sequence,int length)1162 Blast_ReadAaComposition(Blast_AminoAcidComposition * composition,
1163 int alphsize,
1164 const Uint1 * sequence, int length)
1165 {
1166 int i; /* iteration index */
1167
1168 /* fields of composition as local variables */
1169 int numTrueAminoAcids = 0;
1170 double * prob = composition->prob;
1171
1172 for (i = 0; i < alphsize; i++) {
1173 prob[i] = 0.0;
1174 }
1175 for (i = 0; i < length; i++) {
1176 if (alphaConvert[sequence[i]] >= 0 || sequence[i] == eSelenocysteine) {
1177 prob[sequence[i]]++;
1178 numTrueAminoAcids++;
1179 }
1180 }
1181
1182 /* Count Selenocysteine (U) as if it was Cysteine (C) to avoid a result
1183 where C aligned to U is scored higher than C aligned to C. Otherwise,
1184 because U is not counted as a true amino acid, sequences with U would
1185 appear more complex than those composed only of the 20 "true" amino
1186 acids. Therefore computationally adjusted alignment score would be
1187 larger for sequences containing U than for sequences with C. */
1188 if (prob[eSelenocysteine] > 0.0) {
1189 prob[eCchar] += prob[eSelenocysteine];
1190 prob[eSelenocysteine] = 0.0;
1191 }
1192
1193 composition->numTrueAminoAcids = numTrueAminoAcids;
1194 if (numTrueAminoAcids > 0) {
1195 for (i = 0; i < alphsize; i++) {
1196 prob[i] /= numTrueAminoAcids;
1197 }
1198 }
1199 }
1200
1201
1202 /* Documented in composition_adjustment.h. */
1203 void
Blast_GetCompositionRange(int * pleft,int * pright,const Uint1 * subject_data,int length,int start,int finish)1204 Blast_GetCompositionRange(int * pleft, int * pright,
1205 const Uint1 * subject_data, int length,
1206 int start, int finish)
1207 {
1208 int i; /* iteration index */
1209 int left, right;
1210
1211 left = start;
1212 /* Search leftward for a StopChar */
1213 for (i = left; i > 0; i--) {
1214 if (subject_data[i - 1] == eStopChar) {
1215 /* We have found a StopChar. Unless the StopChar is
1216 * too close to the start of the subject region of the
1217 * HSP, */
1218 if (i + kCompositionMargin < left) {
1219 /* reset the left endpoint. */
1220 left = i + kCompositionMargin;
1221 }
1222 break;
1223 }
1224 }
1225 if (i == 0) {
1226 /* No stop codon was found to the left. */
1227 left = 0;
1228 }
1229 right = finish;
1230 /* Search rightward for a StopChar */
1231 for (i = right; i < length; i++) {
1232 if (subject_data[i] == eStopChar) {
1233 /* We have found a StopChar. Unless the StopChar is
1234 * too close to the end of the subject region of the
1235 * HSP, */
1236 if (i - kCompositionMargin > right) {
1237 /* reset the right endpoint */
1238 right = i - kCompositionMargin;
1239 }
1240 break;
1241 }
1242 }
1243 if (i == length) {
1244 /* No stop codon was found to the right. */
1245 right = length;
1246 }
1247 *pleft = left; *pright = right;
1248 }
1249
1250
1251 /* Documented in composition_adjustment.h. */
1252 void
Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** pNRrecord)1253 Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** pNRrecord)
1254 {
1255 Blast_CompositionWorkspace * NRrecord = *pNRrecord;
1256
1257 if (NRrecord != NULL) {
1258 free(NRrecord->first_standard_freq);
1259 free(NRrecord->second_standard_freq);
1260
1261 Nlm_DenseMatrixFree(&NRrecord->mat_final);
1262 Nlm_DenseMatrixFree(&NRrecord->mat_b);
1263
1264 free(NRrecord);
1265 }
1266 *pNRrecord = NULL;
1267 }
1268
1269
1270 /* Documented in composition_adjustment.h. */
Blast_CompositionWorkspaceNew(void)1271 Blast_CompositionWorkspace * Blast_CompositionWorkspaceNew(void)
1272 {
1273 Blast_CompositionWorkspace * NRrecord; /* record to allocate
1274 and return */
1275 int i; /* loop index */
1276
1277 NRrecord = (Blast_CompositionWorkspace *)
1278 malloc(sizeof(Blast_CompositionWorkspace));
1279 if (NRrecord == NULL) goto error_return;
1280
1281 NRrecord->first_standard_freq = NULL;
1282 NRrecord->second_standard_freq = NULL;
1283 NRrecord->mat_final = NULL;
1284 NRrecord->mat_b = NULL;
1285
1286 NRrecord->first_standard_freq =
1287 (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
1288 if (NRrecord->first_standard_freq == NULL) goto error_return;
1289
1290 NRrecord->second_standard_freq =
1291 (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
1292 if (NRrecord->second_standard_freq == NULL) goto error_return;
1293
1294 NRrecord->mat_final = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
1295 COMPO_NUM_TRUE_AA);
1296 if (NRrecord->mat_final == NULL) goto error_return;
1297
1298 NRrecord->mat_b = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
1299 COMPO_NUM_TRUE_AA);
1300 if (NRrecord->mat_b == NULL) goto error_return;
1301
1302 for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
1303 NRrecord->first_standard_freq[i] =
1304 NRrecord->second_standard_freq[i] = 0.0;
1305 }
1306
1307 goto normal_return;
1308 error_return:
1309 Blast_CompositionWorkspaceFree(&NRrecord);
1310 normal_return:
1311 return NRrecord;
1312 }
1313
1314
1315 /* Documented in composition_adjustment.h. */
1316 int
Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord,const char * matrixName)1317 Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord,
1318 const char *matrixName)
1319 {
1320 if (0 == Blast_GetJointProbsForMatrix(NRrecord->mat_b,
1321 NRrecord->first_standard_freq,
1322 NRrecord->second_standard_freq,
1323 matrixName)) {
1324 return 0;
1325 } else {
1326 fprintf(stderr,
1327 "Matrix %s not currently supported for RE based adjustment\n",
1328 matrixName);
1329 return -1;
1330 }
1331 }
1332
1333
1334 /* Documented in composition_adjustment.h. */
1335 int
Blast_CompositionMatrixAdj(int ** matrix,int alphsize,EMatrixAdjustRule matrix_adjust_rule,int length1,int length2,const double * stdaa_row_probs,const double * stdaa_col_probs,int pseudocounts,double specifiedRE,Blast_CompositionWorkspace * NRrecord,const Blast_MatrixInfo * matrixInfo)1336 Blast_CompositionMatrixAdj(int ** matrix,
1337 int alphsize,
1338 EMatrixAdjustRule matrix_adjust_rule,
1339 int length1,
1340 int length2,
1341 const double * stdaa_row_probs,
1342 const double * stdaa_col_probs,
1343 int pseudocounts,
1344 double specifiedRE,
1345 Blast_CompositionWorkspace * NRrecord,
1346 const Blast_MatrixInfo * matrixInfo)
1347 {
1348 int iteration_count, status;
1349 double row_probs[COMPO_NUM_TRUE_AA], col_probs[COMPO_NUM_TRUE_AA];
1350 /* Target RE when optimizing the matrix; zero if the relative
1351 entropy should not be constrained. */
1352 double dummy, desired_re = 0.0;
1353
1354 s_GatherLetterProbs(row_probs, stdaa_row_probs, alphsize);
1355 s_GatherLetterProbs(col_probs, stdaa_col_probs, alphsize);
1356
1357 switch (matrix_adjust_rule) {
1358 case eUnconstrainedRelEntropy:
1359 desired_re = 0.0;
1360 break;
1361 case eRelEntropyOldMatrixNewContext:
1362 /* Calculate the desired re using the new marginal probs with
1363 the old matrix */
1364 status = Blast_EntropyOldFreqNewContext(&desired_re, &dummy,
1365 &iteration_count,
1366 NRrecord->mat_b,
1367 row_probs, col_probs);
1368 if (status < 0) /* Error, e.g. memory */
1369 return status;
1370 else if (status > 0) /* we could not calculate the desired re */
1371 desired_re = 0.0; /* so, leave the re unconstrained */
1372
1373 break;
1374 case eRelEntropyOldMatrixOldContext:
1375 desired_re = Blast_TargetFreqEntropy(NRrecord->mat_b);
1376 break;
1377 case eUserSpecifiedRelEntropy:
1378 desired_re = specifiedRE;
1379 break;
1380 default: /* I assert that we can't get here */
1381 fprintf(stderr, "Unknown flag for setting relative entropy"
1382 "in composition matrix adjustment");
1383 exit(1);
1384 }
1385 Blast_ApplyPseudocounts(row_probs, length1,
1386 NRrecord->first_standard_freq, pseudocounts);
1387 Blast_ApplyPseudocounts(col_probs, length2,
1388 NRrecord->second_standard_freq, pseudocounts);
1389
1390 status =
1391 Blast_OptimizeTargetFrequencies(&NRrecord->mat_final[0][0],
1392 COMPO_NUM_TRUE_AA,
1393 &iteration_count,
1394 &NRrecord->mat_b[0][0],
1395 row_probs, col_probs,
1396 (desired_re > 0.0),
1397 desired_re,
1398 kCompoAdjustErrTolerance,
1399 kCompoAdjustIterationLimit);
1400
1401 if (status != 0) /* Did not compute the target freqs */
1402 return status;
1403
1404 return
1405 s_ScoresStdAlphabet(matrix, alphsize, NRrecord->mat_final,
1406 matrixInfo->startMatrix,
1407 row_probs, col_probs,
1408 matrixInfo->ungappedLambda);
1409 }
1410
1411
1412 /* Documented in composition_adjustment.h. */
1413 int
Blast_AdjustScores(int ** matrix,const Blast_AminoAcidComposition * query_composition,int queryLength,const Blast_AminoAcidComposition * subject_composition,int subjectLength,const Blast_MatrixInfo * matrixInfo,ECompoAdjustModes composition_adjust_mode,int RE_pseudocounts,Blast_CompositionWorkspace * NRrecord,EMatrixAdjustRule * matrix_adjust_rule,double calc_lambda (double *,int,int,double),double * pvalueForThisPair,int compositionTestIndex,double * ratioToPassBack)1414 Blast_AdjustScores(int ** matrix,
1415 const Blast_AminoAcidComposition * query_composition,
1416 int queryLength,
1417 const Blast_AminoAcidComposition * subject_composition,
1418 int subjectLength,
1419 const Blast_MatrixInfo * matrixInfo,
1420 ECompoAdjustModes composition_adjust_mode,
1421 int RE_pseudocounts,
1422 Blast_CompositionWorkspace *NRrecord,
1423 EMatrixAdjustRule *matrix_adjust_rule,
1424 double calc_lambda(double *,int,int,double),
1425 double *pvalueForThisPair,
1426 int compositionTestIndex,
1427 double *ratioToPassBack)
1428 {
1429 const int alphsize = matrixInfo->cols;
1430
1431 double lambdaForPair; /*lambda for this pair of compositions*/
1432 int iter_count; /*used as argument to Blast_CalcLambdaFullPrecision*/
1433
1434 /* The next two arrays are letter probabilities of query and
1435 * match in 20 letter ARND... alphabet. */
1436 double permutedQueryProbs[COMPO_NUM_TRUE_AA];
1437 double permutedMatchProbs[COMPO_NUM_TRUE_AA];
1438
1439 if (query_composition->numTrueAminoAcids == 0 ||
1440 subject_composition->numTrueAminoAcids == 0) {
1441 /* Either the query or subject contains only ambiguity
1442 characters, most likely because the entire subject has been
1443 SEGed. Compositional adjustment is meaningless. */
1444 return 1;
1445 }
1446
1447 if ((compositionTestIndex > 0) ||
1448 ((!(matrixInfo->positionBased)) &&
1449 (composition_adjust_mode != eCompositionBasedStats))) {
1450 s_GatherLetterProbs(permutedQueryProbs,
1451 query_composition->prob, alphsize);
1452 s_GatherLetterProbs(permutedMatchProbs,
1453 subject_composition->prob, alphsize);
1454 }
1455
1456 if (compositionTestIndex > 0) {
1457 int i,j; /*loop indices*/
1458 /* a score matrix to pass*/
1459 double **scores = Nlm_DenseMatrixNew(alphsize, alphsize);
1460
1461 if (scores == NULL) {
1462 return -1;
1463 }
1464 for (i = 0; i < COMPO_NUM_TRUE_AA; i++)
1465 for (j = 0; j < COMPO_NUM_TRUE_AA; j++)
1466 scores[i][j] = BLOS62[i][j];
1467 Blast_CalcLambdaFullPrecision(&lambdaForPair, &iter_count,
1468 scores,
1469 COMPO_NUM_TRUE_AA,
1470 permutedQueryProbs,
1471 permutedMatchProbs,
1472 kLambdaErrorTolerance,
1473 kLambdaFunctionTolerance,
1474 kLambdaIterationLimit);
1475 if (iter_count >= kLambdaIterationLimit) {
1476 /* The algorithm for lambda didn't converge, likely
1477 * because the matrix has positive expected score;
1478 * set lambda to the smallest value in the table. */
1479 lambdaForPair = COMPO_MIN_LAMBDA;
1480 }
1481 /*use lengths of query and subject not counting X's */
1482 *pvalueForThisPair = Blast_CompositionPvalue(lambdaForPair);
1483
1484 Nlm_DenseMatrixFree(&scores);
1485 }
1486
1487 if (matrixInfo->positionBased ||
1488 composition_adjust_mode == eCompositionBasedStats) {
1489 /* Use old-style composition-based statistics unconditionally. */
1490 *matrix_adjust_rule = eCompoScaleOldMatrix;
1491 } else {
1492 /* else call Yi-Kuo's code to choose mode for matrix adjustment. */
1493 *matrix_adjust_rule =
1494 Blast_ChooseMatrixAdjustRule(queryLength, subjectLength,
1495 permutedQueryProbs,
1496 permutedMatchProbs,
1497 matrixInfo->matrixName,
1498 composition_adjust_mode);
1499 } /* end else call Yi-Kuo's code to choose mode for matrix adjustment. */
1500
1501 if (eCompoScaleOldMatrix != *matrix_adjust_rule) {
1502 /* Try matrix optimization, if it fails to converge, we
1503 fall back to traditional scaling below */
1504 int status =
1505 Blast_CompositionMatrixAdj(matrix,
1506 alphsize,
1507 *matrix_adjust_rule,
1508 query_composition->
1509 numTrueAminoAcids,
1510 subject_composition->
1511 numTrueAminoAcids,
1512 query_composition->prob,
1513 subject_composition->prob,
1514 RE_pseudocounts,
1515 kFixedReBlosum62,
1516 NRrecord,
1517 matrixInfo);
1518
1519 *ratioToPassBack = 1.0; /* meaningless for this mode */
1520 if (status <= 0)
1521 return status; /* Success (=0) or fatal error (<0)*/
1522
1523 } /* End try matrix optimization */
1524
1525 *matrix_adjust_rule = eCompoScaleOldMatrix;
1526 return Blast_CompositionBasedStats(matrix, ratioToPassBack, matrixInfo,
1527 query_composition->prob,
1528 subject_composition->prob,
1529 calc_lambda,
1530 (compositionTestIndex > 0));
1531 }
1532