1 /* ===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================*/
24 
25 /** @file composition_adjustment.c
26  * Highest level functions to solve the optimization problem for
27  * compositional score matrix adjustment.
28  *
29  * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz
30  */
31 
32 #include <limits.h>
33 #include <assert.h>
34 #include <algo/blast/core/ncbi_std.h>
35 #include <algo/blast/composition_adjustment/composition_constants.h>
36 #include <algo/blast/composition_adjustment/composition_adjustment.h>
37 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
38 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
39 #include <algo/blast/composition_adjustment/optimize_target_freq.h>
40 #include <algo/blast/composition_adjustment/unified_pvalues.h>
41 
42 
43 /**positions of true characters in protein alphabet*/
44 static int trueCharPositions[COMPO_NUM_TRUE_AA] =
45 {1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22};
46 
47 /**
48  * conversion from 26 letter NCBIstdaa alphabet to 20 letter order
49  * for true amino acids: ARNDCQEGHILKMFPSTWYV.  This order is
50  * alphabetical in the standard three-letter abbreviation of each
51  * amino acid */
52 static int alphaConvert[COMPO_LARGEST_ALPHABET] =
53   {(-1), 0, (-1),  4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15,
54    16, 19, 17, (-1), 18, (-1), (-1), (-1), (-1), (-1)};
55 
56 
57 /**
58  * Desired margin between an end of region used for computing a
59  * composition, and the nearest StopChar; the desired margin may
60  * not be attained. */
61 static const int kCompositionMargin = 20;
62 
63 /** iteration limit for Newton's method for computing Lambda */
64 static const int kLambdaIterationLimit = 100;
65 /** bound on error for estimating lambda */
66 static const double kLambdaErrorTolerance = 0.0000001;
67 /** bound on the difference between the expected value of
68  *  exp(lambda x S) and 1 */
69 static const double kLambdaFunctionTolerance = 0.00001;
70 
71 
72 /** bound on error for Newton's method */
73 static const double kCompoAdjustErrTolerance = 0.00000001;
74 /** iteration limit for Newton's method */
75 static const int kCompoAdjustIterationLimit = 2000;
76 /** relative entropy of BLOSUM62 */
77 static const double kFixedReBlosum62 = 0.44;
78 /** largest permitted value of score(i,X), when scores are computed using
79     composition adjustment */
80 static const double kMaximumXscore = -1.0;
81 
82 /** The BLOSUM62 matrix for the ARND... alphabet, using standard scale */
83 static double BLOS62[COMPO_NUM_TRUE_AA][COMPO_NUM_TRUE_AA]=
84 {{4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0},
85  {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3},
86  {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3},
87  {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3},
88  {0,-3,-3,-3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1},
89  {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2},
90  {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2},
91  {0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3},
92  {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3},
93  {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3},
94  {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1},
95  {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2},
96  {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1},
97  {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1},
98  {-1,-2,-2,-1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2},
99  {1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2},
100  {0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0},
101  {-3,-3,-4,-4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3},
102  {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1},
103  {0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4}};
104 
105 /* Documented in composition_adjustment.h. */
106 void
Blast_ApplyPseudocounts(double * probs20,int number_of_observations,const double * background_probs20,int pseudocounts)107 Blast_ApplyPseudocounts(double * probs20,
108                     int number_of_observations,
109                     const double * background_probs20,
110                     int pseudocounts)
111 {
112     int i;                 /* loop index */
113     double weight;         /* weight assigned to pseudocounts */
114     double sum;            /* sum of the observed frequencies */
115     /* pseudocounts as a double */
116     double dpseudocounts = pseudocounts;
117 
118     /* Normalize probabilities */
119     sum = 0.0;
120     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
121         sum += probs20[i];
122     }
123     if (sum == 0.0) {  /* Can't normalize a zero vector */
124         sum = 1.0;
125     }
126     weight = dpseudocounts / (number_of_observations + dpseudocounts);
127     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
128         probs20[i] = (1.0 - weight) * probs20[i] / sum
129             + weight * background_probs20[i];
130     }
131 }
132 
133 
134 /* Documented in composition_adjustment.h. */
135 double
Blast_GetRelativeEntropy(const double A[],const double B[])136 Blast_GetRelativeEntropy(const double A[], const double B[])
137 {
138     int i;                 /* loop index over letters */
139     double temp;           /* intermediate term */
140     double value = 0.0;    /* square of relative entropy */
141 
142     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
143         temp = (A[i] + B[i]) / 2;
144         if (temp > 0) {
145             if (A[i] > 0) {
146                 value += A[i] * log(A[i] / temp) / 2;
147             }
148             if (B[i] > 0) {
149                 value += B[i] * log(B[i] / temp) / 2;
150             }
151         }
152     }
153     if (value < 0) {             /* must be numerical rounding error */
154         value = 0;
155     }
156     return sqrt(value);
157 }
158 
159 
160 
161 /* Blast_CalcLambdaFullPrecision -- interface documented in
162  * composition_adjustment.h.
163  *
164  * If the average score for a composition is negative, and the maximum
165  * score that occurs with nonzero probability is positive, then
166  * statistical parameter Lambda exists and is the unique, positive
167  * solution to
168  *
169  *    phi(lambda) = sum_{i,j} P_1(i) P_2(j) exp(S_{ij} lambda) - 1 = 0,
170  *
171  * where S_{ij} is the matrix "score" and P_1 and P_2 are row_probs and
172  * col_probs respectively.
173  *
174  * It is simpler to solve this problem in x = exp(-lambda) than it is
175  * to solve it in lambda, because we know that for x, a solution lies
176  * in [0,1].  Furthermore, if M is the largest S_{ij} so that P_1(i)
177  * and P_2(j) are nonzero, then the function
178  *
179  *    f(x) = -x^M - sum_{i,j} P_1(i) P_2(j) x^{M - S_{ij}},
180  *
181  * obtained by multiplying phi(lambda) by x^M, is well behaved in
182  * (0,1] -- if the scores are integers, it is a polynomial.  Since x = 0
183  * is not a solution, x solves f(x) = 0 in [0,1), if and only if
184  * lambda = -ln(x) is a positive solution to phi(lambda).  Therefore,
185  * we may define a safeguarded Newton iteration to find a solution of
186  * f(x) = 0.
187  *
188  * For the most part, this is a standard safeguarded Newton iteration:
189  * define an interval of uncertainty [a,b] with f(a) > 0 and f(b) < 0
190  * (except for the initial value b = 1, where f(b) = 0); evaluate the
191  * function and use the sign of that value to shrink the interval of
192  * uncertainty; compute a Newton step; and if the Newton step suggests
193  * a point outside the interval of uncertainty or fails to decrease
194  * the function sufficiently, then bisect.  There are three further
195  * details needed to understand the algorithm:
196  *
197  * 1)  If y the unique solution in [0,1], then f is positive to the left of
198  *     y, and negative to the right.  Therefore, we may determine whether
199  *     the Newton step -f(x)/f'(x) is moving toward, or away from, y by
200  *     examining the sign of f'(x).  If f'(x) >= 0, we bisect, instead
201  *     of taking the Newton step.
202  * 2)  There is a neighborhood around x = 1 for which f'(x) >= 0,
203  *     so (1) prevents convergence to x = 1.
204  * 3)  Conditions like  fabs(p) < lambda_tolerance * x * (1-x) are used in
205  *     convergence criteria because these values translate to a bound
206  *     on the relative error in lambda.  This is proved in the
207  *     "Blast Scoring Parameters" document that accompanies the BLAST
208  *     code.
209  *
210  * We have observed that in typical cases the safeguarded Newton
211  * iteration on f(x) requires half the iterations of a Newton
212  * iteration on phi(lambda). More importantly, the iteration on f(x)
213  * is robust and doesn't overflow; defining a robust safeguarded
214  * Newton iteration on phi(lambda) that cannot converge to zero and
215  * that is protected against overflow is more difficult.  So (despite
216  * the length of this comment) the Newton iteration on f(x) is the
217  * simpler solution.
218  */
219 void
Blast_CalcLambdaFullPrecision(double * plambda,int * piterations,double ** score,int alphsize,const double row_prob[],const double col_prob[],double lambda_tolerance,double function_tolerance,int max_iterations)220 Blast_CalcLambdaFullPrecision(double * plambda, int *piterations,
221                               double **score, int alphsize,
222                               const double row_prob[],
223                               const double col_prob[],
224                               double lambda_tolerance,
225                               double function_tolerance,
226                               int max_iterations)
227 {
228     double f = 4;               /* The current function value; initially
229                                    set to a value greater than any possible
230                                    real value of f */
231     double left = 0, right = 1; /* (left, right) is an interval containing
232                                    a solution */
233     double x = 0.367879441171;  /* The current iterate; initially exp(-1) */
234     int is_newton = 0;          /* true if the last iteration was a Newton
235                                    step; initially false */
236     int i, j, k;                /* iteration indices */
237     /* maximum score that occurs with nonzero probability */
238     double max_score = COMPO_SCORE_MIN;
239     /* average score */
240     double avg_score = 0.0;
241 
242     /* Find the maximum score with nonzero probability */
243     for (i = 0;  i < alphsize;  i++) {
244         if (row_prob[i] == 0.0) {
245             continue;
246         }
247         for (j = 0;  j < alphsize;  j++) {
248             if (col_prob[j] == 0.0) {
249                 continue;
250             }
251             if (max_score < score[i][j]) {
252                 max_score = score[i][j];
253             }
254             avg_score += row_prob[i] * col_prob[j] * score[i][j];
255         }
256     }
257     if (max_score <= 0.0 || avg_score >= 0) {
258         /* The iteration cannot converge if max_score is nonpositive
259          * or the average score is nonnegative; lambda doesn't exist */
260         *piterations = max_iterations;
261         *plambda = -1.0;
262         return;
263     }
264     for (k = 0;  k < max_iterations;  k++) {
265         double slope;               /* slope of f at x */
266         double fold = f;            /* previous value of f */
267         double x_pow_max_score;     /* x raised to the power max_score */
268         double lambda = -log(x);    /* the iterate lambda, see above */
269         int was_newton = is_newton; /* true if the previous iteration
270                                        was a Newton step; instead of a
271                                        bisection step */
272         /* Evaluate the function and its derivative */
273         x_pow_max_score = exp(-max_score * lambda);
274         f = -x_pow_max_score;
275         slope = max_score * f / x;
276         for (i = 0;  i < alphsize;  i++) {
277             if (row_prob[i] == 0.0) {
278                 continue;
279             }
280             for (j = 0;  j < alphsize;  j++) {
281                 double ff;  /* a term in the sum used to compute f */
282 
283                if (col_prob[j] == 0.0) {
284                    continue;
285                }
286                if (max_score != score[i][j]) {
287                    double diff_score = max_score - score[i][j];
288 
289                    ff = row_prob[i] * col_prob[j] * exp(-lambda * diff_score);
290                    slope += diff_score * ff / x;
291                } else {
292                    ff = row_prob[i] * col_prob[j];
293                }
294                f += ff;
295             }
296         }
297         /* Finished evaluating the function and its derivative */
298         if (f > 0) {
299             left = x; /* move the left endpoint */
300         } else if (f < 0) {
301             right = x; /* move the right endpoint */
302         } else { /* f == 0 */
303             break; /* x is an exact solution */
304         }
305         if (right - left <= 2 * left * (1 - right) * lambda_tolerance &&
306             fabs(f/x_pow_max_score) <= function_tolerance) {
307             /* The midpoint of the interval converged */
308             x = (left + right) / 2;
309             break;
310         }
311         if ((was_newton && fabs(f) > .9 * fabs(fold))
312             /* if the previous iteration was a Newton step but didn't
313              * decrease f sufficiently; or */
314              || slope >= 0
315              /* if a Newton step will move us away from the desired solution */
316         ) {/* then */
317             x = (left + right)/2;  /* bisect */
318         } else {
319             double p = -f/slope;   /* The Newton step */
320             double y = x + p;      /* The proposed next iterate */
321             if (y <= left || y >= right) { /* The proposed iterate is
322                                               not in (left,right) */
323                 x = (left + right)/2;  /* bisect */
324             } else {/* The proposed iterate is in (left,right). Accept it. */
325                 is_newton = 1;
326                 x = y;
327                 if (fabs(p) <= lambda_tolerance * x * (1-x) &&
328                     fabs(f/x_pow_max_score) <= function_tolerance) break;
329             }
330         }
331     }  /* End for all iterations k */
332     *plambda = (k < max_iterations) ? -log(x) : -1.0;
333     *piterations = k;
334 }
335 
336 
337 /* Documented in composition_adjustment.h. */
338 double
Blast_MatrixEntropy(double ** matrix,int alphsize,const double row_prob[],const double col_prob[],double Lambda)339 Blast_MatrixEntropy(double ** matrix, int alphsize, const double row_prob[],
340                     const double col_prob[], double Lambda)
341 {
342     int i, j;
343     double entropy = 0.0;
344     for (i = 0;  i < alphsize;  i++) {
345         for (j = 0;  j < alphsize;  j++) {
346             /* the score at (i,j), rescaled to nats */
347             double nat_score = Lambda * matrix[i][j];
348             entropy += nat_score * exp(nat_score) * row_prob[i] * col_prob[j];
349         }
350     }
351     return entropy;
352 }
353 
354 
355 
356 /* Documented in composition_adjustment.h. */
357 double
Blast_TargetFreqEntropy(double ** target_freq)358 Blast_TargetFreqEntropy(double ** target_freq)
359 {
360     int i, j;          /* iteration indices */
361     double entropy;    /* the entropy to be returned */
362     /* Row probabilities consistent with the target frequencies */
363     double row_prob[COMPO_NUM_TRUE_AA] = {0,};
364     double col_prob[COMPO_NUM_TRUE_AA] = {0,};
365 
366     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
367         for (j = 0;  j < COMPO_NUM_TRUE_AA;  j++) {
368             row_prob[i] += target_freq[i][j];
369             col_prob[j] += target_freq[i][j];
370         }
371     }
372     entropy = 0.0;
373     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
374         for (j = 0;  j < COMPO_NUM_TRUE_AA;  j++) {
375             double freq = target_freq[i][j];
376             entropy += freq * log(freq / row_prob[i] / col_prob[j]);
377         }
378     }
379     return entropy;
380 }
381 
382 
383 /* Documented in composition_adjustment.h. */
384 void
Blast_CalcFreqRatios(double ** ratios,int alphsize,double row_prob[],double col_prob[])385 Blast_CalcFreqRatios(double ** ratios, int alphsize,
386                      double row_prob[], double col_prob[])
387 {
388     int i, j;
389     for (i = 0;  i < alphsize;  i++) {
390         if (row_prob[i] > 0) {
391             for (j = 0;  j < alphsize;  j++) {
392                 if (col_prob[j] > 0) {
393                     ratios[i][j] /= (row_prob[i] * col_prob[j]);
394                 }
395             }
396         }
397     }
398 }
399 
400 
401 /* Documented in composition_adjustment.h. */
402 void
Blast_FreqRatioToScore(double ** matrix,int rows,int cols,double Lambda)403 Blast_FreqRatioToScore(double ** matrix, int rows, int cols, double Lambda)
404 {
405     int i;
406     for (i = 0;  i < rows;  i++) {
407         int j;
408         for (j = 0;  j < cols;  j++) {
409             if (0.0 == matrix[i][j]) {
410                 matrix[i][j] = COMPO_SCORE_MIN;
411             } else {
412                 matrix[i][j] = log(matrix[i][j])/Lambda;
413             }
414         }
415     }
416 }
417 
418 
419 /**
420  * Convert letter probabilities from the NCBIstdaa alphabet to
421  * a 20 letter ARND... amino acid alphabet. (@see alphaConvert)
422  *
423  * @param outputLetterProbs   the ARND letter probabilities [out]
424  * @param inputLetterProbs    the NCBIstdaa letter probabilities [in]
425  * @param alphsize            the size of the alphabet
426  */
427 static void
s_GatherLetterProbs(double * outputLetterProbs,const double * inputLetterProbs,int alphsize)428 s_GatherLetterProbs(double * outputLetterProbs,
429                     const double * inputLetterProbs, int alphsize)
430 {
431     int c; /*index over characters*/
432 
433     for (c = 0;  c < alphsize;  c++) {
434         if ((-1) != alphaConvert[c]) {
435             outputLetterProbs[alphaConvert[c]] = inputLetterProbs[c];
436         }
437     }
438 }
439 
440 /**
441  * Convert letter probabilities from a a 20 letter ARND... amino acid
442  * alphabet to the NCBIstdaa alphabet, (@see alphaConvert) setting
443  * probabilities for nonstandard characters to zero.
444  *
445  * @param std_probs   the NCBIstdaa letter probabilities [out]
446  * @param alphsize    size of the NCBIstdaa alphabet [in]
447  * @param probs       the ARND letter probabilities [in]
448  */
449 static void
s_UnpackLetterProbs(double std_probs[],int alphsize,const double probs[])450 s_UnpackLetterProbs(double std_probs[], int alphsize, const double probs[])
451 {
452     int c; /*index over characters*/
453 
454     for (c = 0;  c < alphsize;  c++) {
455         if ((-1) != alphaConvert[c]) {
456             std_probs[c] = probs[alphaConvert[c]];
457         } else {
458             std_probs[c] = 0.0;
459         }
460     }
461 }
462 
463 
464 /**
465  * Set the probabilities for the two-character ambiguity characters.
466  * @param probs     the probabilities for the NCBIstdaa alphabet.
467  *                  On entry, values for the standard amino acids must
468  *                  be set; on exit, values for the two-character
469  *                  ambiguities will also be set.
470  * @param alphsize  the size of the alphabet
471  */
472 static void
s_SetPairAmbigProbsToSum(double probs[],int alphsize)473 s_SetPairAmbigProbsToSum(double probs[], int alphsize)
474 {
475     probs[eBchar] = probs[eDchar] + probs[eNchar];
476     probs[eZchar] = probs[eEchar] + probs[eQchar];
477     if (alphsize > eJchar) {
478         probs[eJchar] = probs[eIchar] + probs[eLchar];
479     }
480 }
481 
482 
483 /* Documented in composition_adjustment.h. */
484 int
Blast_EntropyOldFreqNewContext(double * entropy,double * Lambda,int * iter_count,double ** target_freq,const double row_prob[],const double col_prob[])485 Blast_EntropyOldFreqNewContext(double * entropy,
486                                double * Lambda,
487                                int * iter_count,
488                                double ** target_freq,
489                                const double row_prob[],
490                                const double col_prob[])
491 {
492     /* iteration indices */
493     int i, j;
494     /* Status flag; will be set to zero on success */
495     int status = 1;
496     /* A matrix of scores in the context consistent with the target
497      * frequencies */
498     double  **scores;
499     /* Row and column probabilities consistent with the target
500      * frequencies; the old context */
501     double old_col_prob[COMPO_NUM_TRUE_AA] = {0.0,};
502     double old_row_prob[COMPO_NUM_TRUE_AA] = {0.0,};
503 
504     *entropy = 0;
505     status = 1;
506 
507     /* Calculate the matrix "scores" from the target frequencies */
508     scores = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA, COMPO_NUM_TRUE_AA);
509     if (scores == NULL) {
510         return -1;
511     }
512     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
513         for (j = 0;  j < COMPO_NUM_TRUE_AA; j++) {
514             old_row_prob[i] += target_freq[i][j];
515             old_col_prob[j] += target_freq[i][j];
516         }
517     }
518     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
519         memcpy(scores[i], target_freq[i], COMPO_NUM_TRUE_AA * sizeof(double));
520     }
521     Blast_CalcFreqRatios(scores, COMPO_NUM_TRUE_AA,
522                          old_row_prob, old_col_prob);
523     Blast_FreqRatioToScore(scores, COMPO_NUM_TRUE_AA, COMPO_NUM_TRUE_AA, 1.0);
524     /* Finished calculating the matrix "scores" */
525 
526     Blast_CalcLambdaFullPrecision(Lambda, iter_count, scores,
527                                   COMPO_NUM_TRUE_AA, row_prob,
528                                   col_prob, kLambdaErrorTolerance,
529                                   kLambdaFunctionTolerance,
530                                   kLambdaIterationLimit);
531     if (*iter_count <  kLambdaIterationLimit) {
532         *entropy = Blast_MatrixEntropy(scores, COMPO_NUM_TRUE_AA,
533                                        row_prob, col_prob, *Lambda);
534         status = 0;
535     }
536     Nlm_DenseMatrixFree(&scores);
537     return status;
538 }
539 
540 
541 /* Documented in composition_adjustment.h. */
542 void
Blast_TrueAaToStdTargetFreqs(double ** StdFreq,int StdAlphsize,double ** freq)543 Blast_TrueAaToStdTargetFreqs(double ** StdFreq, int StdAlphsize,
544                              double ** freq)
545 {
546     /* Note I'm using a rough convention for this routine that uppercase
547      * letters refer to quantities in the standard (larger) alphabet
548      * and lowercase letters refer to the true amino acid (smaller)
549      * alphabet.
550      */
551     /* Shorter names for the sizes of the two alphabets */
552     const int small_alphsize = COMPO_NUM_TRUE_AA;
553     int A, B;          /* characters in the std (big) alphabet */
554     int a, b;          /* characters in the small alphabet */
555     double sum;        /* sum of values in target_freq; used to normalize */
556     sum = 0.0;
557     for (a = 0;  a < small_alphsize;  a++) {
558         for (b = 0;  b < small_alphsize;  b++) {
559             sum +=  freq[a][b];
560         }
561     }
562     for (A = 0;  A < StdAlphsize;  A++) {
563         /* for all rows */
564         if (alphaConvert[A] < 0) {
565             /* the row corresponds to a nonstandard reside */
566             for (B = 0;  B < StdAlphsize;  B++) {
567                 StdFreq[A][B] = 0.0;
568             }
569         } else {
570             /* the row corresponds to a standard reside */
571             a = alphaConvert[A];
572 
573             for (B = 0;  B < StdAlphsize;  B++) {
574                 /* for all columns */
575                 if (alphaConvert[B] < 0) {
576                     /* the column corresponds to a nonstandard reside */
577                     StdFreq[A][B] = 0.0;
578                 } else {
579                     /* the column corresponds to a standard reside */
580                     b = alphaConvert[B];
581                     StdFreq[A][B] = freq[a][b] / sum;
582                 }
583             }
584             /* Set values for two-character ambiguities */
585             StdFreq[A][eBchar] = StdFreq[A][eDchar] + StdFreq[A][eNchar];
586             StdFreq[A][eZchar] = StdFreq[A][eEchar] + StdFreq[A][eQchar];
587             if (StdAlphsize > eJchar) {
588                 StdFreq[A][eJchar] = StdFreq[A][eIchar] + StdFreq[A][eLchar];
589             }
590         }
591     }
592     /* Add rows to set values for two-character ambiguities */
593     memcpy(StdFreq[eBchar], StdFreq[eDchar], StdAlphsize * sizeof(double));
594     Nlm_AddVectors(StdFreq[eBchar], StdAlphsize, 1.0, StdFreq[eNchar]);
595 
596     memcpy(StdFreq[eZchar], StdFreq[eEchar], StdAlphsize * sizeof(double));
597     Nlm_AddVectors(StdFreq[eZchar], StdAlphsize, 1.0, StdFreq[eQchar]);
598 
599     if (StdAlphsize > eJchar) {
600         memcpy(StdFreq[eJchar],StdFreq[eIchar], StdAlphsize * sizeof(double));
601         Nlm_AddVectors(StdFreq[eJchar], StdAlphsize, 1.0, StdFreq[eLchar]);
602     }
603 }
604 
605 
606 /**
607  * Calculate the average score of a row or column of a matrix.
608  *
609  * @param M            a vector of scores
610  * @param alphsize     the size of the alphabet
611  * @param incM         stride between elements of M; used to pass a column
612  *                     of a score matrix to this routine by setting incM
613  *                     to the number of elements in a column.
614  * @param probs        background probabilities for a sequence.
615  */
616 static double
s_CalcAvgScore(double * M,int alphsize,int incM,const double probs[])617 s_CalcAvgScore(double * M, int alphsize, int incM, const double probs[])
618 {
619     int j;                   /* iteration index */
620     double score_iX = 0.0;   /* score of character i substituted by X */
621 
622     for (j = 0;  j < alphsize;  j++) {
623         if (alphaConvert[j] >= 0) {
624             /* If the column corresponds to a true amino acid */
625             score_iX += M[j * incM] * probs[j];
626         }
627     }
628     return score_iX;
629 }
630 
631 /**
632  * Calculate values for the X ambiguity score, give an vector of scores and
633  * a set of character probabilities.
634  * @param M            a vector of scores
635  * @param alphsize     the size of the alphabet
636  * @param incM         stride between elements of M; used to pass a column
637  *                     of a score matrix to this routine by setting incM
638  *                     to the number of elements in a column.
639  * @param probs        background probabilities for a sequence.
640  */
641 static double
s_CalcXScore(double * M,int alphsize,int incM,const double probs[])642 s_CalcXScore(double * M, int alphsize, int incM, const double probs[])
643 {
644     return MIN( s_CalcAvgScore(M, alphsize, incM, probs), kMaximumXscore);
645 }
646 
647 
648 /**
649  * Given a standard matrix with the scores for true amino acid, and
650  * pairwise ambiguity scores set, calculate and set the scores for the
651  * X, U (Selenocystiene) and O (pyrrolysine) characters.
652  *
653  * @param M             a scoring matrix
654  * @param alphsize      the size of the alphabet
655  * @param row_probs     character frequencies for the sequence corresponding
656  *                      to the rows of M.
657  * @param col_probs     character frequencies for the sequence corresponding
658  *                      to the columns of M.
659  */
660 static void
s_SetXUOScores(double ** M,int alphsize,const double row_probs[],const double col_probs[])661 s_SetXUOScores(double ** M, int alphsize,
662               const double row_probs[], const double col_probs[])
663 {
664     int i;                      /* iteration index */
665     double score_XX = 0.0;      /* score of matching an X to an X */
666     /* the matrix has alphsize colums (this variable exists just to
667        make things easier to read) */
668     const int cols = alphsize;
669 
670     for (i = 0;  i < alphsize;  i++) {
671         if (alphaConvert[i] >= 0) {
672             double avg_iX = s_CalcAvgScore(M[i], alphsize, 1, col_probs);
673             M[i][eXchar] = MIN(avg_iX, kMaximumXscore);
674             score_XX += avg_iX * row_probs[i];
675 
676             M[eXchar][i] = s_CalcXScore(&M[0][i], alphsize, cols, row_probs);
677         }
678     }
679     M[eXchar][eXchar] = MIN(score_XX, kMaximumXscore);
680 
681     /* Set X scores for pairwise ambiguity characters */
682     M[eBchar][eXchar] = s_CalcXScore(M[eBchar], alphsize, 1, col_probs);
683     M[eXchar][eBchar] = s_CalcXScore(&M[0][eBchar], alphsize, cols, row_probs);
684 
685     M[eZchar][eXchar] = s_CalcXScore(M[eZchar], alphsize, 1, col_probs);
686     M[eXchar][eZchar] = s_CalcXScore(&M[0][eZchar], alphsize, cols, row_probs);
687     if( alphsize > eJchar) {
688         M[eJchar][eXchar] = s_CalcXScore(M[eJchar], alphsize, 1, col_probs);
689         M[eXchar][eJchar] =
690             s_CalcXScore(&M[0][eJchar], alphsize, cols, row_probs);
691     }
692     /* Copy C scores to U */
693     memcpy(M[eSelenocysteine], M[eCchar], alphsize * sizeof(double));
694     for (i = 0;  i < alphsize;  i++) {
695         M[i][eSelenocysteine] = M[i][eCchar];
696     }
697     /* Copy X scores to O */
698     if (alphsize > eOchar) {
699         memcpy(M[eOchar], M[eXchar], alphsize * sizeof(double));
700         for (i = 0;  i < alphsize;  i++) {
701             M[i][eOchar] = M[i][eXchar];
702         }
703     }
704 }
705 
706 
707 /** Return the nearest integer to x. */
Nint(double x)708 static long Nint(double x)
709 {
710     x += (x >= 0. ? 0.5 : -0.5);
711     return (long)x;
712 }
713 
714 
715 /**
716  * Round a floating point matrix of scores to produce an integer matrix of
717  * scores.
718  *
719  * @param matrix             the matrix of integer valued scores [out]
720  * @param floatScoreMatrix   the matrix of floating point valued
721  *                           scores [in]
722  * @param rows               the number of rows of the matrices.
723  * @param cols               the number of columns in matrix.
724  */
725 static void
s_RoundScoreMatrix(int ** matrix,int rows,int cols,double ** floatScoreMatrix)726 s_RoundScoreMatrix(int **matrix, int rows, int cols,
727                    double **floatScoreMatrix)
728 {
729     int p, c; /*indices over positions and characters*/
730 
731     for (p = 0;  p < rows;  p++) {
732         for (c = 0;  c < cols;  c++) {
733             if (floatScoreMatrix[p][c] < INT_MIN) {
734                 matrix[p][c] = INT_MIN;
735             } else {
736                 matrix[p][c] = Nint(floatScoreMatrix[p][c]);
737             }
738         }
739     }
740 }
741 
742 
743 /**
744  * Given a set of target frequencies and two sets of character
745  * probabilities for the true amino acids in the ARND alphabet,
746  * calculate a scoring matrix that has valid entries for all
747  * characters in the NCBIstdaa amino acid alphabet.
748  *
749  * @param Matrix        the newly computed matrix
750  * @param Alphsize      the size of the NCBIstdaa alphabet
751  * @param target_freq   target frequencies for true amino acids (20x20)
752  * @param StartMatrix   a matrix containing values for the stop character
753  * @param row_prob      probabilities of true amino acids in the sequence
754  *                      corresponding to the rows of matrix (length = 20)
755  * @param col_prob      probabilities of true amino acids in the sequence
756  *                      corresponding to the columns of matrix (length = 20)
757  * @param Lambda        the desired scale of this matrix
758  */
759 static int
s_ScoresStdAlphabet(int ** Matrix,int Alphsize,double ** target_freq,int ** StartMatrix,const double row_prob[],const double col_prob[],double Lambda)760 s_ScoresStdAlphabet(int ** Matrix, int Alphsize,
761                     double ** target_freq, int ** StartMatrix,
762                     const double row_prob[], const double col_prob[],
763                     double Lambda)
764 {
765     /* Note: I'm using a rough convention for this routine that uppercase
766      * letters refer to quantities in the standard (larger) alphabet
767      * and lowercase letters refer to the true amino acid (smaller)
768      * alphabet.
769      */
770     int i;
771     /* row and column probabilities in the NCBIstdaa alphabet */
772     double RowProb[COMPO_LARGEST_ALPHABET];
773     double ColProb[COMPO_LARGEST_ALPHABET];
774     /* A double precision score matrix */
775     double ** Scores = Nlm_DenseMatrixNew(Alphsize, Alphsize);
776     if (Scores == NULL) {
777         return -1;
778     }
779     s_UnpackLetterProbs(RowProb, Alphsize, row_prob);
780     s_SetPairAmbigProbsToSum(RowProb, Alphsize);
781 
782     s_UnpackLetterProbs(ColProb, Alphsize, col_prob);
783     s_SetPairAmbigProbsToSum(ColProb, Alphsize);
784 
785     Blast_TrueAaToStdTargetFreqs(Scores, Alphsize, target_freq);
786     Blast_CalcFreqRatios(Scores, Alphsize, RowProb, ColProb);
787     Blast_FreqRatioToScore(Scores, Alphsize, Alphsize, Lambda);
788     s_SetXUOScores(Scores, Alphsize, RowProb, ColProb);
789 
790     s_RoundScoreMatrix(Matrix, Alphsize, Alphsize, Scores);
791     Nlm_DenseMatrixFree(&Scores);
792 
793     for (i = 0;  i < Alphsize;  i++) {
794         Matrix[i][eStopChar] = StartMatrix[i][eStopChar];
795         Matrix[eStopChar][i] = StartMatrix[eStopChar][i];
796     }
797     return 0;
798 }
799 
800 
801 /**
802  * Find the range of scores contained in an scoring matrix.
803  * @param obs_min    smallest value in the matrix
804  * @param obs_max    largest value in the matrix
805  * @param matrix     a scoring matrix for the ncbistdaa alphabet
806  * @param rows       number of rows in the matrix
807  */
s_GetScoreRange(int * obs_min,int * obs_max,int ** matrix,int rows)808 static void s_GetScoreRange(int * obs_min, int * obs_max,
809                             int ** matrix, int rows)
810 {
811     int aa;                    /* index of an amino-acid in the 20
812                                   letter alphabet */
813     int irow, jcol;            /* matrix row and column indices */
814     int minScore, maxScore;    /* largest and smallest observed scores */
815 
816     minScore = maxScore = 0;
817     for (irow = 0;  irow < rows;  irow++) {
818         for (aa = 0;  aa < COMPO_NUM_TRUE_AA;  aa++) {
819             jcol = trueCharPositions[aa];
820             if (matrix[irow][jcol] < minScore &&
821                 matrix[irow][jcol] > COMPO_SCORE_MIN)
822                 minScore = matrix[irow][jcol];
823             if (matrix[irow][jcol] > maxScore)
824                 maxScore = matrix[irow][jcol];
825         }
826     }
827     *obs_min = minScore;
828     *obs_max = maxScore;
829 }
830 
831 
832 /**
833  * Compute the score probabilities for a given amino acid substitution matrix
834  * in the context of given query and subject amino acid frequencies.
835  *
836  * @param *obs_min          the smallest score in the score matrix [out]
837  * @param *obs_max          the largest score in the score matrix [out]
838  * @param *scoreProb        the new array, of length (*obs_max - *obs_min + 1),
839  *                          of score probabilities, where (*scoreProb)[0] is
840  *                          the probability for score *obs_min.
841  * @param matrix            a amino-acid substitution matrix (not
842  *                          position-specific)
843  * @param alphsize          the size of the alphabet
844  * @param subjectProbArray  is an array containing the probability of
845  *                          occurrence of each residue in the subject
846  * @param queryProbArray    is an array containing the probability of
847  *                          occurrence of each residue in the query
848  * @param scoreProb         is an array of probabilities for each score
849  *                          that is to be used as a field in return_sfp
850  * @return 0 on success, -1 on out-of-memory
851  */
852 static int
s_GetMatrixScoreProbs(double ** scoreProb,int * obs_min,int * obs_max,int ** matrix,int alphsize,const double * subjectProbArray,const double * queryProbArray)853 s_GetMatrixScoreProbs(double **scoreProb, int * obs_min, int * obs_max,
854                       int **matrix, int alphsize,
855                       const double *subjectProbArray,
856                       const double *queryProbArray)
857 {
858     int aa;          /* index of an amino-acid in the 20 letter
859                         alphabet */
860     int irow, jcol;  /* matrix row and column indices */
861     double * sprob;  /* a pointer to the element of the score
862                         probabilities array that represents the
863                         probability of the score 0*/
864     int minScore;    /* smallest score in matrix; the same value as
865                         (*obs_min). */
866     int range;       /* the range of scores in the matrix */
867 
868     s_GetScoreRange(obs_min, obs_max, matrix, alphsize);
869     minScore = *obs_min;
870     range = *obs_max - *obs_min + 1;
871     *scoreProb = calloc(range, sizeof(double));
872     if (*scoreProb == NULL) {
873         return -1;
874     }
875     sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
876     for (irow = 0;  irow < alphsize;  irow++) {
877         for (aa = 0;  aa < COMPO_NUM_TRUE_AA;  aa++) {
878             jcol = trueCharPositions[aa];
879             if (matrix[irow][jcol] >= minScore) {
880                 sprob[matrix[irow][jcol]] +=
881                     (queryProbArray[irow] * subjectProbArray[jcol]);
882             }
883         }
884     }
885     return 0;
886 }
887 
888 
889 /**
890  * Compute the score probabilities for a given amino acid position-specific
891  * substitution matrix in the context of a given set of subject amino
892  * acid frequencies.
893  *
894  * @param *obs_min          the smallest score in the score matrix [out]
895  * @param *obs_max          the largest score in the score matrix [out]
896  * @param *scoreProb        the new array, of length (*obs_max - *obs_min + 1),
897  *                          of score probabilities, where (*scoreProb)[0] is
898  *                          the probability for score *obs_min.
899  * @param matrix            a position-specific amino-acid substitution matrix.
900  * @param rows              the number of rows in matrix.
901  * @param subjectProbArray  is an array containing the probability of
902  *                          occurrence of each residue in the subject
903  * @return 0 on success, -1 on out-of-memory
904  */
905 static int
s_GetPssmScoreProbs(double ** scoreProb,int * obs_min,int * obs_max,int ** matrix,int rows,const double * subjectProbArray)906 s_GetPssmScoreProbs(double ** scoreProb, int * obs_min, int * obs_max,
907                     int **matrix, int rows,
908                     const double *subjectProbArray)
909 {
910     int aa;            /* index of an amino-acid in the 20 letter
911                           alphabet */
912     int irow, jcol;    /* matrix row and column indices */
913     double onePosFrac; /* matrix length as a double*/
914     double * sprob;    /* pointer to the element of the score
915                         * probabilities array the represents the
916                         * probability of zero */
917     int minScore;      /* smallest score in matrix; the same value as
918                           (*obs_min). */
919     int range;         /* the range of scores in the matrix */
920 
921     s_GetScoreRange(obs_min, obs_max, matrix, rows);
922     minScore = *obs_min;
923     range = *obs_max - *obs_min + 1;
924     *scoreProb = calloc(range, sizeof(double));
925     if (*scoreProb == NULL) {
926         return -1;
927     }
928     sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
929     onePosFrac = 1.0/ ((double) rows);
930     for (irow = 0;  irow < rows;  irow++) {
931         for (aa = 0;  aa < COMPO_NUM_TRUE_AA;  aa++) {
932             jcol = trueCharPositions[aa];
933             if (matrix[irow][jcol] >= minScore) {
934                 sprob[matrix[irow][jcol]] +=
935                     onePosFrac * subjectProbArray[jcol];
936             }
937         }
938     }
939     return 0;
940 }
941 
942 
943 /* Documented in composition_adjustment.h. */
944 void
Blast_Int4MatrixFromFreq(int ** matrix,int alphsize,double ** freq,double Lambda)945 Blast_Int4MatrixFromFreq(int **matrix, int alphsize,
946                          double ** freq, double Lambda)
947 {
948     /* A row of the matrix in double precision */
949     double dMatrixStore[COMPO_LARGEST_ALPHABET];
950     double * dMatrix[1];
951     int i;
952 
953     dMatrix[0] = dMatrixStore;
954 
955     for (i = 0;  i < alphsize;  i++) {
956         memcpy(dMatrix[0], freq[i], alphsize * sizeof(double));
957         Blast_FreqRatioToScore(dMatrix, 1, alphsize, Lambda);
958         s_RoundScoreMatrix(&matrix[i], 1, alphsize, dMatrix);
959     }
960 }
961 
962 
963 /* Documented in composition_adjustment.h. */
Blast_MatrixInfoFree(Blast_MatrixInfo ** ss)964 void Blast_MatrixInfoFree(Blast_MatrixInfo ** ss)
965 {
966     if (*ss != NULL) {
967         free((*ss)->matrixName);
968         Nlm_Int4MatrixFree(&(*ss)->startMatrix);
969         Nlm_DenseMatrixFree(&(*ss)->startFreqRatios);
970         free(*ss);
971         *ss = NULL;
972     }
973 }
974 
975 
976 /* Documented in composition_adjustment.h. */
977 Blast_MatrixInfo *
Blast_MatrixInfoNew(int rows,int cols,int positionBased)978 Blast_MatrixInfoNew(int rows, int cols, int positionBased)
979 {
980     int i;       /* loop index */
981     Blast_MatrixInfo * ss = malloc(sizeof(Blast_MatrixInfo));
982     if (ss != NULL) {
983         ss->rows = rows;
984         ss->cols = cols;
985         ss->positionBased = positionBased;
986 
987         ss->matrixName = NULL;
988         ss->startMatrix = NULL;
989         ss->startFreqRatios = NULL;
990 
991         ss->startMatrix  = Nlm_Int4MatrixNew(rows + 1, cols);
992         if (ss->startMatrix == NULL)
993             goto error_return;
994         ss->startFreqRatios = Nlm_DenseMatrixNew(rows + 1, cols);
995 
996         if (ss->startFreqRatios == NULL)
997             goto error_return;
998         for (i = 0;  i < cols;  i++) {
999             ss->startMatrix[rows][i] = COMPO_SCORE_MIN;
1000             ss->startFreqRatios[rows][i] = (double) COMPO_SCORE_MIN;
1001         }
1002 
1003     }
1004     goto normal_return;
1005 error_return:
1006     Blast_MatrixInfoFree(&ss);
1007 normal_return:
1008     return ss;
1009 }
1010 
1011 
1012 /**
1013  * Fill in all scores for a PSSM at a given scale Lambda.
1014  *
1015  * @param matrix       the newly computed matrix [output]
1016  * @param rows         number of positions (rows) in the PSSM
1017  * @param cols         the number of columns in the PSSM; the alphabet size
1018  * @param freq_ratios  frequency ratios defining the PSSM
1019  * @param start_matrix an existing PSSM; used to set values for the
1020  *                     stop character.
1021  * @param col_prob     letter probabilities
1022  * @param Lambda       scale of the new matrix
1023  */
1024 static void
s_ScalePSSM(int ** matrix,int rows,int cols,double ** freq_ratios,int ** start_matrix,const double col_prob[],double Lambda)1025 s_ScalePSSM(int **matrix, int rows, int cols, double ** freq_ratios,
1026             int ** start_matrix, const double col_prob[], double Lambda)
1027 {
1028     int p;          /* index over matrix rows */
1029     /* A row of scores corresponding to one position in the PSSM */
1030     double row[COMPO_LARGEST_ALPHABET];
1031     /* A matrix with one row */
1032     double * row_matrix[1];
1033 
1034     row_matrix[0] = row;
1035 
1036     for (p = 0;  p < rows;  p++) {
1037         double Xscore;
1038         memcpy(row, freq_ratios[p], cols * sizeof(double));
1039 
1040         Blast_FreqRatioToScore(row_matrix, 1, cols, Lambda);
1041         row[eXchar] = Xscore = s_CalcXScore(row, cols, 1, col_prob);
1042         /* use Cysteine score for Selenocysteine */
1043         row[eSelenocysteine] = row[eCchar];
1044         if (cols > eOchar) {
1045             row[eOchar] = Xscore;
1046         }
1047         s_RoundScoreMatrix(&matrix[p], 1, cols, row_matrix);
1048 
1049         matrix[p][eStopChar] = start_matrix[p][eStopChar];
1050     }
1051 }
1052 
1053 
1054 /**
1055  * Fill in all scores for a scoring matrix for the NCBIstdaa alphabet
1056  * at a given scale Lambda.
1057  *
1058  * @param matrix       the newly computed matrix [output]
1059  * @param alphsize     the alphabet size
1060  * @param freq_ratios  frequency ratios defining the PSSM
1061  * @param start_matrix an existing matrix; used to set values for the
1062  *                     stop character.
1063  * @param row_prob     letter probabilities for the sequence
1064  *                     corresponding to the rows of the matrix.
1065  * @param col_prob     letter probabilities for the sequence
1066  *                     corresponding to the columns of the matrix.
1067  * @param Lambda       scale of the new matrix
1068  */
1069 static int
s_ScaleSquareMatrix(int ** matrix,int alphsize,double ** freq_ratios,int ** start_matrix,const double row_prob[],const double col_prob[],double Lambda)1070 s_ScaleSquareMatrix(int **matrix, int alphsize,
1071                     double ** freq_ratios, int ** start_matrix,
1072                     const double row_prob[], const double col_prob[],
1073                     double Lambda)
1074 {
1075     double ** scores;     /* a double precision matrix of scores */
1076     int i;                /* iteration index */
1077 
1078     scores = Nlm_DenseMatrixNew(alphsize, alphsize);
1079     if (scores == 0) return -1;
1080 
1081     for (i = 0;  i < alphsize;  i++) {
1082         memcpy(scores[i], freq_ratios[i], alphsize * sizeof(double));
1083     }
1084     Blast_FreqRatioToScore(scores, alphsize, alphsize, Lambda);
1085     s_SetXUOScores(scores, alphsize, row_prob, col_prob);
1086     s_RoundScoreMatrix(matrix, alphsize, alphsize, scores);
1087     for (i = 0;  i < alphsize;  i++) {
1088         matrix[i][eStopChar] = start_matrix[i][eStopChar];
1089         matrix[eStopChar][i] = start_matrix[eStopChar][i];
1090     }
1091     Nlm_DenseMatrixFree(&scores);
1092 
1093     return 0;
1094 }
1095 
1096 
1097 /** LambdaRatioLowerBound is used when the expected score is too large
1098  * causing impalaKarlinLambdaNR to give a Lambda estimate that
1099  * is too small, or to fail entirely returning -1*/
1100 #define LambdaRatioLowerBound 0.5
1101 
1102 
1103 /* Documented in composition_adjustment.h. */
1104 int
Blast_CompositionBasedStats(int ** matrix,double * LambdaRatio,const Blast_MatrixInfo * ss,const double queryProb[],const double resProb[],double (* calc_lambda)(double *,int,int,double),int pValueAdjustment)1105 Blast_CompositionBasedStats(int ** matrix, double * LambdaRatio,
1106                             const Blast_MatrixInfo * ss,
1107                             const double queryProb[], const double resProb[],
1108                             double (*calc_lambda)(double*,int,int,double),
1109                             int pValueAdjustment)
1110 {
1111     double correctUngappedLambda; /* new value of ungapped lambda */
1112     int obs_min, obs_max;         /* smallest and largest score in the
1113                                      unscaled matrix */
1114     double *scoreArray;           /* an array of score probabilities */
1115     int out_of_memory;            /* status flag to indicate out of memory */
1116 
1117     if (ss->positionBased) {
1118         out_of_memory =
1119             s_GetPssmScoreProbs(&scoreArray, &obs_min, &obs_max,
1120                                 ss->startMatrix, ss->rows, resProb);
1121     } else {
1122         out_of_memory =
1123             s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max,
1124                                   ss->startMatrix, ss->cols,
1125                                   resProb, queryProb);
1126     }
1127     if (out_of_memory)
1128         return -1;
1129     correctUngappedLambda =
1130         calc_lambda(scoreArray, obs_min, obs_max, ss->ungappedLambda);
1131 
1132     /* calc_lambda will return -1 in the case where the
1133      * expected score is >=0; however, because of the MAX statement 3
1134      * lines below, LambdaRatio should always be > 0; the succeeding
1135      * test is retained as a vestige, in case one wishes to remove the
1136      * MAX statement and allow LambdaRatio to take on the error value
1137      * -1 */
1138     *LambdaRatio = correctUngappedLambda / ss->ungappedLambda;
1139     if (0 == pValueAdjustment)
1140       *LambdaRatio = MIN(1, *LambdaRatio);
1141     *LambdaRatio = MAX(*LambdaRatio, LambdaRatioLowerBound);
1142 
1143     if (*LambdaRatio > 0) {
1144         double scaledLambda = ss->ungappedLambda/(*LambdaRatio);
1145         if (ss->positionBased) {
1146             s_ScalePSSM(matrix, ss->rows, ss->cols, ss->startFreqRatios,
1147                         ss->startMatrix, resProb, scaledLambda);
1148         } else {
1149             s_ScaleSquareMatrix(matrix, ss->cols,
1150                                 ss->startFreqRatios, ss->startMatrix,
1151                                 queryProb, resProb, scaledLambda);
1152         }
1153     }
1154     free(scoreArray);
1155 
1156     return 0;
1157 }
1158 
1159 
1160 /* Documented in composition_adjustment.h. */
1161 void
Blast_ReadAaComposition(Blast_AminoAcidComposition * composition,int alphsize,const Uint1 * sequence,int length)1162 Blast_ReadAaComposition(Blast_AminoAcidComposition * composition,
1163                         int alphsize,
1164                         const Uint1 * sequence, int length)
1165 {
1166     int i; /* iteration index */
1167 
1168     /* fields of composition as local variables */
1169     int numTrueAminoAcids = 0;
1170     double * prob = composition->prob;
1171 
1172     for (i = 0;  i < alphsize;  i++) {
1173         prob[i] = 0.0;
1174     }
1175     for (i = 0;  i < length;  i++) {
1176         if (alphaConvert[sequence[i]] >= 0 || sequence[i] == eSelenocysteine) {
1177             prob[sequence[i]]++;
1178             numTrueAminoAcids++;
1179         }
1180     }
1181 
1182     /* Count Selenocysteine (U) as if it was Cysteine (C) to avoid a result
1183        where C aligned to U is scored higher than C aligned to C. Otherwise,
1184        because U is not counted as a true amino acid, sequences with U would
1185        appear more complex than those composed only of the 20 "true" amino
1186        acids. Therefore computationally adjusted alignment score would be
1187        larger for sequences containing U than for sequences with C. */
1188     if (prob[eSelenocysteine] > 0.0) {
1189         prob[eCchar] += prob[eSelenocysteine];
1190         prob[eSelenocysteine] = 0.0;
1191     }
1192 
1193     composition->numTrueAminoAcids = numTrueAminoAcids;
1194     if (numTrueAminoAcids > 0) {
1195         for (i = 0;  i < alphsize;  i++) {
1196             prob[i] /= numTrueAminoAcids;
1197         }
1198     }
1199 }
1200 
1201 
1202 /* Documented in composition_adjustment.h. */
1203 void
Blast_GetCompositionRange(int * pleft,int * pright,const Uint1 * subject_data,int length,int start,int finish)1204 Blast_GetCompositionRange(int * pleft, int * pright,
1205                           const Uint1 * subject_data, int length,
1206                           int start, int finish)
1207 {
1208     int i;                /* iteration index */
1209     int left, right;
1210 
1211     left = start;
1212     /* Search leftward for a StopChar */
1213     for (i = left;  i > 0;  i--) {
1214         if (subject_data[i - 1] == eStopChar) {
1215             /* We have found a StopChar. Unless the StopChar is
1216              * too close to the start of the subject region of the
1217              * HSP, */
1218             if (i + kCompositionMargin < left) {
1219                 /* reset the left endpoint. */
1220                 left = i + kCompositionMargin;
1221             }
1222             break;
1223         }
1224     }
1225     if (i == 0) {
1226         /* No stop codon was found to the left. */
1227         left = 0;
1228     }
1229     right = finish;
1230     /* Search rightward for a StopChar */
1231     for (i = right;  i < length;  i++) {
1232         if (subject_data[i] == eStopChar) {
1233             /* We have found a StopChar. Unless the StopChar is
1234              * too close to the end of the subject region of the
1235              * HSP, */
1236             if (i - kCompositionMargin > right) {
1237                 /* reset the right endpoint */
1238                 right = i - kCompositionMargin;
1239             }
1240             break;
1241         }
1242     }
1243     if (i == length) {
1244         /* No stop codon was found to the right. */
1245         right = length;
1246     }
1247     *pleft = left; *pright = right;
1248 }
1249 
1250 
1251 /* Documented in composition_adjustment.h. */
1252 void
Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** pNRrecord)1253 Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** pNRrecord)
1254 {
1255     Blast_CompositionWorkspace * NRrecord = *pNRrecord;
1256 
1257     if (NRrecord != NULL) {
1258         free(NRrecord->first_standard_freq);
1259         free(NRrecord->second_standard_freq);
1260 
1261         Nlm_DenseMatrixFree(&NRrecord->mat_final);
1262         Nlm_DenseMatrixFree(&NRrecord->mat_b);
1263 
1264         free(NRrecord);
1265     }
1266     *pNRrecord = NULL;
1267 }
1268 
1269 
1270 /* Documented in composition_adjustment.h. */
Blast_CompositionWorkspaceNew(void)1271 Blast_CompositionWorkspace * Blast_CompositionWorkspaceNew(void)
1272 {
1273     Blast_CompositionWorkspace * NRrecord;        /* record to allocate
1274                                                     and return */
1275     int i;                     /* loop index */
1276 
1277     NRrecord = (Blast_CompositionWorkspace *)
1278         malloc(sizeof(Blast_CompositionWorkspace));
1279     if (NRrecord == NULL) goto error_return;
1280 
1281     NRrecord->first_standard_freq      = NULL;
1282     NRrecord->second_standard_freq     = NULL;
1283     NRrecord->mat_final                = NULL;
1284     NRrecord->mat_b                    = NULL;
1285 
1286     NRrecord->first_standard_freq =
1287         (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
1288     if (NRrecord->first_standard_freq == NULL) goto error_return;
1289 
1290     NRrecord->second_standard_freq =
1291         (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
1292     if (NRrecord->second_standard_freq == NULL) goto error_return;
1293 
1294     NRrecord->mat_final   = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
1295                                                COMPO_NUM_TRUE_AA);
1296     if (NRrecord->mat_final == NULL) goto error_return;
1297 
1298     NRrecord->mat_b       = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
1299                                                COMPO_NUM_TRUE_AA);
1300     if (NRrecord->mat_b == NULL) goto error_return;
1301 
1302     for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
1303         NRrecord->first_standard_freq[i] =
1304             NRrecord->second_standard_freq[i] = 0.0;
1305     }
1306 
1307     goto normal_return;
1308 error_return:
1309     Blast_CompositionWorkspaceFree(&NRrecord);
1310 normal_return:
1311     return NRrecord;
1312 }
1313 
1314 
1315 /* Documented in composition_adjustment.h. */
1316 int
Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord,const char * matrixName)1317 Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord,
1318                                const char *matrixName)
1319 {
1320     if (0 == Blast_GetJointProbsForMatrix(NRrecord->mat_b,
1321                                           NRrecord->first_standard_freq,
1322                                           NRrecord->second_standard_freq,
1323                                           matrixName)) {
1324         return 0;
1325     } else {
1326         fprintf(stderr,
1327                 "Matrix %s not currently supported for RE based adjustment\n",
1328                 matrixName);
1329         return -1;
1330     }
1331 }
1332 
1333 
1334 /* Documented in composition_adjustment.h. */
1335 int
Blast_CompositionMatrixAdj(int ** matrix,int alphsize,EMatrixAdjustRule matrix_adjust_rule,int length1,int length2,const double * stdaa_row_probs,const double * stdaa_col_probs,int pseudocounts,double specifiedRE,Blast_CompositionWorkspace * NRrecord,const Blast_MatrixInfo * matrixInfo)1336 Blast_CompositionMatrixAdj(int ** matrix,
1337                            int alphsize,
1338                            EMatrixAdjustRule matrix_adjust_rule,
1339                            int length1,
1340                            int length2,
1341                            const double * stdaa_row_probs,
1342                            const double * stdaa_col_probs,
1343                            int pseudocounts,
1344                            double specifiedRE,
1345                            Blast_CompositionWorkspace * NRrecord,
1346                            const Blast_MatrixInfo * matrixInfo)
1347 {
1348     int iteration_count, status;
1349     double row_probs[COMPO_NUM_TRUE_AA], col_probs[COMPO_NUM_TRUE_AA];
1350     /* Target RE when optimizing the matrix; zero if the relative
1351        entropy should not be constrained. */
1352     double dummy, desired_re = 0.0;
1353 
1354     s_GatherLetterProbs(row_probs, stdaa_row_probs, alphsize);
1355     s_GatherLetterProbs(col_probs, stdaa_col_probs, alphsize);
1356 
1357     switch (matrix_adjust_rule) {
1358     case eUnconstrainedRelEntropy:
1359         desired_re = 0.0;
1360         break;
1361     case eRelEntropyOldMatrixNewContext:
1362         /* Calculate the desired re using the new marginal probs with
1363            the old matrix */
1364         status = Blast_EntropyOldFreqNewContext(&desired_re, &dummy,
1365                                                 &iteration_count,
1366                                                 NRrecord->mat_b,
1367                                                 row_probs, col_probs);
1368         if (status < 0)     /* Error, e.g. memory */
1369             return status;
1370         else if (status > 0) /* we could not calculate the desired re */
1371             desired_re = 0.0; /* so, leave the re unconstrained */
1372 
1373         break;
1374     case eRelEntropyOldMatrixOldContext:
1375         desired_re = Blast_TargetFreqEntropy(NRrecord->mat_b);
1376         break;
1377     case eUserSpecifiedRelEntropy:
1378         desired_re = specifiedRE;
1379         break;
1380     default:  /* I assert that we can't get here */
1381         fprintf(stderr, "Unknown flag for setting relative entropy"
1382                 "in composition matrix adjustment");
1383         exit(1);
1384     }
1385     Blast_ApplyPseudocounts(row_probs, length1,
1386                             NRrecord->first_standard_freq, pseudocounts);
1387     Blast_ApplyPseudocounts(col_probs, length2,
1388                             NRrecord->second_standard_freq, pseudocounts);
1389 
1390     status =
1391         Blast_OptimizeTargetFrequencies(&NRrecord->mat_final[0][0],
1392                                         COMPO_NUM_TRUE_AA,
1393                                         &iteration_count,
1394                                         &NRrecord->mat_b[0][0],
1395                                         row_probs, col_probs,
1396                                         (desired_re > 0.0),
1397                                         desired_re,
1398                                         kCompoAdjustErrTolerance,
1399                                         kCompoAdjustIterationLimit);
1400 
1401     if (status != 0)            /* Did not compute the target freqs */
1402         return status;
1403 
1404     return
1405         s_ScoresStdAlphabet(matrix, alphsize, NRrecord->mat_final,
1406                             matrixInfo->startMatrix,
1407                             row_probs, col_probs,
1408                             matrixInfo->ungappedLambda);
1409 }
1410 
1411 
1412 /* Documented in composition_adjustment.h. */
1413 int
Blast_AdjustScores(int ** matrix,const Blast_AminoAcidComposition * query_composition,int queryLength,const Blast_AminoAcidComposition * subject_composition,int subjectLength,const Blast_MatrixInfo * matrixInfo,ECompoAdjustModes composition_adjust_mode,int RE_pseudocounts,Blast_CompositionWorkspace * NRrecord,EMatrixAdjustRule * matrix_adjust_rule,double calc_lambda (double *,int,int,double),double * pvalueForThisPair,int compositionTestIndex,double * ratioToPassBack)1414 Blast_AdjustScores(int ** matrix,
1415                    const Blast_AminoAcidComposition * query_composition,
1416                    int queryLength,
1417                    const Blast_AminoAcidComposition * subject_composition,
1418                    int subjectLength,
1419                    const Blast_MatrixInfo * matrixInfo,
1420                    ECompoAdjustModes composition_adjust_mode,
1421                    int RE_pseudocounts,
1422                    Blast_CompositionWorkspace *NRrecord,
1423                    EMatrixAdjustRule *matrix_adjust_rule,
1424                    double calc_lambda(double *,int,int,double),
1425                    double *pvalueForThisPair,
1426                    int compositionTestIndex,
1427                    double *ratioToPassBack)
1428 {
1429     const int alphsize = matrixInfo->cols;
1430 
1431     double lambdaForPair;     /*lambda for this pair of compositions*/
1432     int iter_count; /*used as argument to Blast_CalcLambdaFullPrecision*/
1433 
1434     /* The next two arrays are letter probabilities of query and
1435      * match in 20 letter ARND... alphabet. */
1436     double permutedQueryProbs[COMPO_NUM_TRUE_AA];
1437     double permutedMatchProbs[COMPO_NUM_TRUE_AA];
1438 
1439     if (query_composition->numTrueAminoAcids == 0 ||
1440         subject_composition->numTrueAminoAcids == 0) {
1441         /* Either the query or subject contains only ambiguity
1442            characters, most likely because the entire subject has been
1443            SEGed.  Compositional adjustment is meaningless. */
1444         return 1;
1445     }
1446 
1447     if ((compositionTestIndex > 0) ||
1448         ((!(matrixInfo->positionBased)) &&
1449          (composition_adjust_mode != eCompositionBasedStats))) {
1450         s_GatherLetterProbs(permutedQueryProbs,
1451                             query_composition->prob, alphsize);
1452         s_GatherLetterProbs(permutedMatchProbs,
1453                             subject_composition->prob, alphsize);
1454     }
1455 
1456     if (compositionTestIndex > 0) {
1457         int i,j; /*loop indices*/
1458         /* a score matrix to pass*/
1459         double **scores = Nlm_DenseMatrixNew(alphsize, alphsize);
1460 
1461         if (scores == NULL) {
1462             return -1;
1463         }
1464         for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++)
1465             for (j = 0;  j < COMPO_NUM_TRUE_AA; j++)
1466                 scores[i][j] = BLOS62[i][j];
1467         Blast_CalcLambdaFullPrecision(&lambdaForPair, &iter_count,
1468                                       scores,
1469                                       COMPO_NUM_TRUE_AA,
1470                                       permutedQueryProbs,
1471                                       permutedMatchProbs,
1472                                       kLambdaErrorTolerance,
1473                                       kLambdaFunctionTolerance,
1474                                       kLambdaIterationLimit);
1475         if (iter_count >= kLambdaIterationLimit) {
1476             /* The algorithm for lambda didn't converge, likely
1477              * because the matrix has positive expected score;
1478              * set lambda to the smallest value in the table. */
1479             lambdaForPair = COMPO_MIN_LAMBDA;
1480         }
1481         /*use lengths of query and subject not counting X's */
1482         *pvalueForThisPair = Blast_CompositionPvalue(lambdaForPair);
1483 
1484         Nlm_DenseMatrixFree(&scores);
1485     }
1486 
1487     if (matrixInfo->positionBased ||
1488         composition_adjust_mode == eCompositionBasedStats) {
1489         /* Use old-style composition-based statistics unconditionally. */
1490         *matrix_adjust_rule =  eCompoScaleOldMatrix;
1491     } else {
1492         /* else call Yi-Kuo's code to choose mode for matrix adjustment. */
1493         *matrix_adjust_rule =
1494             Blast_ChooseMatrixAdjustRule(queryLength, subjectLength,
1495                                          permutedQueryProbs,
1496                                          permutedMatchProbs,
1497                                          matrixInfo->matrixName,
1498                                          composition_adjust_mode);
1499     }  /* end else call Yi-Kuo's code to choose mode for matrix adjustment. */
1500 
1501     if (eCompoScaleOldMatrix != *matrix_adjust_rule) {
1502         /* Try matrix optimization, if it fails to converge, we
1503            fall back to traditional scaling below */
1504         int status =
1505             Blast_CompositionMatrixAdj(matrix,
1506                                        alphsize,
1507                                        *matrix_adjust_rule,
1508                                        query_composition->
1509                                        numTrueAminoAcids,
1510                                        subject_composition->
1511                                        numTrueAminoAcids,
1512                                        query_composition->prob,
1513                                        subject_composition->prob,
1514                                        RE_pseudocounts,
1515                                        kFixedReBlosum62,
1516                                        NRrecord,
1517                                        matrixInfo);
1518 
1519         *ratioToPassBack = 1.0;    /* meaningless for this mode */
1520         if (status <= 0)
1521             return status;      /* Success (=0) or fatal error (<0)*/
1522 
1523     } /* End try matrix optimization */
1524 
1525     *matrix_adjust_rule = eCompoScaleOldMatrix;
1526     return Blast_CompositionBasedStats(matrix, ratioToPassBack, matrixInfo,
1527                                        query_composition->prob,
1528                                        subject_composition->prob,
1529                                        calc_lambda,
1530                                        (compositionTestIndex > 0));
1531 }
1532