1 static char const rcsid[] = "$Id: blastkar.c,v 6.115 2006/12/14 17:08:19 madden Exp $";
2 
3 /* ===========================================================================
4 *
5 *                            PUBLIC DOMAIN NOTICE
6 *               National Center for Biotechnology Information
7 *
8 *  This software/database is a "United States Government Work" under the
9 *  terms of the United States Copyright Act.  It was written as part of
10 *  the author's official duties as a United States Government employee and
11 *  thus cannot be copyrighted.  This software/database is freely available
12 *  to the public for use. The National Library of Medicine and the U.S.
13 *  Government have not placed any restriction on its use or reproduction.
14 *
15 *  Although all reasonable efforts have been taken to ensure the accuracy
16 *  and reliability of the software and data, the NLM and the U.S.
17 *  Government do not and cannot warrant the performance or results that
18 *  may be obtained by using this software or data. The NLM and the U.S.
19 *  Government disclaim all warranties, express or implied, including
20 *  warranties of performance, merchantability or fitness for any particular
21 *  purpose.
22 *
23 *  Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26 /*****************************************************************************
27 
28 File name: blastkar.c
29 
30 Author: Tom Madden
31 
32 Contents: Functions to calculate BLAST probabilities etc.
33 
34 Detailed Contents:
35 
36 	- allocate and deallocate structures used by BLAST to calculate
37 	probabilities etc.
38 
39 	- calculate residue frequencies for query and "average" database.
40 
41 	- read in matrix.
42 
43         - calculate sum-p from a collection of HSP's, for both the case
44 	of a "small" gap and a "large" gap, when give a total score and the
45 	number of HSP's.
46 
47 	- calculate expect values for p-values.
48 
49 	- calculate pseuod-scores from p-values.
50 
51 ******************************************************************************
52  * $Revision: 6.115 $
53  * $Log: blastkar.c,v $
54  * Revision 6.115  2006/12/14 17:08:19  madden
55  * Fix BLAST_MatrixDestruct to not use hard-coded alphabet size of 26 (from Mike Gertz)
56  *
57  * Revision 6.114  2006/10/02 12:36:01  madden
58  * Comment out BLOSUM62_20
59  *
60  * Revision 6.113  2006/04/07 13:46:24  madden
61  * Improved the comment for NlmKarlinLambdaNR.  Reformatted the
62  * function prototype to fit in 80 characters. (from Mike Gertz).
63  *
64  * Revision 6.112  2005/10/31 14:16:10  madden
65  * Add support for reward/penalty of 1/-5, 3/-4, and 3/-2
66  *
67  * Revision 6.111  2005/10/14 16:32:34  madden
68  * Add preliminary support for vecscreen parameters
69  *
70  * Revision 6.110  2005/10/12 19:21:03  madden
71  * Fix bug in s_GetNuclValuesArray
72  *
73  * Revision 6.109  2005/10/06 12:51:39  madden
74  * Add code to produce correct statistics for gapped blastn, changes include:
75  *
76  * 1.) series of array_of_8 structs with precalculated data
77  * 2.) function s_GetNuclValuesArray
78  * 3.) function BlastKarlinGetNuclAlphaBeta
79  * 4.) function BlastKarlinBlkNuclGappedCalc
80  *
81  * Revision 6.108  2005/08/09 14:14:46  dondosha
82  * From A. Shaffer: added comments to clarify usage of BlastKarlinEtoP and BlastKarlinPtoE
83  *
84  * Revision 6.107  2005/07/28 14:57:09  coulouri
85  * remove dead code
86  *
87  * Revision 6.106  2005/07/27 17:48:57  coulouri
88  * remove hardcoded paths
89  *
90  * Revision 6.105  2005/04/27 17:20:45  papadopo
91  * copy X scores to U scores when building score matrix
92  *
93  * Revision 6.104  2005/04/20 19:02:15  lavr
94  * +<assert.h>
95  *
96  * Revision 6.103  2005/01/31 17:02:03  dondosha
97  * Fixed bug in BlastKarlinLHtoK for blastn with penalty == -reward
98  *
99  * Revision 6.102  2004/09/28 16:04:19  papadopo
100  * From Michael Gertz:
101  * 1. Pass the effective database size into BlastSmallGapSumE,
102  *     BlastLargeGapSumE and BlastUnevenGapSumE.  The routines use this
103  *     value in a simplified formula to compute the e-value of singleton sets.
104  * 2. Caused all routines for calculating the significance of multiple
105  *     distinct alignments (BlastSmallGapSumE, BlastLargeGapSumE and
106  *     BlastUnevenGapSumE) to use
107  *
108  *        sum_{i in linked_set} (\lambda_i s_i - \ln K_i)
109  *
110  *     as the weighted sum score, where (\lambda_i, K_i) are taken from
111  *     the appropriate query context.
112  *
113  * Revision 6.101  2004/06/07 20:03:23  coulouri
114  * use floating point constants for comparisons with floating point variables
115  *
116  * Revision 6.100  2004/04/28 14:36:00  madden
117  * Changes from Mike Gertz:
118  * - I created the new routine BlastGapDecayDivisor that computes a
119  *   divisor used to weight the evalue of a collection of distinct
120  *   alignments.
121  * - I removed  BlastGapDecay and BlastGapDecayInverse which had become
122  *   redundant.
123  * - I modified the BlastCutoffs routine so that it uses the value
124  *   returned by BlastGapDecayDivisor to weight evalues.
125  * - I modified BlastSmallGapSumE, BlastLargeGapSumE and
126  *   BlastUnevenGapSumE no longer refer to the gap_prob parameter.
127  *   Replaced the gap_decay_rate parameter of each of these routines with
128  *   a weight_divisor parameter.  Added documentation.
129  *
130  * Revision 6.99  2004/04/23 13:49:43  madden
131  * Cleaned up ifndef in BlastKarlinLHtoK
132  *
133  * Revision 6.98  2004/04/23 13:19:53  madden
134  * Rewrote BlastKarlinLHtoK to do the following and more:
135  * 1. fix a bug whereby the wrong formula was used when high score == 1
136  *    and low score == -1;
137  * 2. fix a methodological error of truncating the first sum
138  *    and trying to make it converge quickly by adding terms
139  *    of a geometric progression, even though the geometric progression
140  *    estimate is not correct in all cases;
141  *    the old adjustment code is left in for historical purposes but
142  *    #ifdef'd out
143  * 3. Eliminate the Boolean bi_modal_score variable.  The old test that
144  *    set the value of bi_modal_score would frequently fail to choose the
145  *    correct value due to rounding error.
146  * 4. changed numerous local variable names to make them more meaningful;
147  * 5. added substantial comments to explain what the procedure
148  *    is doing and what each variable represents
149  *
150  * Revision 6.97  2004/04/01 13:43:08  lavr
151  * Spell "occurred", "occurrence", and "occurring"
152  *
153  * Revision 6.96  2004/03/31 17:58:51  papadopo
154  * Mike Gertz' changes for length adjustment calculations
155  *
156  * Revision 6.95  2003/12/12 16:00:34  madden
157  * Add gap_decay_rate to BlastCutoffs, remove BlastCutoffs_simple, protection against overflow, removal of defunct _real variables (all from Mike Gertz)
158  *
159  * Revision 6.94  2003/11/30 03:36:38  camacho
160  * Fix compilation error
161  *
162  * Revision 6.93  2003/11/28 22:39:40  camacho
163  * +static keyword to BlastKarlinLtoH
164  *
165  * Revision 6.92  2003/11/28 15:16:38  camacho
166  * Combine newkar.c's contents with blastkar.c
167  *
168  * Revision 6.91  2003/11/26 19:08:10  madden
169  * simplified BlastKarlinLtoH and BlastKarlinLHtoK and provided better protection against overflow, new function NlmKarlinLambdaNR (all from Mike Gertz)
170  *
171  *
172  * Revision 6.90  2003/06/30 20:01:32  dondosha
173  * Correction in logic of finding matrices by BLASTMAT environment variable
174  *
175  * Revision 6.89  2003/05/30 17:25:36  coulouri
176  * add rcsid
177  *
178  * Revision 6.88  2003/05/06 18:54:02  dondosha
179  * Set all gap/residue scores in blastn matrix to INT4_MIN/2
180  *
181  * Revision 6.87  2003/03/07 22:33:25  bealer
182  * - Fix UMR when alphabet_type is not set.
183  *
184  * Revision 6.86  2003/02/27 19:07:56  madden
185  * Add functions PrintMatrixMessage and PrintAllowedValuesMessage
186  *
187  * Revision 6.85  2003/02/26 18:23:49  madden
188  * Add functions BlastKarlinkGapBlkFill and BlastKarlinReportAllowedValues, call from BlastKarlinBlkGappedCalcEx
189  *
190  * Revision 6.84  2002/10/24 22:52:14  dondosha
191  * When checking config file for matrices path, allow aa or nt subdirectories too
192  *
193  * Revision 6.83  2002/07/22 20:10:12  dondosha
194  * Correction: previous change did not work for proteins
195  *
196  * Revision 6.82  2002/07/19 18:34:58  dondosha
197  * Ignore bits higher than 4 when computing frequencies - needed for megablast
198  *
199  * Revision 6.81  2002/05/17 20:30:37  madden
200  * Add comments on adding new matrix values
201  *
202  * Revision 6.80  2002/04/09 18:44:19  madden
203  * Do not return if status of BlastScoreBlkMatCreate is non-zero
204  *
205  * Revision 6.79  2002/02/26 22:25:21  dondosha
206  * Return error as soon as it is found that matrix name is not supported
207  *
208  * Revision 6.78  2001/12/13 14:30:49  madden
209  * Add BLASTKAR_SMALL_FLOAT to prevent floating point exception for very small floats
210  *
211  * Revision 6.77  2001/09/05 20:32:21  dondosha
212  * Fixed uninitialized variable bug
213  *
214  * Revision 6.76  2001/08/23 21:19:05  dondosha
215  * Improvements for lambda and K computation when all scores are multiples of a common factor
216  *
217  * Revision 6.75  2001/02/20 18:31:28  egorov
218  * Added protection agains freeing zero pointer
219  *
220  * Revision 6.74  2001/01/29 16:11:34  madden
221  * Added BLOSUM80 values for 25,2
222  *
223  * Revision 6.73  2000/12/28 16:23:24  madden
224  * Function getAlphaBeta from AS
225  *
226  * Revision 6.72  2000/12/26 17:46:20  madden
227  * Add function BlastKarlinGetMatrixValuesEx2 to return alpha and beta
228  *
229  * Revision 6.71  2000/11/27 16:04:34  dondosha
230  * Check if original_matrix not NULL before destructing it
231  *
232  * Revision 6.70  2000/11/24 22:07:32  shavirin
233  * Added new function BlastResFreqFree().
234  *
235  * Revision 6.69  2000/11/24 21:44:21  shavirin
236  * Fixed memory leak in the function BLAST_MatrixDestruct() in case of
237  * PSI Blast.
238  *
239  * Revision 6.68  2000/11/22 15:32:39  madden
240  * Remove unneeded line
241  *
242  * Revision 6.67  2000/11/21 19:06:03  madden
243  * Set best value for blosum62_20
244  *
245  * Revision 6.66  2000/11/21 18:45:56  madden
246  * New parameter values from S. Altschul for matrices
247  *
248  * Revision 6.65  2000/11/03 17:15:13  madden
249  * Add 13,2 for blosum80
250  *
251  * Revision 6.64  2000/10/25 16:39:46  madden
252  * Add protection in BlastMatrixToTxMatrix for NULL matrix
253  *
254  * Revision 6.63  2000/10/23 15:52:18  dondosha
255  * Bug in previous revision: INT4_MIN is too small to be a matrix value
256  *
257  * Revision 6.62  2000/10/20 21:51:09  dondosha
258  * Set matrix[15][] values to INT4_MIN for blastn to prevent crossing boundary between strands
259  *
260  * Revision 6.61  2000/09/27 21:28:40  dondosha
261  * Copy square matrix in BLAST_MatrixFill to the original_matrix member of BLAST_Matrix
262  *
263  * Revision 6.60  2000/09/27 19:29:04  dondosha
264  * Check boolean parameter in BLAST_MatrixFill to use position based matrix
265  *
266  * Revision 6.59  2000/09/15 18:48:16  madden
267  * Added new blosum62_20 values
268  *
269  * Revision 6.58  2000/09/13 16:16:52  dondosha
270  * Added LIBCALL to TxMatrix functions declarations
271  *
272  * Revision 6.57  2000/09/13 15:50:28  dondosha
273  * Use SeqMapTable functions for conversion of search matrix to txalign-style matrix
274  *
275  * Revision 6.56  2000/09/12 16:03:50  dondosha
276  * Added functions to create and destroy matrix used in txalign
277  *
278  * Revision 6.55  2000/09/11 20:49:32  madden
279  * New values for BLOSUM62_20 from Stephen Altschul
280  *
281  * Revision 6.54  2000/08/31 15:45:07  dondosha
282  * Added function BlastUnevenGapSumE for sum evalue computation with different gap size on two sequences
283  *
284  * Revision 6.53  2000/08/28 13:38:31  shavirin
285  * Fixed bug in the function BLAST_MatrixFill() detected by Alejandro.
286  *
287  * Revision 6.52  2000/08/23 18:50:01  madden
288  * Changes to support decline-to-align checking
289  *
290  * Revision 6.51  2000/08/22 12:58:55  madden
291  * Add new BLOSUM62_20 values
292  *
293  * Revision 6.50  2000/08/04 15:49:25  sicotte
294  * added BlastScoreBlkMatCreateEx(reward,penalty) and BlastKarlinGetDefaultMatrixValues as external functions
295  *
296  * Revision 6.49  2000/08/04 15:13:29  dondosha
297  * Initialize posFreqs to NULL in BLAST_MatrixFill
298  *
299  * Revision 6.48  2000/08/04 14:43:58  shavirin
300  * Changed values for data corresponding to BLOSUM62_20A and
301  * BLOSUM62_20B matrixes.
302  *
303  * Revision 6.47  2000/08/03 22:23:13  shavirin
304  * Added initialization of the posFreqs array in the function BLAST_MatrixFill()
305  *
306  * Revision 6.46  2000/07/24 18:46:37  shavirin
307  * Added stat parameters for the matrixes BLOSUM62_20a and BLOSUM62_20b
308  *
309  * Revision 6.45  2000/07/24 17:32:42  shavirin
310  * Fixed values for size of matrix in the function BlastScoreBlkMaxScoreSet()
311  *
312  * Revision 6.44  2000/07/20 20:46:30  madden
313  * New values for blosum62_20
314  *
315  * Revision 6.43  2000/06/12 21:37:33  shavirin
316  * Adjusted blosum62_values{} and blosum80_values{}.
317  *
318  * Revision 6.42  2000/06/02 16:20:54  shavirin
319  * Fixed minor bug in function BLAST_MatrixFill
320  *
321  * Revision 6.41  2000/05/26 17:29:54  shavirin
322  * Added array of pos frequencies into BLAST_Matrix structure and it's
323  * handling.
324  *
325  * Revision 6.40  2000/04/17 20:41:37  madden
326  * Added BLAST_MatrixFetch
327  *
328  * Revision 6.39  2000/03/27 20:28:19  shavirin
329  * Added mulithred-safe protection to the function BlastScoreBlkMatRead()
330  *
331  * Revision 6.38  2000/02/24 16:39:03  shavirin
332  * Added check for existence of matrix file in BlastScoreBlkMatFill().
333  *
334  * Revision 6.37  2000/01/11 21:23:13  shavirin
335  * Increased size of index from Int2 to Int4 in BlastPSIMaxScoreGet() function
336  *
337  * Revision 6.36  1999/12/22 21:06:34  shavirin
338  * Added new function BlastPSIMaxScoreGet().
339  *
340  * Revision 6.35  1999/12/16 19:17:22  egorov
341  * Code cleanup
342  *
343  * Revision 6.34  1999/11/29 14:17:46  egorov
344  * Use LnFactorial instead of log(Factorial)
345  *
346  * Revision 6.33  1999/11/23 21:38:40  egorov
347  * Nlm_Factorial(num) causes overflow if num>170;  return DBL_MAX if num>170.
348  * It does not influence program's logic because this large number of HSPs is bogus anyway.
349  *
350  * Revision 6.32  1999/08/20 14:42:17  madden
351  * Changed Robinson frequencies per Stephen A. request
352  *
353  * Revision 6.31  1999/08/05 13:13:34  madden
354  * Improved help messages for permitted matrices and gap values
355  *
356  * Revision 6.30  1999/07/30 13:25:51  shavirin
357  * Fixed bug in the function BLAST_MatrixFill which created wrong matrix size.
358  *
359  * Revision 6.29  1998/12/31 18:17:04  madden
360  * Added strand option
361  *
362  * Revision 6.28  1998/09/28 12:28:50  madden
363  * Protection for a DEC Alpha problem with HUGE_VAL
364  *
365  * Revision 6.27  1998/09/24 15:26:36  egorov
366  * Fix lint complaints
367  *
368  * Revision 6.26  1998/08/11 12:57:23  madden
369  * Changed importance of BlastKarlinBlkGappedCalc error message
370  *
371  * Revision 6.25  1998/07/17 15:39:57  madden
372  * Changes for Effective search space.
373  *
374  * Revision 6.24  1998/06/12 15:52:50  madden
375  * Fixed warnings
376  *
377  * Revision 6.23  1998/06/02 21:21:20  madden
378  * Changes for DNA matrices
379  *
380  * Revision 6.22  1998/05/03 17:56:45  madden
381  * Fixed memory leaks
382  *
383  * Revision 6.21  1998/04/24 21:51:40  madden
384  * Check return value on BlastKarlinBlkCalc
385  *
386  * Revision 6.20  1998/04/24 19:27:54  madden
387  * Added BlastKarlinBlkStandardCalcEx for ideal KarlinBlk
388  *
389  * Revision 6.19  1998/04/10 20:57:55  madden
390  * Added data for BLOSUM80 and BLOSUM45
391  *
392  * Revision 6.18  1998/04/10 15:05:48  madden
393  * Added pref_flags return by value to BlastKarlinGetMatrixValues
394  *
395  * Revision 6.17  1998/04/02 21:12:29  madden
396  * Added infinite values to the arrays returned by BlastKarlinGetMatrixValues
397  *
398  * Revision 6.16  1998/03/16 17:41:56  madden
399  * Fixed leaks
400  *
401  * Revision 6.15  1998/03/09 17:15:01  madden
402  * Added BlastKarlinGetMatrixValues
403  *
404  * Revision 6.14  1998/02/26 22:34:32  madden
405  * Changes for 16 bit windows
406  *
407  * Revision 6.13  1998/02/06 18:28:05  madden
408  * Added functions to produce pseudo-scores from p and e values
409  *
410  * Revision 6.12  1997/12/15 21:55:44  madden
411  * Added new parameters for BLOSUM62_20
412  *
413  * Revision 6.11  1997/12/11 22:21:02  madden
414  * Removed unused variables
415  *
416  * Revision 6.10  1997/12/10 22:41:58  madden
417  * proper casting done
418  *
419  * Revision 6.9  1997/12/01 22:07:55  madden
420  * Added missing values, fixed UMR
421  *
422  * Revision 6.8  1997/11/14 21:29:58  madden
423  * Stephens new parameters
424  *
425  * Revision 6.7  1997/11/14 17:14:52  madden
426  * Added Karlin parameter to matrix structure
427  *
428  * Revision 6.6  1997/11/07 22:27:29  madden
429  * Fix in BLAST_MatrixFill
430  *
431  * Revision 6.5  1997/11/07 00:48:07  madden
432  * Added defintitions and functions for BLAST_Matrix
433  *
434  * Revision 6.4  1997/10/30 15:41:01  madden
435  * Casts and fixes for DEC alpha
436  *
437  * Revision 6.3  1997/10/22 21:46:59  madden
438  * Added BLOSUM90, changed to better BLOSUM62 values
439  *
440  * Revision 6.2  1997/10/09 19:47:35  madden
441  * changes for 1/20th bit BLOSUM62 matrices
442  *
443  * Revision 6.1  1997/10/06 17:59:17  madden
444  * Added checks to BLAST_ScoreBlkDestruct
445  *
446  * Revision 6.0  1997/08/25 18:52:37  madden
447  * Revision changed to 6.0
448  *
449  * Revision 1.39  1997/07/18 20:56:13  madden
450  * Added more BLOSUM50 values
451  *
452  * Revision 1.38  1997/07/14 20:11:17  madden
453  * Removed unused variables
454  *
455  * Revision 1.37  1997/07/14 15:30:59  madden
456  * Changed call to BlastKarlinBlkGappedCalc
457  *
458  * Revision 1.36  1997/06/23 20:48:37  madden
459  * Added support for BLOSUM50 and PAM250 matrices
460  *
461  * Revision 1.35  1997/06/23 17:53:07  madden
462  * BlastKarlinBlkGappedCalc checks for valid values if KarlinBlkPtr NULL
463  *
464  * Revision 1.34  1997/05/01 15:53:13  madden
465  * Addition of extra KarlinBlk's for psi-blast
466  *
467  * Revision 1.33  1997/04/22  16:36:49  madden
468  * Added Robinson amino acid frequencies.
469  *
470  * Revision 1.32  1997/04/03  19:48:13  madden
471  * Changes to use effective database length instead of the length of each
472  * sequence in statistical calculations.
473  *
474  * Revision 1.31  1997/03/07  22:24:43  madden
475  * Closed File* for matrix after use.
476  *
477  * Revision 1.30  1997/02/20  21:33:51  madden
478  * Added FindPath call to blastkar.c to look for matrix.
479  *
480  * Revision 1.29  1997/02/06  23:15:43  madden
481  * Added additional gap extension and opening penalties.
482  *
483  * Revision 1.28  1997/02/05  19:54:59  madden
484  * *** empty log message ***
485  *
486  * Revision 1.27  1997/01/16  22:58:08  madden
487  * Secured BlastScoreFreqDestruct against NULL pointers.
488  *
489  * Revision 1.26  1997/01/13  17:15:29  madden
490  * Save matrix name for blastn type matrices.
491  *
492  * Revision 1.25  1996/12/11  17:59:42  madden
493  * Fixed purify leaks.
494  *
495  * Revision 1.24  1996/12/10  17:30:59  madden
496  * Changed statistics for gapped blastp
497  *
498  * Revision 1.23  1996/12/03  19:13:47  madden
499  * Made BlastRes functions non-static.
500  *
501  * Revision 1.22  1996/11/27  16:39:11  madden
502  * Added NULLB to end of array, purify nit.
503  *
504  * Revision 1.21  1996/11/26  20:38:01  madden
505  * Removed unused variable.
506  *
507  * Revision 1.20  1996/11/26  19:55:25  madden
508  * Added check for matrices in standard places.
509  *
510  * Revision 1.19  1996/11/18  19:32:09  madden
511  * Fixed uninitialized variable found by CodeWarrior.
512  *
513  * Revision 1.18  1996/11/18  14:49:26  madden
514  * Rewrote BlastScoreSetAmbigRes to take multiple ambig. chars.
515  *
516  * Revision 1.17  1996/11/08  21:45:03  madden
517  * Fix for blastn matrices.
518  *
519  * Revision 1.16  1996/11/07  22:31:15  madden
520  * Corrected Nucl. matrix for ambiguity characters.
521  *
522  * Revision 1.15  1996/10/04  20:12:26  madden
523  * Fixed memory leaks found by purify.
524  *
525  * Revision 1.14  1996/10/03  20:49:29  madden
526  * Added function BlastKarlinBlkStandardCalc to calculate standard
527  * Karlin parameters for blastx and tblastx.
528  *
529  * Revision 1.13  1996/10/03  18:04:48  madden
530  * Each context (or frame) now uses "proper" Karlin parameters.
531  *
532  * Revision 1.12  1996/10/03  13:07:55  madden
533  * Added return statement.
534  *
535  * Revision 1.11  1996/10/02  20:00:38  madden
536  * Calc. different Karlin parameters for each frame.
537  *
538  * Revision 1.10  1996/10/01  21:24:02  madden
539  * Corrected statistics for partial blastn matching.
540  *
541  * Revision 1.9  1996/09/30  21:56:12  madden
542  * Replaced query alphabet of ncbi2na with blastna alphabet.
543  *
544  * Revision 1.8  1996/09/11  20:08:55  madden
545  * *** empty log message ***
546  *
547  * Revision 1.6  1996/09/11  19:14:09  madden
548  * Changes to blastn matrix.
549  *
550  * Revision 1.5  1996/09/10  19:40:35  madden
551  * Added function BlastScoreBlkMatCreate for blastn matrices.
552  *
553  * Revision 1.4  1996/08/23  19:07:01  madden
554  * Adjust changes for NT warning to give correct results.
555  *
556  * Revision 1.3  1996/08/23  15:32:30  shavirin
557  * Fixed a lot of NT compiler warnings about type mismatch
558  *
559  * Revision 1.2  1996/08/22  20:38:11  madden
560  * Changed all doubles and floats to Nlm_FloatHi.
561  *
562  * Revision 1.1  1996/08/05  19:47:42  madden
563  * Initial revision
564  *
565  * Revision 1.26  1996/06/21  15:23:54  madden
566  * Corelibed BlastSumP.
567  *
568  * Revision 1.25  1996/06/21  15:14:55  madden
569  * Made some functions static, added LIBCALL to others.
570  *
571  * Revision 1.24  1996/06/20  16:52:23  madden
572  * Changed "pow" to "Nlm_Powi".
573  *
574  * Revision 1.23  1996/06/11  15:42:46  madden
575  * Replaced strncasempc with toolbox function.
576  *
577  * Revision 1.22  1996/05/29  12:44:04  madden
578  * Added function BlastRepresentativeResidues.
579  *
580  * Revision 1.21  1996/05/22  20:38:05  madden
581  * *** empty log message ***
582  *
583  * Revision 1.20  1996/05/22  20:21:23  madden
584  * removed unused variables.
585  *
586  * Revision 1.19  1996/05/20  21:18:51  madden
587  * Added BLASTMatrixStructure.
588  *
589  * Revision 1.18  1996/05/16  19:50:15  madden
590  * Added documentation block.
591  *
592  * Revision 1.17  1996/05/14  21:30:37  madden
593  * *** empty log message ***
594  *
595  * Revision 1.16  1996/04/16  15:33:41  madden
596  * Changes for backward-compatability with old blast.
597  *
598  * Revision 1.15  1996/03/25  16:34:19  madden
599  * Changes to mimic old statistics.
600  *
601  * Revision 1.14  1996/03/01  19:40:37  madden
602  * Added factorial correction to SmallGapSum.
603  *
604  * Revision 1.13  1996/02/28  21:38:08  madden
605  * *** empty log message ***
606  *
607  * Revision 1.12  1996/02/15  23:20:01  madden
608  * Added query length to a number of calls.
609  *
610  * Revision 1.12  1996/02/15  23:20:01  madden
611  * Added query length to a number of calls.
612  *
613  * Revision 1.11  1996/02/09  13:51:08  madden
614  * Change to prevent log sign error.
615  *
616  * Revision 1.10  1996/01/22  22:33:06  madden
617  * Changed effective length, fixed (improper) calculation of sum_e.
618  *
619  * Revision 1.9  1996/01/17  23:18:41  madden
620  * Added BlastScoreSetAmbigRes.
621  * ci -l blastkar.h
622  *
623  * Revision 1.8  1996/01/17  17:01:19  madden
624  * Fixed BlastSmallGapSumE  and BlastLargeGapSumE.
625  *
626  * Revision 1.7  1996/01/17  13:46:36  madden
627  * Added function BlastKarlinPtoE.
628  *
629  * Revision 1.6  1996/01/06  18:58:07  madden
630  * Added BlastSumP.
631  *
632  * Revision 1.5  1995/12/28  21:26:16  madden
633  * *** empty log message ***
634  *
635  * Revision 1.4  1995/12/26  23:04:46  madden
636  * removed GetBLAST0Matrix, Added "cutoff" functions.
637  *
638  * Revision 1.3  1995/12/26  20:27:47  madden
639  * Replaced BLAST_ScoreMat wiht BLAST_ScorePtr PNTR.
640  *
641  * Revision 1.2  1995/12/26  14:25:56  madden
642  * *** empty log message ***
643  *
644  * Revision 1.1  1995/12/21  23:07:35  madden
645  * Initial revision
646  *
647  * */
648 #include <ncbi.h>
649 #include <assert.h>
650 #include <ncbimath.h>
651 #include <objcode.h>
652 #include <objseq.h>
653 #include <sequtil.h>
654 #include <blastkar.h>
655 #include <blastpri.h>
656 
657 static Int2 BlastScoreFreqCalc PROTO((BLAST_ScoreBlkPtr sbp, BLAST_ScoreFreqPtr sfp, BLAST_ResFreqPtr rfp1, BLAST_ResFreqPtr rfp2));
658 
659 /* OSF1 apparently doesn't like this. */
660 #if defined(HUGE_VAL) && !defined(OS_UNIX_OSF1)
661 #define BLASTKAR_HUGE_VAL HUGE_VAL
662 #else
663 #define BLASTKAR_HUGE_VAL 1.e30
664 #endif
665 
666 
667 /* Allocates and Deallocates the two-dimensional matrix. */
668 static BLASTMatrixStructurePtr BlastMatrixAllocate PROTO((Int2 alphabet_size));
669 static BLASTMatrixStructurePtr BlastMatrixDestruct PROTO((BLASTMatrixStructurePtr matrix_struct));
670 
671 /* performs sump calculation, used by BlastSumPStd */
672 static Nlm_FloatHi BlastSumPCalc PROTO((int r, Nlm_FloatHi s));
673 
674 #define COMMENT_CHR	'#'
675 #define TOKSTR	" \t\n\r"
676 #define BLAST_MAX_ALPHABET 40 /* ncbistdaa is only 26, this should be enough */
677 
678 /*
679 	How many of the first bases are not ambiguous
680 	(it's four, of course).
681 */
682 #define NUMBER_NON_AMBIG_BP 4
683 /*
684 	translates between ncbi4na and blastna.  the first four elements
685 	of this array match ncbi2na.
686 */
687 
688 Uint1 ncbi4na_to_blastna[] = {
689 15,/* Gap, 0 */
690 0, /* A,   1 */
691 1, /* C,   2 */
692 6, /* M,   3 */
693 2, /* G,   4 */
694 4, /* R,   5 */
695 9, /* S,   6 */
696 13, /* V,   7 */
697 3, /* T,   8 */
698 8, /* W,   9 */
699  5, /* Y,  10 */
700  12, /* H,  11 */
701  7, /* K,  12 */
702  11, /* D,  13 */
703  10, /* B,  14 */
704  14  /* N,  15 */
705 };
706 
707 Uint1 blastna_to_ncbi4na[] = {	 1, /* A, 0 */
708 				 2, /* C, 1 */
709 				 4, /* G, 2 */
710 				 8, /* T, 3 */
711 				 5, /* R, 4 */
712 				10, /* Y, 5 */
713 				 3, /* M, 6 */
714 				12, /* K, 7 */
715 				 9, /* W, 8 */
716 				 6, /* S, 9 */
717 				14, /* B, 10 */
718 				13, /* D, 11 */
719 				11, /* H, 12 */
720 				 7, /* V, 13 */
721 				15, /* N, 14 */
722 				 0, /* Gap, 15 */
723 			};
724 
725 /* Used in BlastKarlinBlkGappedCalc */
726 typedef FloatHi array_of_8[8];
727 
728 /* Used to temporarily store matrix values for retrieval. */
729 
730 typedef struct _matrix_info {
731 	CharPtr		name;			/* name of matrix (e.g., BLOSUM90). */
732 	array_of_8 	*values;		/* The values (below). */
733 	Int4		*prefs;			/* Preferences for display. */
734 	Int4		max_number_values;	/* number of values (e.g., BLOSUM90_VALUES_MAX). */
735 } MatrixInfo, PNTR MatrixInfoPtr;
736 
737 
738 /**************************************************************************************
739 
740 How the statistical parameters for the matrices are stored:
741 -----------------------------------------------------------
742 They parameters are stored in a two-dimensional array FloatHi (i.e.,
743 doubles, which has as it's first dimensions the number of different
744 gap existence and extension combinations and as it's second dimension 8.
745 The eight different columns specify:
746 
747 1.) gap existence penalty (INT2_MAX denotes infinite).
748 2.) gap extension penalty (INT2_MAX denotes infinite).
749 3.) decline to align penalty (INT2_MAX denotes infinite).
750 4.) Lambda
751 5.) K
752 6.) H
753 7.) alpha
754 8.) beta
755 
756 (Items 4-8 are explained in:
757 Altschul SF, Bundschuh R, Olsen R, Hwa T.
758 The estimation of statistical parameters for local alignment score distributions.
759 Nucleic Acids Res. 2001 Jan 15;29(2):351-61.).
760 
761 Take BLOSUM45 (below) as an example.  Currently (5/17/02) there are
762 14 different allowed combinations (specified by "#define BLOSUM45_VALUES_MAX 14").
763 The first row in the array "blosum45_values" has INT2_MAX (i.e., 32767) for gap
764 existence, extension, and decline-to-align penalties.  For all practical purposes
765 this value is large enough to be infinite, so the alignments will be ungapped.
766 BLAST may also use this value (INT2_MAX) as a signal to skip gapping, so a
767 different value should not be used if the intent is to have gapless extensions.
768 The next row is for the gap existence penalty 13 and the extension penalty 3.
769 The decline-to-align penalty is only supported in a few cases, so it is normally
770 set to INT2_MAX.
771 
772 
773 How to add a new matrix to blastkar.c:
774 --------------------------------------
775 To add a new matrix to blastkar.c it is necessary to complete
776 four steps.  As an example consider adding the matrix
777 called TESTMATRIX
778 
779 1.) add a define specifying how many different existence and extensions
780 penalties are allowed, so it would be necessary to add the line:
781 
782 #define TESTMATRIX_VALUES_MAX 14
783 
784 if 14 values were to be allowed.
785 
786 2.) add a two-dimensional array to contain the statistical parameters:
787 
788 static Nlm_FloatHi testmatrix_values[TESTMATRIX_VALUES_MAX][8] ={ ...
789 
790 3.) add a "prefs" array that should hint about the "optimal"
791 gap existence and extension penalties:
792 
793 static Int4 testmatrix_prefs[TESTMATRIX_VALUES_MAX] = {
794 BLAST_MATRIX_NOMINAL,
795 ...
796 };
797 
798 4.) Go to the function BlastLoadMatrixValues (in this file) and
799 add two lines before the return at the end of the function:
800 
801         matrix_info = MatrixInfoNew("TESTMATRIX", testmatrix_values, testmatrix_prefs, TESTMATRIX_VALUES_MAX);
802         ValNodeAddPointer(&retval, 0, matrix_info);
803 
804 
805 
806 ***************************************************************************************/
807 
808 
809 
810 
811 
812 #define BLOSUM45_VALUES_MAX 14
813 static Nlm_FloatHi  blosum45_values[BLOSUM45_VALUES_MAX][8] = {
814     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.2291, 0.0924, 0.2514, 0.9113, -5.7},
815     {13, 3, (Nlm_FloatHi) INT2_MAX, 0.207, 0.049, 0.14, 1.5, -22},
816     {12, 3, (Nlm_FloatHi) INT2_MAX, 0.199, 0.039, 0.11, 1.8, -34},
817     {11, 3, (Nlm_FloatHi) INT2_MAX, 0.190, 0.031, 0.095, 2.0, -38},
818     {10, 3, (Nlm_FloatHi) INT2_MAX, 0.179, 0.023, 0.075, 2.4, -51},
819     {16, 2, (Nlm_FloatHi) INT2_MAX, 0.210, 0.051, 0.14, 1.5, -24},
820     {15, 2, (Nlm_FloatHi) INT2_MAX, 0.203, 0.041, 0.12, 1.7, -31},
821     {14, 2, (Nlm_FloatHi) INT2_MAX, 0.195, 0.032, 0.10, 1.9, -36},
822     {13, 2, (Nlm_FloatHi) INT2_MAX, 0.185, 0.024, 0.084, 2.2, -45},
823     {12, 2, (Nlm_FloatHi) INT2_MAX, 0.171, 0.016, 0.061, 2.8, -65},
824     {19, 1, (Nlm_FloatHi) INT2_MAX, 0.205, 0.040, 0.11, 1.9, -43},
825     {18, 1, (Nlm_FloatHi) INT2_MAX, 0.198, 0.032, 0.10, 2.0, -43},
826     {17, 1, (Nlm_FloatHi) INT2_MAX, 0.189, 0.024, 0.079, 2.4, -57},
827     {16, 1, (Nlm_FloatHi) INT2_MAX, 0.176, 0.016, 0.063, 2.8, -67},
828 };
829 
830 static Int4 blosum45_prefs[BLOSUM45_VALUES_MAX] = {
831 BLAST_MATRIX_NOMINAL,
832 BLAST_MATRIX_NOMINAL,
833 BLAST_MATRIX_NOMINAL,
834 BLAST_MATRIX_NOMINAL,
835 BLAST_MATRIX_NOMINAL,
836 BLAST_MATRIX_NOMINAL,
837 BLAST_MATRIX_NOMINAL,
838 BLAST_MATRIX_BEST,
839 BLAST_MATRIX_NOMINAL,
840 BLAST_MATRIX_NOMINAL,
841 BLAST_MATRIX_NOMINAL,
842 BLAST_MATRIX_NOMINAL,
843 BLAST_MATRIX_NOMINAL,
844 BLAST_MATRIX_NOMINAL
845 };
846 
847 
848 #define BLOSUM50_VALUES_MAX 16
849 static Nlm_FloatHi  blosum50_values[BLOSUM50_VALUES_MAX][8] = {
850     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.2318, 0.112, 0.3362, 0.6895, -4.0},
851     {13, 3, (Nlm_FloatHi) INT2_MAX, 0.212, 0.063, 0.19, 1.1, -16},
852     {12, 3, (Nlm_FloatHi) INT2_MAX, 0.206, 0.055, 0.17, 1.2, -18},
853     {11, 3, (Nlm_FloatHi) INT2_MAX, 0.197, 0.042, 0.14, 1.4, -25},
854     {10, 3, (Nlm_FloatHi) INT2_MAX, 0.186, 0.031, 0.11, 1.7, -34},
855     {9, 3, (Nlm_FloatHi) INT2_MAX, 0.172, 0.022, 0.082, 2.1, -48},
856     {16, 2, (Nlm_FloatHi) INT2_MAX, 0.215, 0.066, 0.20, 1.05, -15},
857     {15, 2, (Nlm_FloatHi) INT2_MAX, 0.210, 0.058, 0.17, 1.2, -20},
858     {14, 2, (Nlm_FloatHi) INT2_MAX, 0.202, 0.045, 0.14, 1.4, -27},
859     {13, 2, (Nlm_FloatHi) INT2_MAX, 0.193, 0.035, 0.12, 1.6, -32},
860     {12, 2, (Nlm_FloatHi) INT2_MAX, 0.181, 0.025, 0.095, 1.9, -41},
861     {19, 1, (Nlm_FloatHi) INT2_MAX, 0.212, 0.057, 0.18, 1.2, -21},
862     {18, 1, (Nlm_FloatHi) INT2_MAX, 0.207, 0.050, 0.15, 1.4, -28},
863     {17, 1, (Nlm_FloatHi) INT2_MAX, 0.198, 0.037, 0.12, 1.6, -33},
864     {16, 1, (Nlm_FloatHi) INT2_MAX, 0.186, 0.025, 0.10, 1.9, -42},
865     {15, 1, (Nlm_FloatHi) INT2_MAX, 0.171, 0.015, 0.063, 2.7, -76},
866 };
867 
868 static Int4 blosum50_prefs[BLOSUM50_VALUES_MAX] = {
869 BLAST_MATRIX_NOMINAL,
870 BLAST_MATRIX_NOMINAL,
871 BLAST_MATRIX_NOMINAL,
872 BLAST_MATRIX_NOMINAL,
873 BLAST_MATRIX_NOMINAL,
874 BLAST_MATRIX_NOMINAL,
875 BLAST_MATRIX_NOMINAL,
876 BLAST_MATRIX_NOMINAL,
877 BLAST_MATRIX_NOMINAL,
878 BLAST_MATRIX_BEST,
879 BLAST_MATRIX_NOMINAL,
880 BLAST_MATRIX_NOMINAL,
881 BLAST_MATRIX_NOMINAL,
882 BLAST_MATRIX_NOMINAL,
883 BLAST_MATRIX_NOMINAL,
884 BLAST_MATRIX_NOMINAL
885 };
886 
887 #define BLOSUM62_VALUES_MAX 12
888 static Nlm_FloatHi  blosum62_values[BLOSUM62_VALUES_MAX][8] = {
889     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3176, 0.134, 0.4012, 0.7916, -3.2},
890     {11, 2, (Nlm_FloatHi) INT2_MAX, 0.297, 0.082, 0.27, 1.1, -10},
891     {10, 2, (Nlm_FloatHi) INT2_MAX, 0.291, 0.075, 0.23, 1.3, -15},
892     {9, 2, (Nlm_FloatHi) INT2_MAX, 0.279, 0.058, 0.19, 1.5, -19},
893     {8, 2, (Nlm_FloatHi) INT2_MAX, 0.264, 0.045, 0.15, 1.8, -26},
894     {7, 2, (Nlm_FloatHi) INT2_MAX, 0.239, 0.027, 0.10, 2.5, -46},
895     {6, 2, (Nlm_FloatHi) INT2_MAX, 0.201, 0.012, 0.061, 3.3, -58},
896     {13, 1, (Nlm_FloatHi) INT2_MAX, 0.292, 0.071, 0.23, 1.2, -11},
897     {12, 1, (Nlm_FloatHi) INT2_MAX, 0.283, 0.059, 0.19, 1.5, -19},
898     {11, 1, (Nlm_FloatHi) INT2_MAX, 0.267, 0.041, 0.14, 1.9, -30},
899     {10, 1, (Nlm_FloatHi) INT2_MAX, 0.243, 0.024, 0.10, 2.5, -44},
900     {9, 1, (Nlm_FloatHi) INT2_MAX, 0.206, 0.010, 0.052, 4.0, -87},
901 };
902 
903 static Int4 blosum62_prefs[BLOSUM62_VALUES_MAX] = {
904     BLAST_MATRIX_NOMINAL,
905     BLAST_MATRIX_NOMINAL,
906     BLAST_MATRIX_NOMINAL,
907     BLAST_MATRIX_NOMINAL,
908     BLAST_MATRIX_NOMINAL,
909     BLAST_MATRIX_NOMINAL,
910     BLAST_MATRIX_NOMINAL,
911     BLAST_MATRIX_NOMINAL,
912     BLAST_MATRIX_NOMINAL,
913     BLAST_MATRIX_BEST,
914     BLAST_MATRIX_NOMINAL,
915     BLAST_MATRIX_NOMINAL,
916 };
917 
918 
919 #define BLOSUM80_VALUES_MAX 10
920 static Nlm_FloatHi  blosum80_values[BLOSUM80_VALUES_MAX][8] = {
921     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3430, 0.177, 0.6568, 0.5222, -1.6},
922     {25, 2, (Nlm_FloatHi) INT2_MAX, 0.342, 0.17, 0.66, 0.52, -1.6},
923     {13, 2, (Nlm_FloatHi) INT2_MAX, 0.336, 0.15, 0.57, 0.59, -3},
924     {9, 2, (Nlm_FloatHi) INT2_MAX, 0.319, 0.11, 0.42, 0.76, -6},
925     {8, 2, (Nlm_FloatHi) INT2_MAX, 0.308, 0.090, 0.35, 0.89, -9},
926     {7, 2, (Nlm_FloatHi) INT2_MAX, 0.293, 0.070, 0.27, 1.1, -14},
927     {6, 2, (Nlm_FloatHi) INT2_MAX, 0.268, 0.045, 0.19, 1.4, -19},
928     {11, 1, (Nlm_FloatHi) INT2_MAX, 0.314, 0.095, 0.35, 0.90, -9},
929     {10, 1, (Nlm_FloatHi) INT2_MAX, 0.299, 0.071, 0.27, 1.1, -14},
930     {9, 1, (Nlm_FloatHi) INT2_MAX, 0.279, 0.048, 0.20, 1.4, -19},
931 };
932 
933 static Int4 blosum80_prefs[BLOSUM80_VALUES_MAX] = {
934     BLAST_MATRIX_NOMINAL,
935     BLAST_MATRIX_NOMINAL,
936     BLAST_MATRIX_NOMINAL,
937     BLAST_MATRIX_NOMINAL,
938     BLAST_MATRIX_NOMINAL,
939     BLAST_MATRIX_NOMINAL,
940     BLAST_MATRIX_NOMINAL,
941     BLAST_MATRIX_BEST,
942     BLAST_MATRIX_NOMINAL
943 };
944 
945 #define BLOSUM90_VALUES_MAX 8
946 static Nlm_FloatHi  blosum90_values[BLOSUM90_VALUES_MAX][8] = {
947     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3346, 0.190, 0.7547, 0.4434, -1.4},
948     {9, 2, (Nlm_FloatHi) INT2_MAX, 0.310, 0.12, 0.46, 0.67, -6},
949     {8, 2, (Nlm_FloatHi) INT2_MAX, 0.300, 0.099, 0.39, 0.76, -7},
950     {7, 2, (Nlm_FloatHi) INT2_MAX, 0.283, 0.072, 0.30, 0.93, -11},
951     {6, 2, (Nlm_FloatHi) INT2_MAX, 0.259, 0.048, 0.22, 1.2, -16},
952     {11, 1, (Nlm_FloatHi) INT2_MAX, 0.302, 0.093, 0.39, 0.78, -8},
953     {10, 1, (Nlm_FloatHi) INT2_MAX, 0.290, 0.075, 0.28, 1.04, -15},
954     {9, 1, (Nlm_FloatHi) INT2_MAX, 0.265, 0.044, 0.20, 1.3, -19},
955 };
956 
957 static Int4 blosum90_prefs[BLOSUM90_VALUES_MAX] = {
958 	BLAST_MATRIX_NOMINAL,
959 	BLAST_MATRIX_NOMINAL,
960 	BLAST_MATRIX_NOMINAL,
961 	BLAST_MATRIX_NOMINAL,
962 	BLAST_MATRIX_NOMINAL,
963 	BLAST_MATRIX_NOMINAL,
964 	BLAST_MATRIX_BEST,
965 	BLAST_MATRIX_NOMINAL
966 };
967 
968 #define PAM250_VALUES_MAX 16
969 static Nlm_FloatHi  pam250_values[PAM250_VALUES_MAX][8] = {
970     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.2252, 0.0868, 0.2223, 0.98, -5.0},
971     {15, 3, (Nlm_FloatHi) INT2_MAX, 0.205, 0.049, 0.13, 1.6, -23},
972     {14, 3, (Nlm_FloatHi) INT2_MAX, 0.200, 0.043, 0.12, 1.7, -26},
973     {13, 3, (Nlm_FloatHi) INT2_MAX, 0.194, 0.036, 0.10, 1.9, -31},
974     {12, 3, (Nlm_FloatHi) INT2_MAX, 0.186, 0.029, 0.085, 2.2, -41},
975     {11, 3, (Nlm_FloatHi) INT2_MAX, 0.174, 0.020, 0.070, 2.5, -48},
976     {17, 2, (Nlm_FloatHi) INT2_MAX, 0.204, 0.047, 0.12, 1.7, -28},
977     {16, 2, (Nlm_FloatHi) INT2_MAX, 0.198, 0.038, 0.11, 1.8, -29},
978     {15, 2, (Nlm_FloatHi) INT2_MAX, 0.191, 0.031, 0.087, 2.2, -44},
979     {14, 2, (Nlm_FloatHi) INT2_MAX, 0.182, 0.024, 0.073, 2.5, -53},
980     {13, 2, (Nlm_FloatHi) INT2_MAX, 0.171, 0.017, 0.059, 2.9, -64},
981     {21, 1, (Nlm_FloatHi) INT2_MAX, 0.205, 0.045, 0.11, 1.8, -34},
982     {20, 1, (Nlm_FloatHi) INT2_MAX, 0.199, 0.037, 0.10, 1.9, -35},
983     {19, 1, (Nlm_FloatHi) INT2_MAX, 0.192, 0.029, 0.083, 2.3, -52},
984     {18, 1, (Nlm_FloatHi) INT2_MAX, 0.183, 0.021, 0.070, 2.6, -60},
985     {17, 1, (Nlm_FloatHi) INT2_MAX, 0.171, 0.014, 0.052, 3.3, -86},
986 };
987 
988 static Int4 pam250_prefs[PAM250_VALUES_MAX] = {
989 BLAST_MATRIX_NOMINAL,
990 BLAST_MATRIX_NOMINAL,
991 BLAST_MATRIX_NOMINAL,
992 BLAST_MATRIX_NOMINAL,
993 BLAST_MATRIX_NOMINAL,
994 BLAST_MATRIX_NOMINAL,
995 BLAST_MATRIX_NOMINAL,
996 BLAST_MATRIX_NOMINAL,
997 BLAST_MATRIX_BEST,
998 BLAST_MATRIX_NOMINAL,
999 BLAST_MATRIX_NOMINAL,
1000 BLAST_MATRIX_NOMINAL,
1001 BLAST_MATRIX_NOMINAL,
1002 BLAST_MATRIX_NOMINAL,
1003 BLAST_MATRIX_NOMINAL,
1004 BLAST_MATRIX_NOMINAL
1005 };
1006 
1007 #define PAM30_VALUES_MAX 7
1008 static Nlm_FloatHi  pam30_values[PAM30_VALUES_MAX][8] = {
1009     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3400, 0.283, 1.754, 0.1938, -0.3},
1010     {7, 2, (Nlm_FloatHi) INT2_MAX, 0.305, 0.15, 0.87, 0.35, -3},
1011     {6, 2, (Nlm_FloatHi) INT2_MAX, 0.287, 0.11, 0.68, 0.42, -4},
1012     {5, 2, (Nlm_FloatHi) INT2_MAX, 0.264, 0.079, 0.45, 0.59, -7},
1013     {10, 1, (Nlm_FloatHi) INT2_MAX, 0.309, 0.15, 0.88, 0.35, -3},
1014     {9, 1, (Nlm_FloatHi) INT2_MAX, 0.294, 0.11, 0.61, 0.48, -6},
1015     {8, 1, (Nlm_FloatHi) INT2_MAX, 0.270, 0.072, 0.40, 0.68, -10},
1016 };
1017 
1018 static Int4 pam30_prefs[PAM30_VALUES_MAX] = {
1019 BLAST_MATRIX_NOMINAL,
1020 BLAST_MATRIX_NOMINAL,
1021 BLAST_MATRIX_NOMINAL,
1022 BLAST_MATRIX_NOMINAL,
1023 BLAST_MATRIX_NOMINAL,
1024 BLAST_MATRIX_BEST,
1025 BLAST_MATRIX_NOMINAL,
1026 };
1027 
1028 
1029 #define PAM70_VALUES_MAX 7
1030 static Nlm_FloatHi  pam70_values[PAM70_VALUES_MAX][8] = {
1031     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3345, 0.229, 1.029, 0.3250,   -0.7},
1032     {8, 2, (Nlm_FloatHi) INT2_MAX, 0.301, 0.12, 0.54, 0.56, -5},
1033     {7, 2, (Nlm_FloatHi) INT2_MAX, 0.286, 0.093, 0.43, 0.67, -7},
1034     {6, 2, (Nlm_FloatHi) INT2_MAX, 0.264, 0.064, 0.29, 0.90, -12},
1035     {11, 1, (Nlm_FloatHi) INT2_MAX, 0.305, 0.12, 0.52, 0.59, -6},
1036     {10, 1, (Nlm_FloatHi) INT2_MAX, 0.291, 0.091, 0.41, 0.71, -9},
1037     {9, 1, (Nlm_FloatHi) INT2_MAX, 0.270, 0.060, 0.28, 0.97, -14},
1038 };
1039 
1040 static Int4 pam70_prefs[PAM70_VALUES_MAX] = {
1041 BLAST_MATRIX_NOMINAL,
1042 BLAST_MATRIX_NOMINAL,
1043 BLAST_MATRIX_NOMINAL,
1044 BLAST_MATRIX_NOMINAL,
1045 BLAST_MATRIX_NOMINAL,
1046 BLAST_MATRIX_BEST,
1047 BLAST_MATRIX_NOMINAL
1048 };
1049 
1050 
1051 
1052 #define BLOSUM62_20_VALUES_MAX 65
1053 static Nlm_FloatHi  blosum62_20_values[BLOSUM62_20_VALUES_MAX][8] = {
1054     {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.03391, 0.125, 0.4544, 0.07462, -3.2},
1055     {100, 12, (Nlm_FloatHi) INT2_MAX, 0.0300, 0.056, 0.21, 0.14, -15},
1056     {95, 12, (Nlm_FloatHi) INT2_MAX, 0.0291, 0.047, 0.18, 0.16, -20},
1057     {90, 12, (Nlm_FloatHi) INT2_MAX, 0.0280, 0.038, 0.15, 0.19, -28},
1058     {85, 12, (Nlm_FloatHi) INT2_MAX, 0.0267, 0.030, 0.13, 0.21, -31},
1059     {80, 12, (Nlm_FloatHi) INT2_MAX, 0.0250, 0.021, 0.10, 0.25, -39},
1060     {105, 11, (Nlm_FloatHi) INT2_MAX, 0.0301, 0.056, 0.22, 0.14, -16},
1061     {100, 11, (Nlm_FloatHi) INT2_MAX, 0.0294, 0.049, 0.20, 0.15, -17},
1062     {95, 11, (Nlm_FloatHi) INT2_MAX, 0.0285, 0.042, 0.16, 0.18, -25},
1063     {90, 11, (Nlm_FloatHi) INT2_MAX, 0.0271, 0.031, 0.14, 0.20, -28},
1064     {85, 11, (Nlm_FloatHi) INT2_MAX, 0.0256, 0.023, 0.10, 0.26, -46},
1065     {115, 10, (Nlm_FloatHi) INT2_MAX, 0.0308, 0.062, 0.22, 0.14, -20},
1066     {110, 10, (Nlm_FloatHi) INT2_MAX, 0.0302, 0.056, 0.19, 0.16, -26},
1067     {105, 10, (Nlm_FloatHi) INT2_MAX, 0.0296, 0.050, 0.17, 0.17, -27},
1068     {100, 10, (Nlm_FloatHi) INT2_MAX, 0.0286, 0.041, 0.15, 0.19, -32},
1069     {95, 10, (Nlm_FloatHi) INT2_MAX, 0.0272, 0.030, 0.13, 0.21, -35},
1070     {90, 10, (Nlm_FloatHi) INT2_MAX, 0.0257, 0.022, 0.11, 0.24, -40},
1071     {85, 10, (Nlm_FloatHi) INT2_MAX, 0.0242, 0.017, 0.083, 0.29, -51},
1072     {115, 9, (Nlm_FloatHi) INT2_MAX, 0.0306, 0.061, 0.24, 0.13, -14},
1073     {110, 9, (Nlm_FloatHi) INT2_MAX, 0.0299, 0.053, 0.19, 0.16, -23},
1074     {105, 9, (Nlm_FloatHi) INT2_MAX, 0.0289, 0.043, 0.17, 0.17, -23},
1075     {100, 9, (Nlm_FloatHi) INT2_MAX, 0.0279, 0.036, 0.14, 0.20, -31},
1076     {95, 9, (Nlm_FloatHi) INT2_MAX, 0.0266, 0.028, 0.12, 0.23, -37},
1077     {120, 8, (Nlm_FloatHi) INT2_MAX, 0.0307, 0.062, 0.22, 0.14, -18},
1078     {115, 8, (Nlm_FloatHi) INT2_MAX, 0.0300, 0.053, 0.20, 0.15, -19},
1079     {110, 8, (Nlm_FloatHi) INT2_MAX, 0.0292, 0.046, 0.17, 0.17, -23},
1080     {105, 8, (Nlm_FloatHi) INT2_MAX, 0.0280, 0.035, 0.14, 0.20, -31},
1081     {100, 8, (Nlm_FloatHi) INT2_MAX, 0.0266, 0.026, 0.12, 0.23, -37},
1082     {125, 7, (Nlm_FloatHi) INT2_MAX, 0.0306, 0.058, 0.22, 0.14, -18},
1083     {120, 7, (Nlm_FloatHi) INT2_MAX, 0.0300, 0.052, 0.19, 0.16, -23},
1084     {115, 7, (Nlm_FloatHi) INT2_MAX, 0.0292, 0.044, 0.17, 0.17, -24},
1085     {110, 7, (Nlm_FloatHi) INT2_MAX, 0.0279, 0.032, 0.14, 0.20, -31},
1086     {105, 7, (Nlm_FloatHi) INT2_MAX, 0.0267, 0.026, 0.11, 0.24, -41},
1087     {120,10,5, 0.0298, 0.049, 0.19, 0.16, -21},
1088     {115,10,5, 0.0290, 0.042, 0.16, 0.18, -25},
1089     {110,10,5, 0.0279, 0.033, 0.13, 0.21, -32},
1090     {105,10,5, 0.0264, 0.024, 0.10, 0.26, -46},
1091     {100,10,5, 0.0250, 0.018, 0.081, 0.31, -56},
1092     {125,10,4, 0.0301, 0.053, 0.18, 0.17, -25},
1093     {120,10,4, 0.0292, 0.043, 0.15, 0.20, -33},
1094     {115,10,4, 0.0282, 0.035, 0.13, 0.22, -36},
1095     {110,10,4, 0.0270, 0.027, 0.11, 0.25, -41},
1096     {105,10,4, 0.0254, 0.020, 0.079, 0.32, -60},
1097     {130,10,3, 0.0300, 0.051, 0.17, 0.18, -27},
1098     {125,10,3, 0.0290, 0.040, 0.13, 0.22, -38},
1099     {120,10,3, 0.0278, 0.030, 0.11, 0.25, -44},
1100     {115,10,3, 0.0267, 0.025, 0.092, 0.29, -52},
1101     {110,10,3, 0.0252, 0.018, 0.070, 0.36, -70},
1102     {135,10,2, 0.0292, 0.040, 0.13, 0.22, -35},
1103     {130,10,2, 0.0283, 0.034, 0.10, 0.28, -51},
1104     {125,10,2, 0.0269, 0.024, 0.077, 0.35, -71},
1105     {120,10,2, 0.0253, 0.017, 0.059, 0.43, -90},
1106     {115,10,2, 0.0234, 0.011, 0.043, 0.55, -121},
1107     {100,14,3, 0.0258, 0.023, 0.087, 0.33, -59},
1108     {105,13,3, 0.0263, 0.024, 0.085, 0.31, -57},
1109     {110,12,3, 0.0271, 0.028, 0.093, 0.29, -54},
1110     {115,11,3, 0.0275, 0.030, 0.10, 0.27, -49},
1111     {125,9,3, 0.0283, 0.034, 0.12, 0.23, -38},
1112     {130,8,3, 0.0287, 0.037, 0.12, 0.23, -40},
1113     {125,7,3, 0.0287, 0.036, 0.12, 0.24, -44},
1114     {140,6,3, 0.0285, 0.033, 0.12, 0.23, -40},
1115     {105,14,3, 0.0270, 0.028, 0.10, 0.27, -46},
1116     {110,13,3, 0.0279, 0.034, 0.10, 0.27, -50},
1117     {115,12,3, 0.0282, 0.035, 0.12, 0.24, -42},
1118     {120,11,3, 0.0286, 0.037, 0.12, 0.24, -44},
1119 };
1120 
1121 static Int4 blosum62_20_prefs[BLOSUM62_20_VALUES_MAX] = {
1122 BLAST_MATRIX_NOMINAL,
1123 BLAST_MATRIX_NOMINAL,
1124 BLAST_MATRIX_NOMINAL,
1125 BLAST_MATRIX_NOMINAL,
1126 BLAST_MATRIX_NOMINAL,
1127 BLAST_MATRIX_NOMINAL,
1128 BLAST_MATRIX_NOMINAL,
1129 BLAST_MATRIX_NOMINAL,
1130 BLAST_MATRIX_NOMINAL,
1131 BLAST_MATRIX_NOMINAL,
1132 BLAST_MATRIX_NOMINAL,
1133 BLAST_MATRIX_NOMINAL,
1134 BLAST_MATRIX_NOMINAL,
1135 BLAST_MATRIX_NOMINAL,
1136 BLAST_MATRIX_NOMINAL,
1137 BLAST_MATRIX_NOMINAL,
1138 BLAST_MATRIX_NOMINAL,
1139 BLAST_MATRIX_NOMINAL,
1140 BLAST_MATRIX_NOMINAL,
1141 BLAST_MATRIX_NOMINAL,
1142 BLAST_MATRIX_NOMINAL,
1143 BLAST_MATRIX_NOMINAL,
1144 BLAST_MATRIX_NOMINAL,
1145 BLAST_MATRIX_NOMINAL,
1146 BLAST_MATRIX_NOMINAL,
1147 BLAST_MATRIX_NOMINAL,
1148 BLAST_MATRIX_NOMINAL,
1149 BLAST_MATRIX_NOMINAL,
1150 BLAST_MATRIX_NOMINAL,
1151 BLAST_MATRIX_NOMINAL,
1152 BLAST_MATRIX_NOMINAL,
1153 BLAST_MATRIX_NOMINAL,
1154 BLAST_MATRIX_NOMINAL,
1155 BLAST_MATRIX_NOMINAL,
1156 BLAST_MATRIX_NOMINAL,
1157 BLAST_MATRIX_NOMINAL,
1158 BLAST_MATRIX_NOMINAL,
1159 BLAST_MATRIX_NOMINAL,
1160 BLAST_MATRIX_NOMINAL,
1161 BLAST_MATRIX_NOMINAL,
1162 BLAST_MATRIX_NOMINAL,
1163 BLAST_MATRIX_NOMINAL,
1164 BLAST_MATRIX_NOMINAL,
1165 BLAST_MATRIX_NOMINAL,
1166 BLAST_MATRIX_NOMINAL,
1167 BLAST_MATRIX_BEST,
1168 BLAST_MATRIX_NOMINAL,
1169 BLAST_MATRIX_NOMINAL,
1170 BLAST_MATRIX_NOMINAL,
1171 BLAST_MATRIX_NOMINAL,
1172 BLAST_MATRIX_NOMINAL,
1173 BLAST_MATRIX_NOMINAL,
1174 BLAST_MATRIX_NOMINAL,
1175 BLAST_MATRIX_NOMINAL,
1176 BLAST_MATRIX_NOMINAL,
1177 BLAST_MATRIX_NOMINAL,
1178 BLAST_MATRIX_NOMINAL,
1179 BLAST_MATRIX_NOMINAL,
1180 BLAST_MATRIX_NOMINAL,
1181 BLAST_MATRIX_NOMINAL,
1182 BLAST_MATRIX_NOMINAL,
1183 BLAST_MATRIX_NOMINAL,
1184 BLAST_MATRIX_NOMINAL,
1185 BLAST_MATRIX_NOMINAL,
1186 BLAST_MATRIX_NOMINAL
1187 };
1188 
1189 /*
1190 	Allocates memory for the BLAST_ScoreBlkPtr.
1191 */
1192 
1193 BLAST_ScoreBlkPtr LIBCALL
BLAST_ScoreBlkNew(Uint1 alphabet,Int2 number_of_contexts)1194 BLAST_ScoreBlkNew(Uint1 alphabet, Int2 number_of_contexts)
1195 
1196 {
1197 	BLAST_ScoreBlkPtr sbp;
1198 	SeqCodeTablePtr sctp;
1199 
1200 	if (alphabet != BLASTNA_SEQ_CODE)
1201 	{
1202 		sctp = SeqCodeTableFindObj(alphabet);
1203 		if (sctp == NULL)
1204 			return NULL;
1205 	}
1206 
1207 	sbp = (BLAST_ScoreBlkPtr) MemNew(sizeof(BLAST_ScoreBlk));
1208 
1209 	if (sbp != NULL)
1210 	{
1211 		sbp->alphabet_code = alphabet;
1212 		if (alphabet != BLASTNA_SEQ_CODE)
1213 			sbp->alphabet_size = sctp->num;
1214 		else
1215 			sbp->alphabet_size = BLASTNA_SIZE;
1216 
1217 /* set the alphabet type (protein or not). */
1218 		switch (alphabet) {
1219 			case Seq_code_iupacaa:
1220 			case Seq_code_ncbi8aa:
1221 			case Seq_code_ncbieaa:
1222 			case Seq_code_ncbipaa:
1223 			case Seq_code_iupacaa3:
1224 				return NULL;
1225 
1226 			case Seq_code_ncbistdaa:
1227 				sbp->protein_alphabet = TRUE;
1228 				break;
1229 			case BLASTNA_SEQ_CODE:
1230 				sbp->protein_alphabet = FALSE;
1231 				break;
1232 			default:
1233 				break;
1234 		}
1235 
1236 		sbp->matrix_struct = BlastMatrixAllocate(sbp->alphabet_size);
1237 		if (sbp->matrix_struct == NULL)
1238 		{
1239 			sbp = BLAST_ScoreBlkDestruct(sbp);
1240 			return sbp;
1241 		}
1242 		sbp->matrix = sbp->matrix_struct->matrix;
1243 		sbp->maxscore = (BLAST_ScorePtr) MemNew(BLAST_MATRIX_SIZE*sizeof(BLAST_Score));
1244 		sbp->number_of_contexts = number_of_contexts;
1245 		sbp->sfp = MemNew(sbp->number_of_contexts*sizeof(BLAST_ScoreFreqPtr));
1246 		sbp->kbp_std = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1247 		sbp->kbp_gap_std = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1248 		sbp->kbp_psi = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1249 		sbp->kbp_gap_psi = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1250 	}
1251 
1252 	return sbp;
1253 }
1254 
1255 BLAST_ScoreBlkPtr LIBCALL
BLAST_ScoreBlkDestruct(BLAST_ScoreBlkPtr sbp)1256 BLAST_ScoreBlkDestruct(BLAST_ScoreBlkPtr sbp)
1257 
1258 {
1259     Int4 index, rows;
1260     if (sbp == NULL)
1261         return NULL;
1262 
1263     for (index=0; index<sbp->number_of_contexts; index++) {
1264         if (sbp->sfp)
1265             sbp->sfp[index] = BlastScoreFreqDestruct(sbp->sfp[index]);
1266         if (sbp->kbp_std)
1267             sbp->kbp_std[index] = BlastKarlinBlkDestruct(sbp->kbp_std[index]);
1268         if (sbp->kbp_gap_std)
1269             sbp->kbp_gap_std[index] = BlastKarlinBlkDestruct(sbp->kbp_gap_std[index]);
1270         if (sbp->kbp_psi)
1271             sbp->kbp_psi[index] = BlastKarlinBlkDestruct(sbp->kbp_psi[index]);
1272         if (sbp->kbp_gap_psi)
1273             sbp->kbp_gap_psi[index] = BlastKarlinBlkDestruct(sbp->kbp_gap_psi[index]);
1274     }
1275     if (sbp->kbp_ideal)
1276         sbp->kbp_ideal = BlastKarlinBlkDestruct(sbp->kbp_ideal);
1277     sbp->sfp = MemFree(sbp->sfp);
1278     sbp->kbp_std = MemFree(sbp->kbp_std);
1279     sbp->kbp_psi = MemFree(sbp->kbp_psi);
1280     sbp->kbp_gap_std = MemFree(sbp->kbp_gap_std);
1281     sbp->kbp_gap_psi = MemFree(sbp->kbp_gap_psi);
1282     sbp->matrix_struct = BlastMatrixDestruct(sbp->matrix_struct);
1283     sbp->maxscore = MemFree(sbp->maxscore);
1284     sbp->comments = ValNodeFreeData(sbp->comments);
1285     sbp->name = MemFree(sbp->name);
1286     sbp->ambiguous_res = MemFree(sbp->ambiguous_res);
1287 
1288     /* Removing posMatrix and posFreqs if any */
1289     rows = sbp->query_length + 1;
1290     if(sbp->posMatrix != NULL) {
1291         for (index=0; index < rows; index++) {
1292             MemFree(sbp->posMatrix[index]);
1293         }
1294         MemFree(sbp->posMatrix);
1295     }
1296 
1297     if(sbp->posFreqs != NULL) {
1298         for (index = 0; index < rows; index++) {
1299             MemFree(sbp->posFreqs[index]);
1300         }
1301         MemFree(sbp->posFreqs);
1302     }
1303 
1304     sbp = MemFree(sbp);
1305 
1306     return sbp;
1307 }
1308 
1309 /*
1310 	Set the ambiguous residue (e.g, 'N', 'X') in the BLAST_ScoreBlkPtr.
1311 	Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first.
1312 */
1313 Int2 LIBCALL
BlastScoreSetAmbigRes(BLAST_ScoreBlkPtr sbp,Char ambiguous_res)1314 BlastScoreSetAmbigRes(BLAST_ScoreBlkPtr sbp, Char ambiguous_res)
1315 
1316 {
1317 	Int2 index;
1318 	SeqMapTablePtr smtp;
1319 	Uint1Ptr ambig_buffer;
1320 
1321 	if (sbp == NULL)
1322 		return 1;
1323 
1324 	if (sbp->ambig_occupy >= sbp->ambig_size)
1325 	{
1326 		sbp->ambig_size += 5;
1327 		ambig_buffer = MemNew(sbp->ambig_size*sizeof(Uint1));
1328 		for (index=0; index<sbp->ambig_occupy; index++)
1329 		{
1330 			ambig_buffer[index] = sbp->ambiguous_res[index];
1331 		}
1332 		sbp->ambiguous_res = MemFree(sbp->ambiguous_res);
1333 		sbp->ambiguous_res = ambig_buffer;
1334 	}
1335 
1336 	if (sbp->alphabet_code == Seq_code_ncbistdaa)
1337 	{
1338 		smtp = SeqMapTableFind(sbp->alphabet_code, Seq_code_ncbieaa);
1339 		sbp->ambiguous_res[sbp->ambig_occupy] = SeqMapTableConvert(smtp, ambiguous_res);
1340 	}
1341 	else if (sbp->alphabet_code == BLASTNA_SEQ_CODE)
1342 	{
1343 		smtp = SeqMapTableFind(Seq_code_ncbi4na, Seq_code_iupacna);
1344 		sbp->ambiguous_res[sbp->ambig_occupy] = ncbi4na_to_blastna[SeqMapTableConvert(smtp, ambiguous_res)];
1345 	}
1346 	else if (sbp->alphabet_code == Seq_code_ncbi4na)
1347 	{
1348 		smtp = SeqMapTableFind(Seq_code_ncbi4na, Seq_code_iupacna);
1349 		sbp->ambiguous_res[sbp->ambig_occupy] = SeqMapTableConvert(smtp, ambiguous_res);
1350 	}
1351 
1352 	(sbp->ambig_occupy)++;
1353 
1354 
1355 	return 0;
1356 }
1357 
1358 /*
1359 	Deallocates all data associated with the BLAST_MatrixPtr.
1360 */
1361 BLAST_MatrixPtr LIBCALL
BLAST_MatrixDestruct(BLAST_MatrixPtr blast_matrix)1362 BLAST_MatrixDestruct(BLAST_MatrixPtr blast_matrix)
1363 
1364 {
1365     Int4 index;
1366 
1367     if (blast_matrix == NULL)
1368         return NULL;
1369 
1370     /* We may have 2 different matrices in there */
1371 
1372     if(blast_matrix->original_matrix &&
1373        blast_matrix->original_matrix != blast_matrix->matrix) {
1374 	/* blast_matrix->original_matrix is a square matrix with the
1375 	   same number of columns as blast_matrix->matrix.  Therefore
1376 	   blast_matrix->original_matrix has blast_matrix->columns
1377 	   rows. */
1378         for (index=0; index < blast_matrix->columns; index++) {
1379             MemFree(blast_matrix->original_matrix[index]);
1380         }
1381         MemFree(blast_matrix->original_matrix);
1382     }
1383 
1384     blast_matrix->name = MemFree(blast_matrix->name);
1385     if (blast_matrix->matrix) {
1386 	for (index=0; index<blast_matrix->rows; index++) {
1387 	    MemFree(blast_matrix->matrix[index]);
1388 	}
1389 	MemFree(blast_matrix->matrix);
1390     }
1391 
1392     if(blast_matrix->posFreqs != NULL) {
1393         for (index = 0; index < blast_matrix->rows; index++) {
1394             MemFree(blast_matrix->posFreqs[index]);
1395         }
1396         MemFree(blast_matrix->posFreqs);
1397     }
1398 
1399     MemFree(blast_matrix);
1400 
1401     return NULL;
1402 }
1403 
1404 
1405 /*
1406 	Allocates and fills the BLAST_MatrixPtr.
1407 	positionBased indicates that the posMatrix
1408 	on BLAST_ScoreBlkPtr should be used rather
1409 	than the 'normal' matrix.
1410 */
1411 BLAST_MatrixPtr LIBCALL
BLAST_MatrixFill(BLAST_ScoreBlkPtr sbp,Boolean positionBased)1412 BLAST_MatrixFill(BLAST_ScoreBlkPtr sbp, Boolean positionBased)
1413 
1414 {
1415     BLAST_MatrixPtr blast_matrix;
1416     FloatHi karlinK = 0.0;
1417     Int4 index1, index2, dim1, dim2;
1418     Int4Ptr PNTR matrix, PNTR original_matrix;
1419     Nlm_FloatHi **posFreqs = NULL;
1420 
1421     if (sbp == NULL)
1422         return NULL;
1423 
1424     blast_matrix = (BLAST_MatrixPtr) MemNew(sizeof(BLAST_Matrix));
1425 
1426     dim1 = sbp->alphabet_size;
1427     dim2 = sbp->alphabet_size;
1428     original_matrix = sbp->matrix;
1429 
1430     if (sbp->kbp_gap_psi[0])
1431         karlinK = sbp->kbp_gap_psi[0]->K;
1432 
1433     matrix = MemNew(dim1*sizeof(Int4Ptr));
1434     for (index1=0; index1<dim1; index1++) {
1435         matrix[index1] = MemNew(dim2*sizeof(Int4));
1436         for (index2=0; index2<dim2; index2++) {
1437             matrix[index1][index2] = original_matrix[index1][index2];
1438         }
1439     }
1440 
1441     blast_matrix->original_matrix = matrix;
1442 
1443     /* For PSI BLAST blast_matrix->matrix will be position based */
1444     if (sbp->posMatrix) {
1445         dim1 = sbp->query_length + 1;
1446         dim2 = sbp->alphabet_size;
1447         original_matrix = sbp->posMatrix;
1448         matrix = MemNew(dim1*sizeof(Int4Ptr));
1449         for (index1=0; index1<dim1; index1++) {
1450            matrix[index1] = MemNew(dim2*sizeof(Int4));
1451            for (index2=0; index2<dim2; index2++) {
1452               matrix[index1][index2] = original_matrix[index1][index2];
1453            }
1454         }
1455     }
1456     blast_matrix->matrix = matrix;
1457 
1458     /* Copying posFreqs to the BLAST_Matrix */
1459     if ((sbp->posFreqs != NULL) && (sbp->posMatrix != NULL)) {
1460         posFreqs = MemNew(dim1*sizeof(Nlm_FloatHi *));
1461         for (index1 = 0; index1 < dim1; index1++) {
1462             posFreqs[index1] = MemNew(dim2*sizeof(Nlm_FloatHi));
1463             for (index2=0; index2 < dim2; index2++) {
1464                 posFreqs[index1][index2] = sbp->posFreqs[index1][index2];
1465             }
1466         }
1467     }
1468 
1469     blast_matrix->is_prot = sbp->protein_alphabet;
1470     blast_matrix->name = StringSave(sbp->name);
1471     blast_matrix->rows = dim1;
1472     blast_matrix->columns = dim2;
1473     blast_matrix->matrix = matrix;
1474     blast_matrix->posFreqs = posFreqs;
1475     blast_matrix->karlinK = karlinK;
1476 
1477     return blast_matrix;
1478 }
1479 
1480 /*
1481 	Fill in the matrix for blastn using the penaly and rewards
1482 
1483 	The query sequence alphabet is blastna, the subject sequence
1484 	is ncbi2na.  The alphabet blastna is defined in blastkar.h
1485 	and the first four elements of blastna are identical to ncbi2na.
1486 
1487 	The query is in the first index, the subject is the second.
1488         if matrix==NULL, it is allocated and returned.
1489 */
1490 
BlastScoreBlkMatCreateEx(BLAST_ScorePtr PNTR matrix,BLAST_Score penalty,BLAST_Score reward)1491 BLAST_ScorePtr PNTR LIBCALL BlastScoreBlkMatCreateEx(BLAST_ScorePtr PNTR matrix,BLAST_Score penalty, BLAST_Score reward)
1492 {
1493 
1494 	Int2	index1, index2, degen;
1495 	Int2 degeneracy[BLASTNA_SIZE+1];
1496 
1497         if(!matrix) {
1498             BLASTMatrixStructurePtr matrix_struct;
1499             matrix_struct =BlastMatrixAllocate((Int2) BLASTNA_SIZE);
1500             matrix = matrix_struct->matrix;
1501         }
1502 	for (index1 = 0; index1<BLASTNA_SIZE; index1++) /* blastna */
1503 		for (index2 = 0; index2<BLASTNA_SIZE; index2++) /* blastna */
1504 			matrix[index1][index2] = 0;
1505 
1506 	/* In blastna the 1st four bases are A, C, G, and T, exactly as it is ncbi2na. */
1507 	/* ncbi4na gives them the value 1, 2, 4, and 8.  */
1508 
1509 	/* Set the first four bases to degen. one */
1510 	for (index1=0; index1<NUMBER_NON_AMBIG_BP; index1++)
1511 		degeneracy[index1] = 1;
1512 
1513 	for (index1=NUMBER_NON_AMBIG_BP; index1<BLASTNA_SIZE; index1++) /* blastna */
1514 	{
1515 		degen=0;
1516 		for (index2=0; index2<NUMBER_NON_AMBIG_BP; index2++) /* ncbi2na */
1517 		{
1518 			if (blastna_to_ncbi4na[index1] & blastna_to_ncbi4na[index2])
1519 				degen++;
1520 		}
1521 		degeneracy[index1] = degen;
1522 	}
1523 
1524 
1525 	for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
1526 	{
1527 		for (index2=index1; index2<BLASTNA_SIZE; index2++) /* blastna */
1528 		{
1529 			if (blastna_to_ncbi4na[index1] & blastna_to_ncbi4na[index2])
1530 			{ /* round up for positive scores, down for negatives. */
1531 				matrix[index1][index2] = Nlm_Nint( (double) ((degeneracy[index2]-1)*penalty + reward))/degeneracy[index2];
1532 				if (index1 != index2)
1533 				{
1534 				      matrix[index2][index1] = matrix[index1][index2];
1535 				}
1536 			}
1537 			else
1538 			{
1539 				matrix[index1][index2] = penalty;
1540 				matrix[index2][index1] = penalty;
1541 			}
1542 		}
1543 	}
1544 
1545         /* The value of 15 is a gap, which is a sentinel between strands in
1546            the ungapped extension algorithm */
1547 	for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
1548            matrix[BLASTNA_SIZE-1][index1] = INT4_MIN / 2;
1549 	for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
1550            matrix[index1][BLASTNA_SIZE-1] = INT4_MIN / 2;
1551 
1552 	return matrix;
1553 }
1554 /*
1555 	Fill in the matrix for blastn using the penaly and rewards on
1556 	the BLAST_ScoreBlkPtr.
1557 
1558 	The query sequence alphabet is blastna, the subject sequence
1559 	is ncbi2na.  The alphabet blastna is defined in blastkar.h
1560 	and the first four elements of blastna are identical to ncbi2na.
1561 
1562 	The query is in the first index, the subject is the second.
1563 */
1564 
1565 
BlastScoreBlkMatCreate(BLAST_ScoreBlkPtr sbp)1566 static Int2 BlastScoreBlkMatCreate(BLAST_ScoreBlkPtr sbp)
1567 {
1568 	Char buffer[25];
1569 
1570         sbp->matrix = BlastScoreBlkMatCreateEx(sbp->matrix,sbp->penalty, sbp->reward);
1571 	sbp->mat_dim1 = BLASTNA_SIZE;
1572 	sbp->mat_dim2 = BLASTNA_SIZE;
1573 
1574 	sprintf(buffer, "blastn matrix:%ld %ld", (long) sbp->reward, (long) sbp->penalty);
1575 	sbp->name = StringSave(buffer);
1576 
1577 	return 0;
1578 }
1579 
1580 
1581 
1582 
1583 
1584 /*
1585 	This function fills in the BLAST_ScoreBlk structure.
1586 	Tasks are:
1587 		-read in the matrix
1588 		-set maxscore
1589 */
1590 
1591 Int2 LIBCALL
BlastScoreBlkMatFill(BLAST_ScoreBlkPtr sbp,CharPtr matrix)1592 BlastScoreBlkMatFill(BLAST_ScoreBlkPtr sbp, CharPtr matrix)
1593 
1594 {
1595     Char string[PATH_MAX] = "", alphabet_type[3] = "";
1596     CharPtr matrix_dir = NULL;
1597     Int2 status = 0;
1598     FILE *fp = NULL;
1599 
1600     if (sbp->read_in_matrix) {
1601         /* Convert matrix name to upper case. */
1602         matrix = Nlm_StrUpper(matrix);
1603 
1604         sbp->name = StringSave(matrix);	/* Save the name of the matrix. */
1605 
1606         /* 1. Try local directory */
1607         if(FileLength(matrix) > 0)
1608             fp = FileOpen(matrix, "r");
1609 
1610         /* 2. Try configuration file */
1611         if (fp == NULL) {
1612            if (sbp->protein_alphabet)
1613               Nlm_StringNCpy(alphabet_type, "aa", 2);
1614            else
1615               Nlm_StringNCpy(alphabet_type, "nt", 2);
1616            alphabet_type[2] = NULLB;
1617 
1618            if(FindPath("ncbi", "ncbi", "data", string, PATH_MAX)) {
1619                matrix_dir = StringSave(string);
1620                sprintf(string, "%s%s", matrix_dir, matrix);
1621                if(FileLength(string) > 0) {
1622                   fp = FileOpen(string, "r");
1623                } else {
1624                   sprintf(string, "%s%s%s%s", matrix_dir,
1625                           alphabet_type, DIRDELIMSTR, matrix);
1626                   if(FileLength(string) > 0)
1627                      fp = FileOpen(string, "r");
1628                }
1629                matrix_dir = MemFree(matrix_dir);
1630             }
1631         }
1632         /* Trying to use local "data" directory */
1633 
1634         if(fp == NULL) {
1635             sprintf(string, "data%s%s", DIRDELIMSTR, matrix);
1636             if(FileLength(string) > 0)
1637                 fp = FileOpen(string, "r");
1638         }
1639 
1640         /* Get the matrix locations from the environment for UNIX. */
1641         if (fp == NULL) {
1642 
1643             matrix_dir = getenv("BLASTMAT");
1644             if (matrix_dir != NULL) {
1645                 sprintf(string, "%s%s%s%s%s", matrix_dir, DIRDELIMSTR,alphabet_type, DIRDELIMSTR, matrix);
1646             }
1647 
1648             if(FileLength(string) > 0)
1649                 fp = FileOpen(string, "r");
1650 
1651             /* Try again without "aa" or "nt" */
1652             if (fp == NULL) {
1653                 if (matrix_dir != NULL) {
1654                     sprintf(string, "%s%s%s", matrix_dir, DIRDELIMSTR, matrix);
1655                 }
1656 
1657                 if(FileLength(string) > 0)
1658                     fp = FileOpen(string, "r");
1659             }
1660         }
1661 
1662         if (fp == NULL) {
1663             ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", matrix);
1664             return 4;
1665         }
1666 
1667         if((status=BlastScoreBlkMatRead(sbp, fp)) != 0) {
1668             FileClose(fp);
1669             return status;
1670         }
1671         FileClose(fp);
1672     } else {
1673         if((status=BlastScoreBlkMatCreate(sbp)) != 0)
1674             return status;
1675     }
1676 
1677     if((status=BlastScoreBlkMaxScoreSet(sbp)) != 0)
1678         return status;
1679 
1680     return status;
1681 }
1682 
1683 /*
1684 	Return the specified matrix.  Do this by setting up the ScoreBlkPtr
1685 	and then fetching the matrix from disk.
1686 */
1687 
1688 BLAST_MatrixPtr LIBCALL
BLAST_MatrixFetch(CharPtr matrix_name)1689 BLAST_MatrixFetch(CharPtr matrix_name)
1690 
1691 {
1692 	BLAST_MatrixPtr matrix;
1693 	BLAST_ScoreBlkPtr sbp;
1694 
1695 	if (matrix_name == NULL)
1696 		return NULL;
1697 
1698 	sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa, 1);
1699 
1700 	/* Read in for protein. */
1701 	sbp->read_in_matrix = TRUE;
1702 
1703 	BlastScoreBlkMatFill(sbp, matrix_name);
1704 
1705 	matrix = BLAST_MatrixFill(sbp, FALSE);
1706 
1707 	sbp = BLAST_ScoreBlkDestruct(sbp);
1708 
1709 	return matrix;
1710 }
1711 
1712 
1713 /*
1714   Calculate the Karlin parameters.  This function should be called once
1715   for each context, or frame translated.
1716 
1717   The rfp and stdrfp are calculated for each context, this should be
1718   fixed.
1719 */
1720 
1721 Int2 LIBCALL
BlastScoreBlkFill(BLAST_ScoreBlkPtr sbp,CharPtr query,Int4 query_length,Int2 context_number)1722 BlastScoreBlkFill(BLAST_ScoreBlkPtr sbp, CharPtr query, Int4 query_length, Int2 context_number)
1723 {
1724 	BLAST_ResFreqPtr rfp, stdrfp;
1725 	Int2 retval=0;
1726 
1727 	rfp = BlastResFreqNew(sbp);
1728 	stdrfp = BlastResFreqNew(sbp);
1729 	BlastResFreqStdComp(sbp, stdrfp);
1730 	BlastResFreqString(sbp, rfp, query, query_length);
1731 	sbp->sfp[context_number] = BlastScoreFreqNew(sbp->loscore, sbp->hiscore);
1732 	BlastScoreFreqCalc(sbp, sbp->sfp[context_number], rfp, stdrfp);
1733 	sbp->kbp_std[context_number] = BlastKarlinBlkCreate();
1734 	retval = BlastKarlinBlkCalc(sbp->kbp_std[context_number], sbp->sfp[context_number]);
1735 	if (retval)
1736 	{
1737 		rfp = BlastResFreqDestruct(rfp);
1738 		stdrfp = BlastResFreqDestruct(stdrfp);
1739 		return retval;
1740 	}
1741 	sbp->kbp_psi[context_number] = BlastKarlinBlkCreate();
1742 	retval = BlastKarlinBlkCalc(sbp->kbp_psi[context_number], sbp->sfp[context_number]);
1743 	rfp = BlastResFreqDestruct(rfp);
1744 	stdrfp = BlastResFreqDestruct(stdrfp);
1745 
1746 	return retval;
1747 }
1748 
1749 /*
1750 	Calculates the standard Karlin parameters.  This is used
1751 	if the query is translated and the calculated (real) Karlin
1752 	parameters are bad, as they're calculated for non-coding regions.
1753 */
1754 
1755 BLAST_KarlinBlkPtr LIBCALL
BlastKarlinBlkStandardCalcEx(BLAST_ScoreBlkPtr sbp)1756 BlastKarlinBlkStandardCalcEx(BLAST_ScoreBlkPtr sbp)
1757 
1758 {
1759 	BLAST_KarlinBlkPtr kbp_ideal;
1760 	BLAST_ResFreqPtr stdrfp;
1761 	BLAST_ScoreFreqPtr sfp;
1762 
1763 	stdrfp = BlastResFreqNew(sbp);
1764 	BlastResFreqStdComp(sbp, stdrfp);
1765 	sfp = BlastScoreFreqNew(sbp->loscore, sbp->hiscore);
1766 	BlastScoreFreqCalc(sbp, sfp, stdrfp, stdrfp);
1767 	kbp_ideal = BlastKarlinBlkCreate();
1768 	BlastKarlinBlkCalc(kbp_ideal, sfp);
1769 	stdrfp = BlastResFreqDestruct(stdrfp);
1770 
1771 	sfp = BlastScoreFreqDestruct(sfp);
1772 
1773 	return kbp_ideal;
1774 }
1775 
1776 Int2 LIBCALL
BlastKarlinBlkStandardCalc(BLAST_ScoreBlkPtr sbp,Int2 context_start,Int2 context_end)1777 BlastKarlinBlkStandardCalc(BLAST_ScoreBlkPtr sbp, Int2 context_start, Int2 context_end)
1778 {
1779 
1780 	BLAST_KarlinBlkPtr kbp_ideal, kbp;
1781 	Int2 index;
1782 
1783 	kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
1784 /* Replace the calculated values with ideal ones for blastx, tblastx. */
1785 	for (index=context_start; index<=context_end; index++)
1786 	{
1787 		kbp = sbp->kbp[index];
1788 		if (kbp->Lambda >= kbp_ideal->Lambda)
1789 		{
1790 			kbp->Lambda = kbp_ideal->Lambda;
1791 			kbp->K = kbp_ideal->K;
1792 			kbp->logK = kbp_ideal->logK;
1793 			kbp->H = kbp_ideal->H;
1794 		}
1795 	}
1796 	kbp_ideal = BlastKarlinBlkDestruct(kbp_ideal);
1797 
1798 	return 0;
1799 }
1800 
1801 /*
1802 	Creates the Karlin Block.
1803 */
1804 
1805 BLAST_KarlinBlkPtr LIBCALL
BlastKarlinBlkCreate(void)1806 BlastKarlinBlkCreate(void)
1807 
1808 {
1809 	BLAST_KarlinBlkPtr kbp;
1810 
1811 	kbp = (BLAST_KarlinBlkPtr) MemNew(sizeof(BLAST_KarlinBlk));
1812 
1813 	return kbp;
1814 }
1815 
1816 /*
1817 	Deallocates the Karlin Block.
1818 */
1819 
1820 BLAST_KarlinBlkPtr LIBCALL
BlastKarlinBlkDestruct(BLAST_KarlinBlkPtr kbp)1821 BlastKarlinBlkDestruct(BLAST_KarlinBlkPtr kbp)
1822 
1823 {
1824 	kbp = MemFree(kbp);
1825 
1826 	return kbp;
1827 }
1828 
1829 
1830 /*
1831 	Read in the matrix from the FILE *fp.
1832 
1833 	This function ASSUMES that the matrices are in the ncbistdaa
1834 	format.  BLAST should be able to use any alphabet, though it
1835 	is expected that ncbistdaa will be used.
1836 */
1837 
1838 static Char ASCII_TO_BLASTNA_CONVERT[128]={
1839 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1840 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1841 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1842 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1843 15, 0,10, 1,11,15,15, 2,12,15,15, 7,15, 6,14,15,
1844 15,15, 4, 9, 3,15,13, 8,15, 5,15,15,15,15,15,15,
1845 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1846 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
1847 
1848 #define X_CODE 21
1849 #define U_CODE 24
1850 
1851 Int2 LIBCALL
BlastScoreBlkMatRead(BLAST_ScoreBlkPtr sbp,FILE * fp)1852 BlastScoreBlkMatRead(BLAST_ScoreBlkPtr sbp, FILE *fp)
1853 {
1854     Char	buf[512+3];
1855     Char	temp[512];
1856     CharPtr	cp, lp;
1857     Char		ch;
1858     BLAST_ScorePtr PNTR	matrix;
1859     BLAST_ScorePtr	m;
1860     BLAST_Score	score;
1861     Int2		a1cnt = 0, a2cnt = 0;
1862     Char    a1chars[BLAST_MAX_ALPHABET], a2chars[BLAST_MAX_ALPHABET];
1863     long	lineno = 0;
1864     Nlm_FloatHi	xscore;
1865     register int	index1, index2, total;
1866     SeqCodeTablePtr sctp;
1867     SeqMapTablePtr smtp=NULL;
1868     Int2 status;
1869     static TNlmMutex read_matrix_mutex;
1870 
1871     NlmMutexInit(&read_matrix_mutex);
1872     NlmMutexLock(read_matrix_mutex);
1873 
1874     matrix = sbp->matrix;
1875 
1876     if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
1877         sctp = SeqCodeTableFindObj(sbp->alphabet_code);
1878         if(sctp == NULL) {
1879             NlmMutexUnlock(read_matrix_mutex);
1880 	    return 1;
1881         }
1882 
1883         total = sctp->start_at + sctp->num;
1884         for (index1 = sctp->start_at; index1 < total; index1++)
1885             for (index2 = sctp->start_at; index2 < total; index2++)
1886                 matrix[index1][index2] = BLAST_SCORE_MIN;
1887 
1888         if (sbp->alphabet_code != Seq_code_ncbieaa) {
1889             smtp = SeqMapTableFind(sbp->alphabet_code, Seq_code_ncbieaa);
1890             if (smtp == NULL) {
1891                 NlmMutexUnlock(read_matrix_mutex);
1892                 return 1;
1893             }
1894         }
1895     } else {
1896         /* Fill-in all the defaults ambiguity and normal codes */
1897         status=BlastScoreBlkMatCreate(sbp);
1898         if(status != 0)
1899 	{
1900         	NlmMutexUnlock(read_matrix_mutex);
1901         	return status;
1902 	}
1903     }
1904 
1905     /* Read the residue names for the second alphabet */
1906     while (Nlm_FileGets(buf, sizeof(buf), fp) != NULL) {
1907         ++lineno;
1908         if (Nlm_StrChr(buf, '\n') == NULL) {
1909             NlmMutexUnlock(read_matrix_mutex);
1910             return 2;
1911         }
1912 
1913         if (buf[0] == COMMENT_CHR) {
1914             /* save the comment line in a linked list */
1915             *Nlm_StrChr(buf, '\n') = NULLB;
1916             ValNodeCopyStr(&sbp->comments, 0, buf+1);
1917             continue;
1918         }
1919         if ((cp = Nlm_StrChr(buf, COMMENT_CHR)) != NULL)
1920             *cp = NULLB;
1921         lp = (CharPtr)Nlm_StrTok(buf, TOKSTR);
1922         if (lp == NULL) /* skip blank lines */
1923             continue;
1924         while (lp != NULL) {
1925 
1926             if (smtp)
1927                 ch = SeqMapTableConvert(smtp,  *lp);
1928             else {
1929                 if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
1930                     ch = *lp;
1931                 } else {
1932                     ch = ASCII_TO_BLASTNA_CONVERT[toupper(*lp)];
1933                 }
1934             }
1935             a2chars[a2cnt++] = ch;
1936             lp = (CharPtr)Nlm_StrTok(NULL, TOKSTR);
1937         }
1938 
1939         break;	/* Exit loop after reading one line. */
1940     }
1941 
1942     if (a2cnt <= 1) {
1943         NlmMutexUnlock(read_matrix_mutex);
1944         return 2;
1945     }
1946 
1947     if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
1948         sbp->mat_dim2 = a2cnt;
1949     }
1950     while (Nlm_FileGets(buf, sizeof(buf), fp) != NULL)  {
1951         ++lineno;
1952         if ((cp = Nlm_StrChr(buf, '\n')) == NULL) {
1953             NlmMutexUnlock(read_matrix_mutex);
1954             return 2;
1955         }
1956         if ((cp = Nlm_StrChr(buf, COMMENT_CHR)) != NULL)
1957             *cp = NULLB;
1958         if ((lp = (CharPtr)Nlm_StrTok(buf, TOKSTR)) == NULL)
1959             continue;
1960         ch = *lp;
1961         cp = (CharPtr) lp;
1962         if ((cp = Nlm_StrTok(NULL, TOKSTR)) == NULL) {
1963             NlmMutexUnlock(read_matrix_mutex);
1964             return 2;
1965         }
1966         if (a1cnt >= DIM(a1chars)) {
1967             NlmMutexUnlock(read_matrix_mutex);
1968             return 2;
1969         }
1970 
1971         if (smtp) {
1972             ch = SeqMapTableConvert(smtp,  ch);
1973 
1974         } else {
1975             if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
1976                 ch = ASCII_TO_BLASTNA_CONVERT[toupper(ch)];
1977             }
1978         }
1979         a1chars[a1cnt++] = ch;
1980         m = &matrix[(int)ch][0];
1981         index2 = 0;
1982         while (cp != NULL) {
1983             if (index2 >= a2cnt) {
1984                 NlmMutexUnlock(read_matrix_mutex);
1985                 return 2;
1986             }
1987             Nlm_StrCpy(temp, cp);
1988 
1989             if (Nlm_StrICmp(temp, "na") == 0)  {
1990                 score = BLAST_SCORE_1MIN;
1991             } else  {
1992                 if (sscanf(temp, "%lg", &xscore) != 1) {
1993                     NlmMutexUnlock(read_matrix_mutex);
1994                     return 2;
1995                 }
1996 				/*xscore = MAX(xscore, BLAST_SCORE_1MIN);*/
1997                 if (xscore > BLAST_SCORE_1MAX || xscore < BLAST_SCORE_1MIN) {
1998                     NlmMutexUnlock(read_matrix_mutex);
1999                     return 2;
2000                 }
2001                 xscore += (xscore >= 0. ? 0.5 : -0.5);
2002                 score = (BLAST_Score)xscore;
2003             }
2004 
2005             m[(int)a2chars[index2++]] = score;
2006 
2007             cp = Nlm_StrTok(NULL, TOKSTR);
2008         }
2009     }
2010 
2011     if (a1cnt <= 1) {
2012         NlmMutexUnlock(read_matrix_mutex);
2013         return 2;
2014     }
2015 
2016     if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
2017         sbp->mat_dim1 = a1cnt;
2018 
2019         /* For protein matrices, copy the X scores to the
2020            U scores. Keeping the U scores as BLAST_SCORE_MIN
2021            means that U never aligns with another letter,
2022            even another U */
2023         for (index1 = 0; index1 < a2cnt; index1++) {
2024             matrix[U_CODE][index1] = matrix[X_CODE][index1];
2025             matrix[index1][U_CODE] = matrix[index1][X_CODE];
2026         }
2027     }
2028 
2029     NlmMutexUnlock(read_matrix_mutex);
2030     return 0;
2031 }
2032 
2033 Int2 LIBCALL
BlastScoreBlkMaxScoreSet(BLAST_ScoreBlkPtr sbp)2034 BlastScoreBlkMaxScoreSet(BLAST_ScoreBlkPtr sbp)
2035 {
2036 	BLAST_Score score, maxscore;
2037 	BLAST_ScorePtr PNTR  matrix;
2038 	Int2 index1, index2;
2039 
2040 	sbp->loscore = BLAST_SCORE_1MAX;
2041         sbp->hiscore = BLAST_SCORE_1MIN;
2042 	matrix = sbp->matrix;
2043 	for (index1=0; index1<sbp->alphabet_size; index1++)
2044 	{
2045 		maxscore=BLAST_SCORE_MIN;
2046 		for (index2=0; index2<sbp->alphabet_size; index2++)
2047 		{
2048 			score = matrix[index1][index2];
2049 			if (score <= BLAST_SCORE_MIN || score >= BLAST_SCORE_MAX)
2050 				continue;
2051 			if (score > maxscore)
2052 			{
2053 				maxscore = score;
2054 			}
2055 			if (sbp->loscore > score)
2056 				sbp->loscore = score;
2057 			if (sbp->hiscore < score)
2058 				sbp->hiscore = score;
2059 		}
2060 		sbp->maxscore[index1] = maxscore;
2061 	}
2062 /* If the lo/hi-scores are BLAST_SCORE_MIN/BLAST_SCORE_MAX, (i.e., for
2063 gaps), then use other scores. */
2064 
2065 	if (sbp->loscore < BLAST_SCORE_1MIN)
2066 		sbp->loscore = BLAST_SCORE_1MIN;
2067 	if (sbp->hiscore > BLAST_SCORE_1MAX)
2068 		sbp->hiscore = BLAST_SCORE_1MAX;
2069 
2070 	return 0;
2071 }
2072 
2073 /* maxscore for PSI Blast depends not on the residue, but on the position
2074    in the posMatrix, so maxscore array has size of the length of posMatrix */
2075 /* SSH */
BlastPSIMaxScoreGet(BLAST_ScorePtr PNTR posMatrix,Int4 start,Int4 length)2076 BLAST_ScorePtr BlastPSIMaxScoreGet(BLAST_ScorePtr PNTR posMatrix,
2077                                    Int4 start, Int4 length)
2078 {
2079     BLAST_Score score, maxscore;
2080     BLAST_ScorePtr maxscore_pos;
2081     Int4 index1, index2;
2082 
2083     if(posMatrix == NULL)
2084         return NULL;
2085 
2086     maxscore_pos = MemNew(length * sizeof(BLAST_Score));
2087 
2088     for (index1 = start; index1 < length; index1++) {
2089         maxscore = BLAST_SCORE_MIN;
2090         for (index2 = 0; index2 < PSI_ALPHABET_SIZE; index2++) {
2091             score = posMatrix[index1][index2];
2092             if (score <= BLAST_SCORE_MIN || score >= BLAST_SCORE_MAX)
2093                 continue;
2094             if (score > maxscore) {
2095                 maxscore = score;
2096             }
2097         }
2098 
2099         maxscore_pos[index1] = maxscore;
2100     }
2101     return maxscore_pos;
2102 }
2103 
2104 static BLASTMatrixStructurePtr
BlastMatrixAllocate(Int2 alphabet_size)2105 BlastMatrixAllocate(Int2 alphabet_size)
2106 
2107 {
2108 	BLASTMatrixStructurePtr matrix_struct;
2109 	Int2 index;
2110 
2111 	if (alphabet_size <= 0 || alphabet_size >= BLAST_MATRIX_SIZE)
2112 		return NULL;
2113 
2114 	matrix_struct =	(BLASTMatrixStructurePtr) MemNew(sizeof(BLASTMatrixStructure));
2115 
2116 	if (matrix_struct == NULL)
2117 		return NULL;
2118 
2119 	for (index=0; index<BLAST_MATRIX_SIZE-1; index++)
2120 	{
2121 		matrix_struct->matrix[index] = matrix_struct->long_matrix + index*BLAST_MATRIX_SIZE;
2122 	}
2123 
2124 	return matrix_struct;
2125 }
2126 
2127 static BLASTMatrixStructurePtr
BlastMatrixDestruct(BLASTMatrixStructurePtr matrix_struct)2128 BlastMatrixDestruct(BLASTMatrixStructurePtr matrix_struct)
2129 
2130 {
2131 
2132 	if (matrix_struct == NULL)
2133 		return NULL;
2134 
2135 	matrix_struct = MemFree(matrix_struct);
2136 
2137 	return matrix_struct;
2138 }
2139 
2140 /*
2141 	Allocated the BLAST_ResCompPtr for a given alphabet.  Only the
2142 	alphabets ncbistdaa and ncbi4na should be used by BLAST.
2143 */
2144 
2145 BLAST_ResCompPtr LIBCALL
BlastResCompNew(BLAST_ScoreBlkPtr sbp)2146 BlastResCompNew(BLAST_ScoreBlkPtr sbp)
2147 {
2148 	BLAST_ResCompPtr	rcp;
2149 
2150 	rcp = (BLAST_ResCompPtr) MemNew(sizeof(BLAST_ResComp));
2151 	if (rcp == NULL)
2152 		return NULL;
2153 
2154 	rcp->alphabet_code = sbp->alphabet_code;
2155 
2156 /* comp0 has zero offset, comp starts at sctp->start_at, only one
2157 array is allocated.  */
2158 	rcp->comp0 = (Int4Ptr) MemNew(BLAST_MATRIX_SIZE*sizeof(Int4));
2159 	if (rcp->comp0 == NULL)
2160 	{
2161 		rcp = BlastResCompDestruct(rcp);
2162 		return rcp;
2163 	}
2164 
2165 	rcp->comp = rcp->comp0 - sbp->alphabet_start;
2166 
2167 	return rcp;
2168 }
2169 
2170 
2171 BLAST_ResCompPtr LIBCALL
BlastResCompDestruct(BLAST_ResCompPtr rcp)2172 BlastResCompDestruct(BLAST_ResCompPtr rcp)
2173 {
2174 	if (rcp == NULL)
2175 		return NULL;
2176 
2177 	if (rcp->comp0 != NULL)
2178 		rcp->comp0 = MemFree(rcp->comp0);
2179 
2180 	return MemFree(rcp);
2181 }
2182 
2183 /*
2184 	Store the composition of a (query) string.
2185 */
2186 Int2 LIBCALL
BlastResCompStr(BLAST_ScoreBlkPtr sbp,BLAST_ResCompPtr rcp,CharPtr str,Int4 length)2187 BlastResCompStr(BLAST_ScoreBlkPtr sbp, BLAST_ResCompPtr rcp, CharPtr str, Int4 length)
2188 {
2189 	CharPtr	lp, lpmax;
2190 	Int2 index;
2191         Uint1 mask;
2192 
2193 	if (sbp == NULL || rcp == NULL || str == NULL)
2194 		return 1;
2195 
2196 	if (rcp->alphabet_code != sbp->alphabet_code)
2197 		return 1;
2198 
2199         /* For megablast, check only the first 4 bits of the sequence values */
2200         mask = (sbp->protein_alphabet ? 0xff : 0x0f);
2201 
2202 /* comp0 starts at zero and extends for "num", comp is the same array, but
2203 "start_at" offset. */
2204 	for (index=0; index<(sbp->alphabet_size); index++)
2205 		rcp->comp0[index] = 0;
2206 
2207 	for (lp = str, lpmax = lp+length; lp < lpmax; lp++)
2208 	{
2209 		++rcp->comp[(int)(*lp & mask)];
2210 	}
2211 
2212 	/* Don't count ambig. residues. */
2213 	for (index=0; index<sbp->ambig_occupy; index++)
2214 	{
2215 		rcp->comp[sbp->ambiguous_res[index]] = 0;
2216 	}
2217 
2218 	return 0;
2219 }
2220 
2221 static Int2
BlastScoreChk(BLAST_Score lo,BLAST_Score hi)2222 BlastScoreChk(BLAST_Score lo, BLAST_Score hi)
2223 {
2224 	if (lo >= 0 || hi <= 0 ||
2225 			lo < BLAST_SCORE_1MIN || hi > BLAST_SCORE_1MAX)
2226 		return 1;
2227 
2228 	if (hi - lo > BLAST_SCORE_RANGE_MAX)
2229 		return 1;
2230 
2231 	return 0;
2232 }
2233 
2234 BLAST_ScoreFreqPtr
BlastScoreFreqNew(BLAST_Score score_min,BLAST_Score score_max)2235 BlastScoreFreqNew(BLAST_Score score_min, BLAST_Score score_max)
2236 {
2237 	BLAST_ScoreFreqPtr	sfp;
2238 	BLAST_Score	range;
2239 
2240 	if (BlastScoreChk(score_min, score_max) != 0)
2241 		return NULL;
2242 
2243 	sfp = (BLAST_ScoreFreqPtr) MemNew(sizeof(BLAST_ScoreFreq));
2244 	if (sfp == NULL)
2245 		return NULL;
2246 
2247 	range = score_max - score_min + 1;
2248 	sfp->sprob = (Nlm_FloatHi PNTR) MemNew(sizeof(Nlm_FloatHi) * range);
2249 	if (sfp->sprob == NULL)
2250 	{
2251 		BlastScoreFreqDestruct(sfp);
2252 		return NULL;
2253 	}
2254 
2255 	sfp->sprob0 = sfp->sprob;
2256 	sfp->sprob -= score_min;
2257 	sfp->score_min = score_min;
2258 	sfp->score_max = score_max;
2259 	sfp->obs_min = sfp->obs_max = 0;
2260 	sfp->score_avg = 0.0;
2261 	return sfp;
2262 }
2263 
2264 BLAST_ScoreFreqPtr
BlastScoreFreqDestruct(BLAST_ScoreFreqPtr sfp)2265 BlastScoreFreqDestruct(BLAST_ScoreFreqPtr sfp)
2266 {
2267 	if (sfp == NULL)
2268 		return NULL;
2269 
2270 	if (sfp->sprob0 != NULL)
2271 		sfp->sprob0 = MemFree(sfp->sprob0);
2272 	sfp = MemFree(sfp);
2273 	return sfp;
2274 }
2275 
2276 static Int2
BlastScoreFreqCalc(BLAST_ScoreBlkPtr sbp,BLAST_ScoreFreqPtr sfp,BLAST_ResFreqPtr rfp1,BLAST_ResFreqPtr rfp2)2277 BlastScoreFreqCalc(BLAST_ScoreBlkPtr sbp, BLAST_ScoreFreqPtr sfp, BLAST_ResFreqPtr rfp1, BLAST_ResFreqPtr rfp2)
2278 {
2279 	BLAST_ScorePtr PNTR	matrix;
2280 	BLAST_Score	score, obs_min, obs_max;
2281 	Nlm_FloatHi		score_sum, score_avg;
2282 	Int2 		alphabet_start, alphabet_end, index1, index2;
2283 
2284 	if (sbp == NULL || sfp == NULL)
2285 		return 1;
2286 
2287 	if (sbp->loscore < sfp->score_min || sbp->hiscore > sfp->score_max)
2288 		return 1;
2289 
2290 	for (score = sfp->score_min; score <= sfp->score_max; score++)
2291 		sfp->sprob[score] = 0.0;
2292 
2293 	matrix = sbp->matrix;
2294 
2295 	alphabet_start = sbp->alphabet_start;
2296 	alphabet_end = alphabet_start + sbp->alphabet_size;
2297 	for (index1=alphabet_start; index1<alphabet_end; index1++)
2298 	{
2299 		for (index2=alphabet_start; index2<alphabet_end; index2++)
2300 		{
2301 			score = matrix[index1][index2];
2302 			if (score >= sbp->loscore)
2303 			{
2304 				sfp->sprob[score] += rfp1->prob[index1] * rfp2->prob[index2];
2305 			}
2306 		}
2307 	}
2308 
2309 	score_sum = 0.;
2310 	obs_min = obs_max = BLAST_SCORE_MIN;
2311 	for (score = sfp->score_min; score <= sfp->score_max; score++)
2312 	{
2313 		if (sfp->sprob[score] > 0.)
2314 		{
2315 			score_sum += sfp->sprob[score];
2316 			obs_max = score;
2317 			if (obs_min == BLAST_SCORE_MIN)
2318 				obs_min = score;
2319 		}
2320 	}
2321 	sfp->obs_min = obs_min;
2322 	sfp->obs_max = obs_max;
2323 
2324 	score_avg = 0.0;
2325 	if (score_sum > 0.0001 || score_sum < -0.0001)
2326 	{
2327 		for (score = obs_min; score <= obs_max; score++)
2328 		{
2329 			sfp->sprob[score] /= score_sum;
2330 			score_avg += score * sfp->sprob[score];
2331 		}
2332 	}
2333 	sfp->score_avg = score_avg;
2334 
2335 	return 0;
2336 }
2337 
2338 typedef struct {
2339 		Char	ch;
2340 		Nlm_FloatHi	p;
2341 	} BLAST_LetterProb;
2342 
2343 #define STD_AMINO_ACID_FREQS Robinson_prob
2344 
2345 #if STD_AMINO_ACID_FREQS == Dayhoff_prob
2346 /*  M. O. Dayhoff amino acid background frequencies   */
2347 static BLAST_LetterProb	Dayhoff_prob[] = {
2348 		{ 'A', 87.13 },
2349 		{ 'C', 33.47 },
2350 		{ 'D', 46.87 },
2351 		{ 'E', 49.53 },
2352 		{ 'F', 39.77 },
2353 		{ 'G', 88.61 },
2354 		{ 'H', 33.62 },
2355 		{ 'I', 36.89 },
2356 		{ 'K', 80.48 },
2357 		{ 'L', 85.36 },
2358 		{ 'M', 14.75 },
2359 		{ 'N', 40.43 },
2360 		{ 'P', 50.68 },
2361 		{ 'Q', 38.26 },
2362 		{ 'R', 40.90 },
2363 		{ 'S', 69.58 },
2364 		{ 'T', 58.54 },
2365 		{ 'V', 64.72 },
2366 		{ 'W', 10.49 },
2367 		{ 'Y', 29.92 }
2368 	};
2369 #endif
2370 
2371 #if STD_AMINO_ACID_FREQS == Altschul_prob
2372 /* Stephen Altschul amino acid background frequencies */
2373 static BLAST_LetterProb Altschul_prob[] = {
2374 		{ 'A', 81.00 },
2375 		{ 'C', 15.00 },
2376 		{ 'D', 54.00 },
2377 		{ 'E', 61.00 },
2378 		{ 'F', 40.00 },
2379 		{ 'G', 68.00 },
2380 		{ 'H', 22.00 },
2381 		{ 'I', 57.00 },
2382 		{ 'K', 56.00 },
2383 		{ 'L', 93.00 },
2384 		{ 'M', 25.00 },
2385 		{ 'N', 45.00 },
2386 		{ 'P', 49.00 },
2387 		{ 'Q', 39.00 },
2388 		{ 'R', 57.00 },
2389 		{ 'S', 68.00 },
2390 		{ 'T', 58.00 },
2391 		{ 'V', 67.00 },
2392 		{ 'W', 13.00 },
2393 		{ 'Y', 32.00 }
2394 	};
2395 #endif
2396 
2397 #if STD_AMINO_ACID_FREQS == Robinson_prob
2398 /* amino acid background frequencies from Robinson and Robinson */
2399 static BLAST_LetterProb Robinson_prob[] = {
2400 		{ 'A', 78.05 },
2401 		{ 'C', 19.25 },
2402 		{ 'D', 53.64 },
2403 		{ 'E', 62.95 },
2404 		{ 'F', 38.56 },
2405 		{ 'G', 73.77 },
2406 		{ 'H', 21.99 },
2407 		{ 'I', 51.42 },
2408 		{ 'K', 57.44 },
2409 		{ 'L', 90.19 },
2410 		{ 'M', 22.43 },
2411 		{ 'N', 44.87 },
2412 		{ 'P', 52.03 },
2413 		{ 'Q', 42.64 },
2414 		{ 'R', 51.29 },
2415 		{ 'S', 71.20 },
2416 		{ 'T', 58.41 },
2417 		{ 'V', 64.41 },
2418 		{ 'W', 13.30 },
2419 		{ 'Y', 32.16 }
2420 	};
2421 #endif
2422 
2423 static BLAST_LetterProb	nt_prob[] = {
2424 		{ 'A', 25.00 },
2425 		{ 'C', 25.00 },
2426 		{ 'G', 25.00 },
2427 		{ 'T', 25.00 }
2428 	};
2429 
2430 /*
2431 	Allocates the BLAST_ResFreqPtr and then fills in the frequencies
2432 	in the probabilities.
2433 */
2434 
2435 BLAST_ResFreqPtr LIBCALL
BlastResFreqNew(BLAST_ScoreBlkPtr sbp)2436 BlastResFreqNew(BLAST_ScoreBlkPtr sbp)
2437 {
2438 	BLAST_ResFreqPtr	rfp;
2439 
2440 	if (sbp == NULL)
2441 	{
2442 		return NULL;
2443 	}
2444 
2445 	rfp = (BLAST_ResFreqPtr) MemNew(sizeof(BLAST_ResFreq));
2446 	if (rfp == NULL)
2447 		return NULL;
2448 
2449 	rfp->alphabet_code = sbp->alphabet_code;
2450 
2451 	rfp->prob0 = (Nlm_FloatHi PNTR) MemNew(sizeof(Nlm_FloatHi) * sbp->alphabet_size);
2452 	if (rfp->prob0 == NULL)
2453 	{
2454 		rfp = BlastResFreqDestruct(rfp);
2455 		return rfp;
2456 	}
2457 	rfp->prob = rfp->prob0 - sbp->alphabet_start;
2458 
2459 	return rfp;
2460 }
2461 
BlastResFreqFree(BLAST_ResFreqPtr rfp)2462 void LIBCALL BlastResFreqFree(BLAST_ResFreqPtr rfp)
2463 {
2464     MemFree(rfp->prob0);
2465     MemFree(rfp);
2466 
2467     return;
2468 }
2469 
2470 /*
2471 	Normalize the frequencies to "norm".
2472 */
2473 Int2 LIBCALL
BlastResFreqNormalize(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp,Nlm_FloatHi norm)2474 BlastResFreqNormalize(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp, Nlm_FloatHi norm)
2475 {
2476 	Int2	alphabet_stop, index;
2477 	Nlm_FloatHi	sum = 0., p;
2478 
2479 	if (norm == 0.)
2480 		return 1;
2481 
2482 	alphabet_stop = sbp->alphabet_start + sbp->alphabet_size;
2483 	for (index=sbp->alphabet_start; index<alphabet_stop; index++)
2484 	{
2485 		p = rfp->prob[index];
2486 		if (p < 0.)
2487 			return 1;
2488 		sum += p;
2489 	}
2490 	if (sum <= 0.)
2491 		return 0;
2492 
2493 	for (index=sbp->alphabet_start; index<alphabet_stop; index++)
2494 	{
2495 		rfp->prob[index] /= sum;
2496 		rfp->prob[index] *= norm;
2497 	}
2498 	return 0;
2499 }
2500 
2501 /*
2502 	Fills a buffer with the 'standard' alphabet (given by
2503 	STD_AMINO_ACID_FREQS[index].ch).
2504 
2505 	Return value is the number of residues in alphabet.
2506 	Negative returns upon error.
2507 */
2508 
2509 Int2 LIBCALL
BlastGetStdAlphabet(Uint1 alphabet_code,Uint1Ptr residues,Int4 residues_size)2510 BlastGetStdAlphabet (Uint1 alphabet_code, Uint1Ptr residues, Int4 residues_size)
2511 
2512 {
2513 	Int2 index;
2514 	SeqMapTablePtr smtp=NULL;
2515 
2516 	if (residues_size < DIM(STD_AMINO_ACID_FREQS))
2517 		return -2;
2518 
2519 	if (alphabet_code != Seq_code_ncbieaa)
2520 	{
2521 		smtp = SeqMapTableFind(alphabet_code, Seq_code_ncbieaa);
2522 		if (smtp == NULL)
2523 		{
2524 			return -1;
2525 		}
2526 	}
2527 
2528 	for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2529 	{
2530 		if (smtp)
2531 		{
2532 	 		residues[index] = SeqMapTableConvert(smtp,  STD_AMINO_ACID_FREQS[index].ch);
2533 		}
2534 		else
2535 		{
2536 			residues[index] = STD_AMINO_ACID_FREQS[index].ch;
2537 		}
2538 	}
2539 
2540 	return index;
2541 }
2542 
2543 Int2 LIBCALL
BlastResFreqStdComp(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp)2544 BlastResFreqStdComp(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp)
2545 {
2546 	Int2 index, retval;
2547 	Uint1Ptr residues;
2548 
2549 	if (sbp->protein_alphabet == TRUE)
2550 	{
2551 		residues = (Uint1Ptr) MemNew(DIM(STD_AMINO_ACID_FREQS)*sizeof(Uint1));
2552 		retval = BlastGetStdAlphabet(sbp->alphabet_code, residues, DIM(STD_AMINO_ACID_FREQS));
2553 		if (retval < 1)
2554 			return retval;
2555 
2556 		for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2557 		{
2558 			rfp->prob[residues[index]] = STD_AMINO_ACID_FREQS[index].p;
2559 		}
2560 		residues = MemFree(residues);
2561 	}
2562 	else
2563 	{	/* beginning of blastna and ncbi2na are the same. */
2564 		/* Only blastna used  for nucleotides. */
2565 		for (index=0; index<DIM(nt_prob); index++)
2566 		{
2567 			rfp->prob[index] = nt_prob[index].p;
2568 		}
2569 	}
2570 
2571 	BlastResFreqNormalize(sbp, rfp, 1.0);
2572 
2573 	return 0;
2574 }
2575 
2576 CharPtr  LIBCALL
BlastRepresentativeResidues(Int2 length)2577 BlastRepresentativeResidues(Int2 length)
2578 
2579 {
2580 	CharPtr buffer, ptr;
2581 	Int2 index, total;
2582 	Int4 number;
2583 	total=0;
2584 
2585 	for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2586 	{
2587 		total += (Int2) STD_AMINO_ACID_FREQS[index].p;
2588 	}
2589 
2590 	buffer = (CharPtr) MemNew((length+1)*sizeof(Char));
2591 
2592 	ptr = buffer;
2593 	for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2594 	{
2595 		number = Nint((STD_AMINO_ACID_FREQS[index].p)*((Nlm_FloatHi) length)/((Nlm_FloatHi) total));
2596 		while (number > 0)
2597 		{
2598 			*ptr = STD_AMINO_ACID_FREQS[index].ch;
2599 			ptr++;
2600 			number--;
2601 		}
2602 	}
2603 
2604 	return buffer;
2605 }
2606 
2607 Int2 LIBCALL
BlastResFreqString(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp,CharPtr string,Int4 length)2608 BlastResFreqString(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp, CharPtr string, Int4 length)
2609 {
2610 	BLAST_ResCompPtr rcp;
2611 
2612 	rcp = BlastResCompNew(sbp);
2613 
2614 	BlastResCompStr(sbp, rcp, string, length);
2615 
2616 	BlastResFreqResComp(sbp, rfp, rcp);
2617 
2618 	rcp = BlastResCompDestruct(rcp);
2619 
2620 	return 0;
2621 }
2622 
2623 BLAST_ResFreqPtr LIBCALL
BlastResFreqDestruct(BLAST_ResFreqPtr rfp)2624 BlastResFreqDestruct(BLAST_ResFreqPtr rfp)
2625 {
2626 	if (rfp == NULL)
2627 		return NULL;
2628 
2629 	if (rfp->prob0 != NULL)
2630 		MemFree(rfp->prob0);
2631 
2632 	rfp = MemFree(rfp);
2633 
2634 	return rfp;
2635 }
2636 
2637 /*
2638 	Calculate the residue frequencies associated with the provided ResComp
2639 */
2640 Int2 LIBCALL
BlastResFreqResComp(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp,BLAST_ResCompPtr rcp)2641 BlastResFreqResComp(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp, BLAST_ResCompPtr rcp)
2642 {
2643 	Int2	alphabet_max, index;
2644 	Nlm_FloatHi	sum = 0.;
2645 
2646 	if (rfp == NULL || rcp == NULL)
2647 		return 1;
2648 
2649 	if (rfp->alphabet_code != rcp->alphabet_code)
2650 		return 1;
2651 
2652 	alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
2653 	for (index=sbp->alphabet_start; index<alphabet_max; index++)
2654 		sum += rcp->comp[index];
2655 
2656 	if (sum == 0.) {
2657 		BlastResFreqClr(sbp, rfp);
2658 		return 0;
2659 	}
2660 
2661 	for (index=sbp->alphabet_start; index<alphabet_max; index++)
2662 		rfp->prob[index] = rcp->comp[index] / sum;
2663 
2664 	return 0;
2665 }
2666 
2667 Int2 LIBCALL
BlastResFreqClr(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp)2668 BlastResFreqClr(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp)
2669 {
2670 	Int2	alphabet_max, index;
2671 
2672 	alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
2673 	for (index=sbp->alphabet_start; index<alphabet_max; index++)
2674                 rfp->prob[index] = 0.0;
2675 
2676         return 0;
2677 }
2678 
2679 /** Supported substitution and gap costs with corresponding quality values
2680  * for nucleotide sequence comparisons.
2681  * NB: the values 0 and 0 for the gap costs are treated as the defaults used for
2682  * the greedy gapped extension, i.e.
2683  * gap opening = 0,
2684  * gap extension = 1/2 match - mismatch.
2685  *
2686  * The fields are:
2687  *
2688  * 1. Gap opening cost,
2689  * 2. Gap extension cost,
2690  * 3. Lambda,
2691  * 4. K,
2692  * 5. H,
2693  * 6. Alpha,
2694  * 7. Beta,
2695  * 8. Theta
2696  */
2697 
2698 /** Karlin-Altschul parameter values for substitution scores 1 and -5. */
2699 static const array_of_8 blastn_values_1_5[] = {
2700     { 0, 0, 1.39, 0.747, 1.38, 1.00,  0, 100 },
2701     { 3, 3, 1.39, 0.747, 1.38, 1.00,  0, 100 }
2702 };
2703 
2704 /** Karlin-Altschul parameter values for substitution scores 1 and -4. */
2705 static const array_of_8 blastn_values_1_4[] = {
2706     { 0, 0, 1.383, 0.738, 1.36, 1.02,  0, 100 },
2707     { 1, 2,  1.36,  0.67,  1.2,  1.1,  0,  98 },
2708     { 0, 2,  1.26,  0.43, 0.90,  1.4, -1,  91 },
2709     { 2, 1,  1.35,  0.61,  1.1,  1.2, -1,  98 },
2710     { 1, 1,  1.22,  0.35, 0.72,  1.7, -3,  88 }
2711 };
2712 
2713 /** Karlin-Altschul parameter values for substitution scores 2 and -7.
2714  * These parameters can only be applied to even scores. Any odd score must be
2715  * rounded down to the nearest even number before calculating the e-value.
2716  */
2717 static const array_of_8 blastn_values_2_7[] = {
2718     { 0, 0,  0.69, 0.73, 1.34, 0.515,  0, 100 },
2719     { 2, 4,  0.68, 0.67,  1.2,  0.55,  0,  99 },
2720     { 0, 4,  0.63, 0.43, 0.90,   0.7, -1,  91 },
2721     { 4, 2, 0.675, 0.62,  1.1,   0.6, -1,  98 },
2722     { 2, 2,  0.61, 0.35, 0.72,   1.7, -3,  88 }
2723 };
2724 
2725 /** Karlin-Altschul parameter values for substitution scores 1 and -3. */
2726 static const array_of_8 blastn_values_1_3[] = {
2727     { 0, 0, 1.374, 0.711, 1.31, 1.05,  0, 100 },
2728     { 2, 2,  1.37,  0.70,  1.2,  1.1,  0,  99 },
2729     { 1, 2,  1.35,  0.64,  1.1,  1.2, -1,  98 },
2730     { 0, 2,  1.25,  0.42, 0.83,  1.5, -2,  91 },
2731     { 2, 1,  1.34,  0.60,  1.1,  1.2, -1,  97 },
2732     { 1, 1,  1.21,  0.34, 0.71,  1.7, -2,  88 }
2733 };
2734 
2735 /** Karlin-Altschul parameter values for substitution scores 2 and -5.
2736  * These parameters can only be applied to even scores. Any odd score must be
2737  * rounded down to the nearest even number before calculating the e-value.
2738  */
2739 static const array_of_8 blastn_values_2_5[] = {
2740     { 0, 0, 0.675, 0.65,  1.1,  0.6, -1, 99 },
2741     { 2, 4,  0.67, 0.59,  1.1,  0.6, -1, 98 },
2742     { 0, 4,  0.62, 0.39, 0.78,  0.8, -2, 91 },
2743     { 4, 2,  0.67, 0.61,  1.0, 0.65, -2, 98 },
2744     { 2, 2,  0.56, 0.32, 0.59, 0.95, -4, 82 }
2745 };
2746 
2747 /** Karlin-Altschul parameter values for substitution scores 1 and -2. */
2748 static const array_of_8 blastn_values_1_2[] = {
2749     { 0, 0, 1.28, 0.46, 0.85, 1.5, -2, 96 },
2750     { 2, 2, 1.33, 0.62,  1.1, 1.2,  0, 99 },
2751     { 1, 2, 1.30, 0.52, 0.93, 1.4, -2, 97 },
2752     { 0, 2, 1.19, 0.34, 0.66, 1.8, -3, 89 },
2753     { 3, 1, 1.32, 0.57,  1.0, 1.3, -1, 99 },
2754     { 2, 1, 1.29, 0.49, 0.92, 1.4, -1, 96 },
2755     { 1, 1, 1.14, 0.26, 0.52, 2.2, -5, 85 }
2756 };
2757 
2758 /** Karlin-Altschul parameter values for substitution scores 2 and -3.
2759  * These parameters can only be applied to even scores. Any odd score must be
2760  * rounded down to the nearest even number before calculating the e-value.
2761  */
2762 static const array_of_8 blastn_values_2_3[] = {
2763     { 0, 0,  0.55, 0.21, 0.46,  1.2, -5, 87 },
2764     { 4, 4,  0.63, 0.42, 0.84, 0.75, -2, 99 },
2765     { 2, 4, 0.615, 0.37, 0.72, 0.85, -3, 97 },
2766     { 0, 4,  0.55, 0.21, 0.46,  1.2, -5, 87 },
2767     { 3, 3, 0.615, 0.37, 0.68,  0.9, -3, 97 },
2768     { 6, 2,  0.63, 0.42, 0.84, 0.75, -2, 99 },
2769     { 5, 2, 0.625, 0.41, 0.78,  0.8, -2, 99 },
2770     { 4, 2,  0.61, 0.35, 0.68,  0.9, -3, 96 },
2771     { 2, 2, 0.515, 0.14, 0.33, 1.55, -9, 81 }
2772 };
2773 
2774 /** Karlin-Altschul parameter values for substitution scores 3 and -4. */
2775 static const array_of_8 blastn_values_3_4[] = {
2776     { 6, 3, 0.389, 0.25, 0.56, 0.7, -5, 95},
2777     { 5, 3, 0.375, 0.21, 0.47, 0.8, -6, 92},
2778     { 4, 3, 0.351, 0.14, 0.35, 1.0, -9, 86},
2779     { 6, 2, 0.362, 0.16, 0.45, 0.8, -4, 88},
2780     { 5, 2, 0.330, 0.092, 0.28, 1.2, -13, 81},
2781     { 4, 2, 0.281, 0.046, 0.16, 1.8, -23, 69}
2782 };
2783 
2784 /** Karlin-Altschul parameter values for substitution scores 4 and -5. */
2785 static const array_of_8 blastn_values_4_5[] = {
2786     { 0, 0, 0.22, 0.061, 0.22, 1.0, -15, 74 },
2787     { 6, 5, 0.28,  0.21, 0.47, 0.6 , -7, 93 },
2788     { 5, 5, 0.27,  0.17, 0.39, 0.7,  -9, 90 },
2789     { 4, 5, 0.25,  0.10, 0.31, 0.8, -10, 83 },
2790     { 3, 5, 0.23, 0.065, 0.25, 0.9, -11, 76 }
2791 };
2792 
2793 /** Karlin-Altschul parameter values for substitution scores 1 and -1. */
2794 static const array_of_8 blastn_values_1_1[] = {
2795     { 3,  2, 1.09,  0.31, 0.55, 2.0,  -2, 99 },
2796     { 2,  2, 1.07,  0.27, 0.49, 2.2,  -3, 97 },
2797     { 1,  2, 1.02,  0.21, 0.36, 2.8,  -6, 92 },
2798     { 0,  2, 0.80, 0.064, 0.17, 4.8, -16, 72 },
2799     { 4,  1, 1.08,  0.28, 0.54, 2.0,  -2, 98 },
2800     { 3,  1, 1.06,  0.25, 0.46, 2.3,  -4, 96 },
2801     { 2,  1, 0.99,  0.17, 0.30, 3.3, -10, 90 }
2802 };
2803 
2804 /** Karlin-Altschul parameter values for substitution scores 3 and -2. */
2805 static const array_of_8 blastn_values_3_2[] = {
2806     {  5,  5, 0.208, 0.030, 0.072, 2.9, -47, 77}
2807 };
2808 
2809 /** Karlin-Altschul parameter values for substitution scores 5 and -4. */
2810 static const array_of_8 blastn_values_5_4[] = {
2811     { 10, 6, 0.163, 0.068, 0.16, 1.0, -19, 85 },
2812     {  8, 6, 0.146, 0.039, 0.11, 1.3, -29, 76 }
2813 };
2814 
2815 static Int2
s_SplitArrayOf8(const array_of_8 * input,const array_of_8 ** normal,const array_of_8 ** non_affine,Boolean * split)2816 s_SplitArrayOf8(const array_of_8* input, const array_of_8** normal, const array_of_8** non_affine, Boolean *split)
2817 {
2818 
2819     if (input == NULL || normal == NULL || non_affine == NULL)
2820             return -1;
2821 
2822     *normal = NULL;
2823     *non_affine = NULL;
2824 
2825     if (input[0][0] == 0 && input[0][1] == 0)
2826     {
2827             *normal = input+1;
2828             *non_affine = input;
2829             *split = TRUE;
2830     }
2831     else
2832     {
2833             *normal = input;
2834             *split = FALSE;
2835     }
2836     return 0;
2837 
2838 }
2839 
2840 /** Returns the array of values corresponding to the given match/mismatch
2841  * scores, the number of supported gap cost combinations and thresholds for
2842  * the gap costs, beyond which the ungapped statistics can be applied.
2843  * @param reward Match reward score [in]
2844  * @param penalty Mismatch penalty score [in]
2845  * @param array_size Number of supported combinations for this match/mismatch
2846  *                   pair [out]
2847  * @param normal the values for normal (e.g, "affine") gap costs [out]
2848  * @param non_affine specialized values used for megablast [out]
2849  * @param gap_open_max Gap opening cost threshold for infinite gap costs [out]
2850  * @param gap_extend_max Gap extension cost threshold for infinite gap costs [out]
2851  * @param round_down if set to TRUE only even scores should be used for calculation
2852  *    of expect value or bit scores [out]
2853  * @param error_return Pointer to error message [out]
2854  * @return zero on success, other values if error
2855  */
2856 static Int2
s_GetNuclValuesArray(Int4 reward,Int4 penalty,Int4 * array_size,const array_of_8 ** normal,const array_of_8 ** non_affine,Int4 * gap_open_max,Int4 * gap_extend_max,Boolean * round_down,ValNodePtr * error_return)2857 s_GetNuclValuesArray(Int4 reward, Int4 penalty, Int4* array_size,
2858                      const array_of_8** normal, const array_of_8** non_affine,
2859                      Int4* gap_open_max, Int4* gap_extend_max, Boolean* round_down,
2860                      ValNodePtr* error_return)
2861 {
2862     Int2 status = 0;
2863     const array_of_8 * kValues = NULL;
2864     const array_of_8 * kValues_non_affine = NULL;
2865     Boolean split = FALSE;
2866 
2867     *round_down = FALSE;
2868 
2869     *array_size = 0;
2870 
2871     if (reward == 1 && penalty == -5) {
2872         if ((status=s_SplitArrayOf8(blastn_values_1_5, &kValues, &kValues_non_affine, &split)))
2873            return status;
2874 
2875         *array_size = sizeof(blastn_values_1_5)/sizeof(array_of_8);
2876         *gap_open_max = 3;
2877         *gap_extend_max = 3;
2878     } else if (reward == 1 && penalty == -4) {
2879         if ((status=s_SplitArrayOf8(blastn_values_1_4, &kValues, &kValues_non_affine, &split)))
2880            return status;
2881 
2882         *array_size = sizeof(blastn_values_1_4)/sizeof(array_of_8);
2883         *gap_open_max = 2;
2884         *gap_extend_max = 2;
2885     } else if (reward == 2 && penalty == -7) {
2886         if ((status=s_SplitArrayOf8(blastn_values_2_7, &kValues, &kValues_non_affine, &split)))
2887            return status;
2888 
2889         *round_down = TRUE;
2890         *array_size = sizeof(blastn_values_2_7)/sizeof(array_of_8);
2891         *gap_open_max = 4;
2892         *gap_extend_max = 4;
2893     } else if (reward == 1 && penalty == -3) {
2894         if ((status=s_SplitArrayOf8(blastn_values_1_3, &kValues, &kValues_non_affine, &split)))
2895            return status;
2896 
2897         *array_size = sizeof(blastn_values_1_3)/sizeof(array_of_8);
2898         *gap_open_max = 2;
2899         *gap_extend_max = 2;
2900     } else if (reward == 2 && penalty == -5) {
2901         if ((status=s_SplitArrayOf8(blastn_values_2_5, &kValues, &kValues_non_affine, &split)))
2902            return status;
2903 
2904         *round_down = TRUE;
2905         *array_size = sizeof(blastn_values_2_5)/sizeof(array_of_8);
2906         *gap_open_max = 4;
2907         *gap_extend_max = 4;
2908     } else if (reward == 1 && penalty == -2) {
2909         if ((status=s_SplitArrayOf8(blastn_values_1_2, &kValues, &kValues_non_affine, &split)))
2910            return status;
2911 
2912         *array_size = sizeof(blastn_values_1_2)/sizeof(array_of_8);
2913         *gap_open_max = 2;
2914         *gap_extend_max = 2;
2915     } else if (reward == 2 && penalty == -3) {
2916         if ((status=s_SplitArrayOf8(blastn_values_2_3, &kValues, &kValues_non_affine, &split)))
2917            return status;
2918 
2919         *round_down = TRUE;
2920         *array_size = sizeof(blastn_values_2_3)/sizeof(array_of_8);
2921         *gap_open_max = 6;
2922         *gap_extend_max = 4;
2923     } else if (reward == 3 && penalty == -4) {
2924         if ((status=s_SplitArrayOf8(blastn_values_3_4, &kValues, &kValues_non_affine, &split)))
2925            return status;
2926 
2927         *round_down = TRUE;
2928         *array_size = sizeof(blastn_values_3_4)/sizeof(array_of_8);
2929         *gap_open_max = 6;
2930         *gap_extend_max = 3;
2931     } else if (reward == 1 && penalty == -1) {
2932         if ((status=s_SplitArrayOf8(blastn_values_1_1, &kValues, &kValues_non_affine, &split)))
2933            return status;
2934 
2935         *array_size = sizeof(blastn_values_1_1)/sizeof(array_of_8);
2936         *gap_open_max = 4;
2937         *gap_extend_max = 2;
2938     } else if (reward == 3 && penalty == -2) {
2939         if ((status=s_SplitArrayOf8(blastn_values_3_2, &kValues, &kValues_non_affine, &split)))
2940            return status;
2941 
2942         *array_size = sizeof(blastn_values_3_2)/sizeof(array_of_8);
2943         *gap_open_max = 5;
2944         *gap_extend_max = 5;
2945     } else if (reward == 4 && penalty == -5) {
2946         if ((status=s_SplitArrayOf8(blastn_values_4_5, &kValues, &kValues_non_affine, &split)))
2947            return status;
2948 
2949         *array_size = sizeof(blastn_values_4_5)/sizeof(array_of_8);
2950         *gap_open_max = 12;
2951         *gap_extend_max = 8;
2952     } else if (reward == 5 && penalty == -4) {
2953         if ((status=s_SplitArrayOf8(blastn_values_5_4, &kValues, &kValues_non_affine, &split)))
2954            return status;
2955 
2956         *array_size = sizeof(blastn_values_5_4)/sizeof(array_of_8);
2957         *gap_open_max = 25;
2958         *gap_extend_max = 10;
2959     } else  { /* Unsupported reward-penalty */
2960         status = -1;
2961         if (error_return) {
2962             char buffer[256];
2963             sprintf(buffer, "Substitution scores %d and %d are not supported",
2964                 reward, penalty);
2965             BlastConstructErrorMessage("s_GetNuclValuesArray", buffer, 2, error_return);
2966         }
2967     }
2968     if (split)
2969         (*array_size)--;
2970 
2971     *normal = kValues;
2972     *non_affine = kValues_non_affine;
2973 
2974     return status;
2975 }
2976 
2977 
2978 /** Returns the beta statistical parameter value, given the nucleotide
2979  * substitution scores.
2980  * @param reward Match reward score [in]
2981  * @param penalty Mismatch penalty score [in]
2982  * @return The value of the beta parameter.
2983  */
s_GetUngappedBeta(Int4 reward,Int4 penalty)2984 static double s_GetUngappedBeta(Int4 reward, Int4 penalty)
2985 {
2986     double beta = 0;
2987     if ((reward == 1 && penalty == -1) ||
2988         (reward == 2 && penalty == -3))
2989         beta = -2;
2990 
2991     return beta;
2992 }
2993 
BlastKarlinGetNuclAlphaBeta(Int4 reward,Int4 penalty,Int4 gap_open,Int4 gap_extend,BLAST_KarlinBlkPtr kbp,Boolean gapped_calculation,double * alpha,double * beta)2994 Int2 BlastKarlinGetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open,
2995                             Int4 gap_extend, BLAST_KarlinBlkPtr kbp,
2996                             Boolean gapped_calculation,
2997                             double *alpha, double *beta)
2998 {
2999     const int kGapOpenIndex = 0;
3000     const int kGapExtIndex = 1;
3001     const int kAlphaIndex = 5;
3002     const int kBetaIndex = 6;
3003     Int4 num_combinations = 0;
3004     Int4 gap_open_max = 0, gap_extend_max = 0;
3005     Int4 index = 0;
3006     const array_of_8* kNormal=NULL;
3007     const array_of_8* kNonAffine=NULL;
3008     Boolean round_down = FALSE;
3009     Int2 status = s_GetNuclValuesArray(reward,
3010                                        penalty,
3011                                        &num_combinations,
3012                                        &kNormal,
3013                                        &kNonAffine,
3014                                        &gap_open_max,
3015                                        &gap_extend_max,
3016                                        &round_down,
3017                                        NULL);
3018 
3019     if (status)
3020        return status;
3021 
3022     ASSERT(alpha && beta && kbp);
3023 
3024     /* For ungapped search return ungapped values of alpha and beta. */
3025     if (gapped_calculation && kNormal) {
3026         if (gap_open == 0 && gap_extend == 0 && kNonAffine)
3027         {
3028             *alpha = kNonAffine[0][kAlphaIndex];
3029              *beta = kNonAffine[0][kBetaIndex];
3030             return 0;
3031         }
3032         for (index = 0; index < num_combinations; ++index) {
3033             if (kNormal[index][kGapOpenIndex] == gap_open &&
3034                 kNormal[index][kGapExtIndex] == gap_extend) {
3035                 *alpha = kNormal[index][kAlphaIndex];
3036                 *beta = kNormal[index][kBetaIndex];
3037                 return 0;
3038             }
3039         }
3040     }
3041 
3042     /* If input values not found in tables, or if this is an ungapped search,
3043        return the ungapped values of alpha and beta. */
3044     *alpha = kbp->Lambda/kbp->H;
3045     *beta = s_GetUngappedBeta(reward, penalty);
3046 
3047     return 0;
3048 }
3049 
3050 /** Copies data from in to out.  Both in and out should be
3051  * allocated.
3052  * @param in object to be copied [in]
3053  * @param out object to be copied to [in|out]
3054  */
3055 static void
s_KarlinBlkCopy(BLAST_KarlinBlk * in,BLAST_KarlinBlk * out)3056 s_KarlinBlkCopy(BLAST_KarlinBlk* in, BLAST_KarlinBlk* out)
3057 {
3058     ASSERT(in && out);
3059 
3060     out->Lambda = in->Lambda;
3061     out->K = in->K;
3062     out->logK = in->logK;
3063     out->H = in->H;
3064     out->paramC = in->paramC;
3065 
3066     return;
3067 }
3068 
3069 Int2
BlastKarlinBlkNuclGappedCalc(BLAST_KarlinBlk * kbp,Int4 gap_open,Int4 gap_extend,Int4 reward,Int4 penalty,BLAST_KarlinBlk * kbp_ungap,Boolean * round_down,ValNodePtr * error_return)3070 BlastKarlinBlkNuclGappedCalc(BLAST_KarlinBlk* kbp, Int4 gap_open,
3071                               Int4 gap_extend, Int4 reward, Int4 penalty,
3072                               BLAST_KarlinBlk* kbp_ungap,
3073                               Boolean* round_down,
3074                               ValNodePtr* error_return)
3075 {
3076     const int kGapOpenIndex = 0;
3077     const int kGapExtIndex = 1;
3078     const int kLambdaIndex = 2;
3079     const int kKIndex = 3;
3080     const int kHIndex = 4;
3081     int num_combinations = 0;
3082     int gap_open_max, gap_extend_max;
3083     const array_of_8* kNormal=NULL;
3084     const array_of_8* kNonAffine=NULL;
3085     Int2 status = s_GetNuclValuesArray(reward,
3086                                        penalty,
3087                                        &num_combinations,
3088                                        &kNormal,
3089                                        &kNonAffine,
3090                                        &gap_open_max,
3091                                        &gap_extend_max,
3092                                        round_down,
3093                                        error_return);
3094 
3095     if (status)
3096        return status;
3097 
3098     ASSERT(kbp && kbp_ungap);
3099 
3100 
3101     /* Try to find the table entry corresponding to input gap costs. */
3102     if (gap_open == 0 && gap_extend == 0 && kNonAffine)
3103     {
3104         kbp->Lambda = kNonAffine[0][kLambdaIndex];
3105         kbp->K = kNonAffine[0][kKIndex];
3106         kbp->logK = log(kbp->K);
3107         kbp->H = kNonAffine[0][kHIndex];
3108     }
3109     else
3110     {
3111         int index=0;
3112         for (index = 0; index < num_combinations; ++index) {
3113             if (kNormal[index][kGapOpenIndex] == gap_open &&
3114                 kNormal[index][kGapExtIndex] == gap_extend) {
3115                 kbp->Lambda = kNormal[index][kLambdaIndex];
3116                 kbp->K = kNormal[index][kKIndex];
3117                 kbp->logK = log(kbp->K);
3118                 kbp->H = kNormal[index][kHIndex];
3119                 break;
3120             }
3121         }
3122 
3123         /* If gap costs are not found in the table, check if they belong to the
3124         infinite domain, where ungapped values of the parameters can be used. */
3125         if (index == num_combinations) {
3126         /* If gap costs are larger than maximal provided in tables, copy
3127            the values from the ungapped Karlin block. */
3128             if (gap_open >= gap_open_max && gap_extend >= gap_extend_max) {
3129                 s_KarlinBlkCopy(kbp_ungap, kbp);
3130             } else if (error_return) {
3131                 char buffer[8192];
3132                 int i=0;
3133                 int len=0;
3134                 /* Unsupported gap costs combination. */
3135                 sprintf(buffer, "Gap existence and extension values %ld and %ld "
3136                         "are not supported for substitution scores %ld and %ld\n",
3137                         (long) gap_open, (long) gap_extend, (long) reward, (long) penalty);
3138                 for (i = 0; i < num_combinations; ++i)
3139                 {
3140                      len = strlen(buffer);
3141                      sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
3142                         (long) kNormal[i][kGapOpenIndex],  (long) kNormal[i][kGapExtIndex]);
3143                 }
3144                 len = strlen(buffer);
3145                 sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
3146                      (long) gap_open_max, (long) gap_extend_max);
3147                 len = strlen(buffer);
3148                 sprintf(buffer+len, "Any values more stringent than %ld and %ld are supported\n",
3149                      (long) gap_open_max, (long) gap_extend_max);
3150                 BlastConstructErrorMessage("BlastKarlinBlkNuclGappedCalc", buffer, 2, error_return);
3151                 return 1;
3152             }
3153         }
3154     }
3155 
3156     return 0;
3157 }
3158 
3159 
3160 /*
3161 	Deallocates MatrixInfoPtr
3162 */
3163 
3164 static MatrixInfoPtr
MatrixInfoDestruct(MatrixInfoPtr matrix_info)3165 MatrixInfoDestruct(MatrixInfoPtr matrix_info)
3166 
3167 {
3168 	if (matrix_info == NULL)
3169 		return NULL;
3170 
3171 	MemFree(matrix_info->name);
3172 	return MemFree(matrix_info);
3173 }
3174 
3175 /*
3176 	Makes New MatrixInfoPtr
3177 */
3178 
3179 static MatrixInfoPtr
MatrixInfoNew(CharPtr name,array_of_8 * values,Int4Ptr prefs,Int4 max_number)3180 MatrixInfoNew(CharPtr name, array_of_8 *values, Int4Ptr prefs, Int4 max_number)
3181 
3182 {
3183 	MatrixInfoPtr matrix_info;
3184 
3185 	matrix_info = (MatrixInfoPtr) MemNew(sizeof(MatrixInfo));
3186 	matrix_info->name = StringSave(name);
3187 	matrix_info->values = values;
3188 	matrix_info->prefs = prefs;
3189 	matrix_info->max_number_values = max_number;
3190 
3191 	return matrix_info;
3192 }
3193 
3194 static ValNodePtr
BlastMatrixValuesDestruct(ValNodePtr vnp)3195 BlastMatrixValuesDestruct(ValNodePtr vnp)
3196 
3197 {
3198 	ValNodePtr head;
3199 
3200 	head = vnp;
3201 	while (vnp)
3202 	{
3203 		MatrixInfoDestruct((MatrixInfoPtr) vnp->data.ptrvalue);
3204 		vnp = vnp->next;
3205 	}
3206 
3207 	head = ValNodeFree(head);
3208 
3209 	return head;
3210 }
3211 
3212 /*
3213 	ValNodePtr BlastLoadMatrixValues (void)
3214 
3215 	Loads all the matrix values, returns a ValNodePtr chain that contains
3216 	MatrixInfoPtr's.
3217 
3218 */
3219 static ValNodePtr
BlastLoadMatrixValues(void)3220 BlastLoadMatrixValues (void)
3221 
3222 {
3223 	MatrixInfoPtr matrix_info;
3224 	ValNodePtr retval=NULL;
3225 
3226 	matrix_info = MatrixInfoNew("BLOSUM80", blosum80_values, blosum80_prefs, BLOSUM80_VALUES_MAX);
3227 	ValNodeAddPointer(&retval, 0, matrix_info);
3228 
3229 	matrix_info = MatrixInfoNew("BLOSUM62", blosum62_values, blosum62_prefs, BLOSUM62_VALUES_MAX);
3230 	ValNodeAddPointer(&retval, 0, matrix_info);
3231 
3232 	matrix_info = MatrixInfoNew("BLOSUM50", blosum50_values, blosum50_prefs, BLOSUM50_VALUES_MAX);
3233 	ValNodeAddPointer(&retval, 0, matrix_info);
3234 
3235 	matrix_info = MatrixInfoNew("BLOSUM45", blosum45_values, blosum45_prefs, BLOSUM45_VALUES_MAX);
3236 	ValNodeAddPointer(&retval, 0, matrix_info);
3237 
3238 	matrix_info = MatrixInfoNew("PAM250", pam250_values, pam250_prefs, PAM250_VALUES_MAX);
3239 	ValNodeAddPointer(&retval, 0, matrix_info);
3240 
3241 /*
3242 	matrix_info = MatrixInfoNew("BLOSUM62_20", blosum62_20_values, blosum62_20_prefs, BLOSUM62_20_VALUES_MAX);
3243 	ValNodeAddPointer(&retval, 0, matrix_info);
3244 */
3245 
3246 	matrix_info = MatrixInfoNew("BLOSUM90", blosum90_values, blosum90_prefs, BLOSUM90_VALUES_MAX);
3247 	ValNodeAddPointer(&retval, 0, matrix_info);
3248 
3249 	matrix_info = MatrixInfoNew("PAM30", pam30_values, pam30_prefs, PAM30_VALUES_MAX);
3250 	ValNodeAddPointer(&retval, 0, matrix_info);
3251 
3252 	matrix_info = MatrixInfoNew("PAM70", pam70_values, pam70_prefs, PAM70_VALUES_MAX);
3253 	ValNodeAddPointer(&retval, 0, matrix_info);
3254 
3255 	return retval;
3256 }
3257 /*
3258 Int2 LIBCALL
3259 BlastKarlinGetMatrixValues(CharPtr matrix, Int4Ptr open, Int4Ptr extension, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H)
3260 
3261 Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
3262 the given matrix.  Also obtains arrays of Lambda, K, and H.  Any of these fields that
3263 are not required should be set to NULL.  The Int2 return value is the length of the
3264 arrays.
3265 */
3266 
3267 Int2 LIBCALL
BlastKarlinGetMatrixValues(CharPtr matrix,Int4Ptr PNTR open,Int4Ptr PNTR extension,FloatHiPtr PNTR lambda,FloatHiPtr PNTR K,FloatHiPtr PNTR H,Int4Ptr PNTR pref_flags)3268 BlastKarlinGetMatrixValues(CharPtr matrix, Int4Ptr PNTR open, Int4Ptr PNTR extension, FloatHiPtr PNTR lambda, FloatHiPtr PNTR K, FloatHiPtr PNTR H, Int4Ptr PNTR pref_flags)
3269 
3270 {
3271 	return BlastKarlinGetMatrixValuesEx2(matrix, open, extension, NULL, lambda, K, H, NULL, NULL, pref_flags);
3272 
3273 }
3274 
3275 /*
3276 Int2 LIBCALL
3277 BlastKarlinGetMatrixValuesEx(CharPtr matrix, Int4Ptr open, Int4Ptr extension, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H)
3278 
3279 Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
3280 the given matrix.  Also obtains arrays of Lambda, K, and H.  Any of these fields that
3281 are not required should be set to NULL.  The Int2 return value is the length of the
3282 arrays.
3283 */
3284 
3285 Int2 LIBCALL
BlastKarlinGetMatrixValuesEx(CharPtr matrix,Int4Ptr PNTR open,Int4Ptr PNTR extension,Int4Ptr PNTR decline_align,FloatHiPtr PNTR lambda,FloatHiPtr PNTR K,FloatHiPtr PNTR H,Int4Ptr PNTR pref_flags)3286 BlastKarlinGetMatrixValuesEx(CharPtr matrix, Int4Ptr PNTR open, Int4Ptr PNTR extension, Int4Ptr PNTR decline_align, FloatHiPtr PNTR lambda, FloatHiPtr PNTR K, FloatHiPtr PNTR H, Int4Ptr PNTR pref_flags)
3287 
3288 {
3289 	return BlastKarlinGetMatrixValuesEx2(matrix, open, extension, decline_align, lambda, K, H, NULL, NULL, pref_flags);
3290 
3291 }
3292 
3293 /*
3294 Int2 LIBCALL
3295 BlastKarlinGetMatrixValuesEx2(CharPtr matrix, Int4Ptr open, Int4Ptr extension, Int4Ptr decline_align, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H)
3296 
3297 Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
3298 the given matrix.  Also obtains arrays of Lambda, K, and H.  Any of these fields that
3299 are not required should be set to NULL.  The Int2 return value is the length of the
3300 arrays.
3301 */
3302 
3303 Int2 LIBCALL
BlastKarlinGetMatrixValuesEx2(CharPtr matrix,Int4Ptr PNTR open,Int4Ptr PNTR extension,Int4Ptr PNTR decline_align,FloatHiPtr PNTR lambda,FloatHiPtr PNTR K,FloatHiPtr PNTR H,FloatHiPtr PNTR alpha,FloatHiPtr PNTR beta,Int4Ptr PNTR pref_flags)3304 BlastKarlinGetMatrixValuesEx2(CharPtr matrix, Int4Ptr PNTR open, Int4Ptr PNTR extension, Int4Ptr PNTR decline_align, FloatHiPtr PNTR lambda, FloatHiPtr PNTR K, FloatHiPtr PNTR H, FloatHiPtr PNTR alpha, FloatHiPtr PNTR beta, Int4Ptr PNTR pref_flags)
3305 
3306 {
3307 	array_of_8 *values;
3308 	Boolean found_matrix=FALSE;
3309 	Int4 index, max_number_values=0;
3310 	Int4Ptr open_array=NULL, extension_array=NULL, decline_align_array=NULL, pref_flags_array=NULL, prefs;
3311 	Nlm_FloatHiPtr lambda_array=NULL, K_array=NULL, H_array=NULL, alpha_array=NULL, beta_array=NULL;
3312 	MatrixInfoPtr matrix_info;
3313 	ValNodePtr vnp, head;
3314 
3315 	if (matrix == NULL)
3316 		return 0;
3317 
3318 	vnp = head = BlastLoadMatrixValues();
3319 
3320 	while (vnp)
3321 	{
3322 		matrix_info = vnp->data.ptrvalue;
3323 		if (StringICmp(matrix_info->name, matrix) == 0)
3324 		{
3325 			values = matrix_info->values;
3326 			max_number_values = matrix_info->max_number_values;
3327 			prefs = matrix_info->prefs;
3328 			found_matrix = TRUE;
3329 			break;
3330 		}
3331 		vnp = vnp->next;
3332 	}
3333 
3334 	if (found_matrix)
3335 	{
3336 		if (open)
3337 			*open = open_array = MemNew(max_number_values*sizeof(Int4));
3338 		if (extension)
3339 			*extension = extension_array = MemNew(max_number_values*sizeof(Int4));
3340 		if (decline_align)
3341 			*decline_align = decline_align_array = MemNew(max_number_values*sizeof(Int4));
3342 		if (lambda)
3343 			*lambda = lambda_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3344 		if (K)
3345 			*K = K_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3346 		if (H)
3347 			*H = H_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3348 		if (alpha)
3349 			*alpha = alpha_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3350 		if (beta)
3351 			*beta = beta_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3352 		if (pref_flags)
3353 			*pref_flags = pref_flags_array = MemNew(max_number_values*sizeof(Int4));
3354 
3355 		for (index=0; index<max_number_values; index++)
3356 		{
3357 			if (open)
3358 				open_array[index] = values[index][0];
3359 			if (extension)
3360 				extension_array[index] = values[index][1];
3361 			if (decline_align)
3362 				decline_align_array[index] = values[index][2];
3363 			if (lambda)
3364 				lambda_array[index] = values[index][3];
3365 			if (K)
3366 				K_array[index] = values[index][4];
3367 			if (H)
3368 				H_array[index] = values[index][5];
3369 			if (alpha)
3370 				alpha_array[index] = values[index][6];
3371 			if (beta)
3372 				beta_array[index] = values[index][7];
3373 			if (pref_flags)
3374 				pref_flags_array[index] = prefs[index];
3375 		}
3376 	}
3377 
3378 	BlastMatrixValuesDestruct(head);
3379 
3380 	return max_number_values;
3381 }
3382 
3383 /*Extract the alpha and beta settings for this matrixName, and these
3384   gap open and gap extension costs*/
getAlphaBeta(CharPtr matrixName,Nlm_FloatHi * alpha,Nlm_FloatHi * beta,Boolean gapped,Int4 gap_open,Int4 gap_extend)3385 void LIBCALL getAlphaBeta(CharPtr matrixName, Nlm_FloatHi *alpha,
3386 Nlm_FloatHi *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend)
3387 {
3388    Int4Ptr gapOpen_arr, gapExtend_arr, pref_flags;
3389    FloatHiPtr alpha_arr, beta_arr;
3390    Int2 num_values;
3391    Int4 i; /*loop index*/
3392 
3393    num_values = BlastKarlinGetMatrixValuesEx2(matrixName, &gapOpen_arr,
3394      &gapExtend_arr, NULL, NULL, NULL, NULL,  &alpha_arr, &beta_arr,
3395      &pref_flags);
3396 
3397    if (gapped) {
3398      if ((0 == gap_open) && (0 == gap_extend)) {
3399        for(i = 1; i < num_values; i++) {
3400 	 if(pref_flags[i]==BLAST_MATRIX_BEST) {
3401 	   (*alpha) = alpha_arr[i];
3402 	   (*beta) = beta_arr[i];
3403 	   break;
3404 	 }
3405        }
3406      }
3407      else {
3408        for(i = 1; i < num_values; i++) {
3409 	 if ((gapOpen_arr[i] == gap_open) &&
3410 	     (gapExtend_arr[i] == gap_extend)) {
3411 	   (*alpha) = alpha_arr[i];
3412 	   (*beta) = beta_arr[i];
3413 	   break;
3414 	 }
3415        }
3416      }
3417    }
3418    else if (num_values > 0) {
3419      (*alpha) = alpha_arr[0];
3420      (*beta) = beta_arr[0];
3421    }
3422 
3423    MemFree(gapOpen_arr);
3424    MemFree(gapExtend_arr);
3425    MemFree(pref_flags);
3426    MemFree(alpha_arr);
3427    MemFree(beta_arr);
3428 }
3429 
3430 /*
3431   Conveniently return default/best Karling-Altschul parameters for a given matrix.
3432 
3433  */
3434 
3435 Int2 LIBCALL
BlastKarlinGetDefaultMatrixValues(CharPtr matrix,Int4Ptr open,Int4Ptr extension,FloatHiPtr lambda,FloatHiPtr K,FloatHiPtr H)3436 BlastKarlinGetDefaultMatrixValues(CharPtr matrix, Int4Ptr open, Int4Ptr extension, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H) {
3437     Int4Ptr gapOpen_arr, gapExtend_arr, pref_flags;
3438     FloatHiPtr Lambda_arr, Kappa_arr, H_arr;
3439     Int4 i,n;
3440     if(matrix==NULL)
3441         matrix = "BLOSUM62";
3442     n = BlastKarlinGetMatrixValues(matrix, &gapOpen_arr, &gapExtend_arr, &Lambda_arr, &Kappa_arr, &H_arr,&pref_flags);
3443     if(n) {
3444         *open = gapOpen_arr[0];
3445         *extension = gapExtend_arr[0];
3446         *K = Kappa_arr[0];
3447         *lambda = Lambda_arr[0];
3448         *H = H_arr[0];
3449         for(i=0;i<n;i++) {
3450             if(pref_flags[i]==BLAST_MATRIX_PREFERRED) {
3451                 *open = gapOpen_arr[i];
3452                 *extension = gapExtend_arr[i];
3453                 *K = Kappa_arr[i];
3454                 *lambda = Lambda_arr[i];
3455                 *H = H_arr[i];
3456             } else if(pref_flags[i]==BLAST_MATRIX_BEST) {
3457                 *open = gapOpen_arr[i];
3458                 *extension = gapExtend_arr[i];
3459                 *K = Kappa_arr[i];
3460                 *lambda = Lambda_arr[i];
3461                 *H = H_arr[i];
3462                 i+=n;
3463             }
3464         }
3465         MemFree(gapOpen_arr);
3466         MemFree(gapExtend_arr);
3467         MemFree(Kappa_arr);
3468         MemFree(Lambda_arr);
3469         MemFree(H_arr);
3470         MemFree(pref_flags);
3471         return 1;
3472     } else
3473         return 0;
3474 }
3475 /*
3476 	Supplies lambda, H, and K values, as calcualted by Stephen Altschul
3477 	in Methods in Enzy. (vol 266, page 474).
3478 
3479 	if kbp is NULL, then a validation is perfomed.
3480 */
3481 
3482 Int2 LIBCALL
BlastKarlinBlkGappedCalc(BLAST_KarlinBlkPtr kbp,Int4 gap_open,Int4 gap_extend,CharPtr matrix_name,ValNodePtr PNTR error_return)3483 BlastKarlinBlkGappedCalc(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, CharPtr matrix_name, ValNodePtr PNTR error_return)
3484 
3485 {
3486 
3487 	return BlastKarlinBlkGappedCalcEx(kbp, gap_open, gap_extend, (FloatHi) INT2_MAX, matrix_name, error_return);
3488 
3489 }
3490 
3491 /*
3492 	Supplies lambda, H, and K values, as calcualted by Stephen Altschul
3493 	in Methods in Enzy. (vol 266, page 474).
3494 
3495 	if kbp is NULL, then a validation is perfomed.
3496 */
3497 
3498 Int2 LIBCALL
BlastKarlinBlkGappedCalcEx(BLAST_KarlinBlkPtr kbp,Int4 gap_open,Int4 gap_extend,Int4 decline_align,CharPtr matrix_name,ValNodePtr PNTR error_return)3499 BlastKarlinBlkGappedCalcEx(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, CharPtr matrix_name, ValNodePtr PNTR error_return)
3500 
3501 {
3502 	Char buffer[256];
3503 	Int2 status = BlastKarlinkGapBlkFill(kbp, gap_open, gap_extend, decline_align, matrix_name);
3504 
3505 	if (status && error_return)
3506 	{
3507 		if (status == 1)
3508 		{
3509 			MatrixInfoPtr matrix_info;
3510 			ValNodePtr vnp, head;
3511 
3512 			vnp = head = BlastLoadMatrixValues();
3513 
3514 			sprintf(buffer, "%s is not a supported matrix", matrix_name);
3515 			BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3516 
3517 			while (vnp)
3518 			{
3519 				matrix_info = vnp->data.ptrvalue;
3520 				sprintf(buffer, "%s is a supported matrix", matrix_info->name);
3521 				BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3522 				vnp = vnp->next;
3523 			}
3524 
3525 			BlastMatrixValuesDestruct(head);
3526 		}
3527 		else if (status == 2)
3528 		{
3529 			if (decline_align == INT2_MAX)
3530 				sprintf(buffer, "Gap existence and extension values of %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, matrix_name);
3531 			else
3532 				sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
3533 			BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3534 			BlastKarlinReportAllowedValues(matrix_name, error_return);
3535 		}
3536 	}
3537 
3538 	return status;
3539 }
3540 
3541 /*
3542 	Attempts to fill KarlinBlk for given gap opening, extensions etc.
3543 	Will return non-zero status if that fails.
3544 
3545 	return values:	-1 if matrix_name is NULL;
3546 			1 if matrix not found
3547 			2 if matrix found, but open, extend etc. values not supported.
3548 */
3549 Int2 LIBCALL
BlastKarlinkGapBlkFill(BLAST_KarlinBlkPtr kbp,Int4 gap_open,Int4 gap_extend,Int4 decline_align,CharPtr matrix_name)3550 BlastKarlinkGapBlkFill(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, CharPtr matrix_name)
3551 {
3552 	Boolean found_matrix=FALSE, found_values=FALSE;
3553 	array_of_8 *values;
3554 	Int2 status=0;
3555 	Int4 index, max_number_values=0;
3556 	MatrixInfoPtr matrix_info;
3557 	ValNodePtr vnp, head;
3558 
3559 	if (matrix_name == NULL)
3560 		return -1;
3561 
3562 	values = NULL;
3563 
3564 	vnp = head = BlastLoadMatrixValues();
3565 	while (vnp)
3566 	{
3567 		matrix_info = vnp->data.ptrvalue;
3568 		if (StringICmp(matrix_info->name, matrix_name) == 0)
3569 		{
3570 			values = matrix_info->values;
3571 			max_number_values = matrix_info->max_number_values;
3572 			found_matrix = TRUE;
3573 			break;
3574 		}
3575 		vnp = vnp->next;
3576 	}
3577 
3578 
3579 	if (found_matrix)
3580 	{
3581 		for (index=0; index<max_number_values; index++)
3582 		{
3583 			if (Nint(values[index][0]) == gap_open &&
3584 				Nint(values[index][1]) == gap_extend &&
3585 				(Nint(values[index][2]) == INT2_MAX || Nint(values[index][2]) == decline_align))
3586 			{
3587 				if (kbp)
3588 				{
3589 					kbp->Lambda = values[index][3];
3590 					kbp->K = values[index][4];
3591 					kbp->logK = log(kbp->K);
3592 					kbp->H = values[index][5];
3593 				}
3594 				found_values = TRUE;
3595 				break;
3596 			}
3597 		}
3598 
3599 		if (found_values == TRUE)
3600 		{
3601 			status = 0;
3602 		}
3603 		else
3604 		{
3605 			status = 2;
3606 		}
3607 	}
3608 	else
3609 	{
3610 		status = 1;
3611 	}
3612 
3613 	BlastMatrixValuesDestruct(head);
3614 
3615 	return status;
3616 }
3617 
3618 CharPtr
PrintMatrixMessage(const Char * matrix_name)3619 PrintMatrixMessage(const Char *matrix_name)
3620 {
3621 	CharPtr buffer=MemNew(1024*sizeof(Char));
3622 	CharPtr ptr;
3623 	MatrixInfoPtr matrix_info;
3624         ValNodePtr vnp, head;
3625 
3626 	ptr = buffer;
3627         sprintf(ptr, "%s is not a supported matrix, supported matrices are:\n", matrix_name);
3628 
3629 	ptr += StringLen(ptr);
3630 
3631         vnp = head = BlastLoadMatrixValues();
3632 
3633         while (vnp)
3634         {
3635         	matrix_info = vnp->data.ptrvalue;
3636         	sprintf(ptr, "%s \n", matrix_info->name);
3637 		ptr += StringLen(ptr);
3638 		vnp = vnp->next;
3639         }
3640         BlastMatrixValuesDestruct(head);
3641 
3642 	return buffer;
3643 }
3644 
3645 CharPtr
PrintAllowedValuesMessage(const Char * matrix_name,Int4 gap_open,Int4 gap_extend,Int4 decline_align)3646 PrintAllowedValuesMessage(const Char *matrix_name, Int4 gap_open, Int4 gap_extend, Int4 decline_align)
3647 {
3648 	array_of_8 *values;
3649 	Boolean found_matrix=FALSE;
3650 	CharPtr buffer, ptr;
3651 	Int4 index, max_number_values=0;
3652 	MatrixInfoPtr matrix_info;
3653 	ValNodePtr vnp, head;
3654 
3655 	ptr = buffer = MemNew(2048*sizeof(Char));
3656 
3657         if (decline_align == INT2_MAX)
3658               sprintf(ptr, "Gap existence and extension values of %ld and %ld not supported for %s\nsupported values are:\n",
3659 		(long) gap_open, (long) gap_extend, matrix_name);
3660         else
3661                sprintf(ptr, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s\n supported values are:\n",
3662 		(long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
3663 
3664 	ptr += StringLen(ptr);
3665 
3666 	vnp = head = BlastLoadMatrixValues();
3667 	while (vnp)
3668 	{
3669 		matrix_info = vnp->data.ptrvalue;
3670 		if (StringICmp(matrix_info->name, matrix_name) == 0)
3671 		{
3672 			values = matrix_info->values;
3673 			max_number_values = matrix_info->max_number_values;
3674 			found_matrix = TRUE;
3675 			break;
3676 		}
3677 		vnp = vnp->next;
3678 	}
3679 
3680 	if (found_matrix)
3681 	{
3682 		for (index=0; index<max_number_values; index++)
3683 		{
3684 			if (Nint(values[index][2]) == INT2_MAX)
3685 				sprintf(ptr, "%ld, %ld\n", (long) Nint(values[index][0]), (long) Nint(values[index][1]));
3686 			else
3687 				sprintf(ptr, "%ld, %ld, %ld\n", (long) Nint(values[index][0]), (long) Nint(values[index][1]), (long) Nint(values[index][2]));
3688 			ptr += StringLen(ptr);
3689 		}
3690 	}
3691 
3692 	BlastMatrixValuesDestruct(head);
3693 
3694 	return buffer;
3695 }
3696 
3697 
3698 Int2 LIBCALL
BlastKarlinReportAllowedValues(const Char * matrix_name,ValNodePtr PNTR error_return)3699 BlastKarlinReportAllowedValues(const Char *matrix_name, ValNodePtr PNTR error_return)
3700 {
3701 	array_of_8 *values;
3702 	Boolean found_matrix=FALSE;
3703 	Char buffer[256];
3704 	Int4 index, max_number_values=0;
3705 	MatrixInfoPtr matrix_info;
3706 	ValNodePtr vnp, head;
3707 
3708 	vnp = head = BlastLoadMatrixValues();
3709 	while (vnp)
3710 	{
3711 		matrix_info = vnp->data.ptrvalue;
3712 		if (StringICmp(matrix_info->name, matrix_name) == 0)
3713 		{
3714 			values = matrix_info->values;
3715 			max_number_values = matrix_info->max_number_values;
3716 			found_matrix = TRUE;
3717 			break;
3718 		}
3719 		vnp = vnp->next;
3720 	}
3721 
3722 	if (found_matrix)
3723 	{
3724 		for (index=0; index<max_number_values; index++)
3725 		{
3726 			if (Nint(values[index][2]) == INT2_MAX)
3727 				sprintf(buffer, "Gap existence and extension values of %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]));
3728 			else
3729 				sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]), (long) Nint(values[index][2]));
3730 			BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3731 		}
3732 	}
3733 
3734 	BlastMatrixValuesDestruct(head);
3735 
3736 	return 0;
3737 }
3738 
3739 /*
3740 	BlastKarlinLtoH
3741 
3742 	Calculate H, the relative entropy of the p's and q's
3743 */
3744 static Nlm_FloatHi LIBCALL
BlastKarlinLtoH(BLAST_ScoreFreqPtr sfp,Nlm_FloatHi lambda)3745 BlastKarlinLtoH(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi	lambda)
3746 {
3747 	BLAST_Score	score;
3748 	Nlm_FloatHi	H, etonlam, sum, scale;
3749 
3750 	Nlm_FloatHi PNTR probs = sfp->sprob;
3751 	BLAST_Score low   = sfp->obs_min,  high  = sfp->obs_max;
3752 
3753 	if (lambda < 0.) {
3754 		return -1.;
3755 	}
3756 	if (BlastScoreChk(low, high) != 0) return -1.;
3757 
3758 	etonlam = exp( - lambda );
3759   sum = low * probs[low];
3760   for( score = low + 1; score <= high; score++ ) {
3761     sum = score * probs[score] + etonlam * sum;
3762   }
3763 
3764   scale = Nlm_Powi( etonlam, high );
3765   if( scale > 0.0 ) {
3766     H = lambda * sum/scale;
3767   } else { /* Underflow of exp( -lambda * high ) */
3768     H = lambda * exp( lambda * high + log(sum) );
3769   }
3770 	return H;
3771 }
3772 
3773 /*
3774         Everything below here was (more or less) copied from the old
3775         karlin.c and could work separately from the stuff above.
3776 */
3777 
3778 /**************** Statistical Significance Parameter Subroutine ****************
3779 
3780     Version 1.0     February 2, 1990
3781     Version 1.2     July 6,     1990
3782 
3783     Program by:     Stephen Altschul
3784 
3785     Address:        National Center for Biotechnology Information
3786                     National Library of Medicine
3787                     National Institutes of Health
3788                     Bethesda, MD  20894
3789 
3790     Internet:       altschul@ncbi.nlm.nih.gov
3791 
3792 See:  Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
3793     Significance of Molecular Sequence Features by Using General Scoring
3794     Schemes,"  Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
3795 
3796     Computes the parameters lambda and K for use in calculating the
3797     statistical significance of high-scoring segments or subalignments.
3798 
3799     The scoring scheme must be integer valued.  A positive score must be
3800     possible, but the expected (mean) score must be negative.
3801 
3802     A program that calls this routine must provide the value of the lowest
3803     possible score, the value of the greatest possible score, and a pointer
3804     to an array of probabilities for the occurrence of all scores between
3805     these two extreme scores.  For example, if score -2 occurs with
3806     probability 0.7, score 0 occurs with probability 0.1, and score 3
3807     occurs with probability 0.2, then the subroutine must be called with
3808     low = -2, high = 3, and pr pointing to the array of values
3809     { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }.  The calling program must also provide
3810     pointers to lambda and K; the subroutine will then calculate the values
3811     of these two parameters.  In this example, lambda=0.330 and K=0.154.
3812 
3813     The parameters lambda and K can be used as follows.  Suppose we are
3814     given a length N random sequence of independent letters.  Associated
3815     with each letter is a score, and the probabilities of the letters
3816     determine the probability for each score.  Let S be the aggregate score
3817     of the highest scoring contiguous segment of this sequence.  Then if N
3818     is sufficiently large (greater than 100), the following bound on the
3819     probability that S is greater than or equal to x applies:
3820 
3821             P( S >= x )   <=   1 - exp [ - KN exp ( - lambda * x ) ].
3822 
3823     In other words, the p-value for this segment can be written as
3824     1-exp[-KN*exp(-lambda*S)].
3825 
3826     This formula can be applied to pairwise sequence comparison by assigning
3827     scores to pairs of letters (e.g. amino acids), and by replacing N in the
3828     formula with N*M, where N and M are the lengths of the two sequences
3829     being compared.
3830 
3831     In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
3832     distinct segments all with score >= S is given by:
3833 
3834                            2             m-1           -y
3835             1 - [ 1 + y + y /2! + ... + y   /(m-1)! ] e
3836 
3837     Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
3838     as the previous formula.
3839 
3840 *******************************************************************************/
3841 
3842 Int2 LIBCALL
BlastKarlinBlkCalc(BLAST_KarlinBlkPtr kbp,BLAST_ScoreFreqPtr sfp)3843 BlastKarlinBlkCalc(BLAST_KarlinBlkPtr kbp, BLAST_ScoreFreqPtr sfp)
3844 {
3845 
3846 
3847 	if (kbp == NULL || sfp == NULL)
3848 		return 1;
3849 
3850 	/* Calculate the parameter Lambda */
3851 
3852 	kbp->Lambda = BlastKarlinLambdaNR(sfp);
3853 	if (kbp->Lambda < 0.)
3854 		goto ErrExit;
3855 
3856 
3857 	/* Calculate H */
3858 
3859 	kbp->H = BlastKarlinLtoH(sfp, kbp->Lambda);
3860 	if (kbp->H < 0.)
3861 		goto ErrExit;
3862 
3863 
3864 	/* Calculate K and log(K) */
3865 
3866 	kbp->K = BlastKarlinLHtoK(sfp, kbp->Lambda, kbp->H);
3867 	if (kbp->K < 0.)
3868 		goto ErrExit;
3869 	kbp->logK = log(kbp->K);
3870 
3871 	/* Normal return */
3872 	return 0;
3873 
3874 ErrExit:
3875 	kbp->Lambda = kbp->H = kbp->K = -1.;
3876 #ifdef BLASTKAR_HUGE_VAL
3877 	kbp->logK = BLASTKAR_HUGE_VAL;
3878 #else
3879 	kbp->logK = 1.e30;
3880 #endif
3881 	return 1;
3882 }
3883 #define DIMOFP0	(iterlimit*range + 1)
3884 #define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)
3885 
3886 
3887 #define smallLambdaThreshold 20 /*defines special case in K computation*/
3888                                 /*threshold is on exp(-Lambda)*/
3889 
3890 /*The following procedure computes K. The input includes Lambda, H,
3891  *  and an array of probabilities for each score.
3892  *  There are distinct closed form for three cases:
3893  *  1. high score is 1 low score is -1
3894  *  2. high score is 1 low score is not -1
3895  *  3. low score is -1, high score is not 1
3896  *
3897  * Otherwise, in most cases the value is computed as:
3898  * -exp(-2.0*outerSum) / ((H/lambda)*(exp(-lambda) - 1)
3899  * The last term (exp(-lambda) - 1) can be computed in two different
3900  * ways depending on whether lambda is small or not.
3901  * outerSum is a sum of the terms
3902  * innerSum/j, where j is denoted by iterCounter in the code.
3903  * The sum is truncated when the new term innersum/j i sufficiently small.
3904  * innerSum is a weighted sum of the probabilities of
3905  * of achieving a total score i in a gapless alignment,
3906  * which we denote by P(i,j).
3907  * of exactly j characters. innerSum(j) has two parts
3908  * Sum over i < 0  P(i,j)exp(-i * lambda) +
3909  * Sum over i >=0  P(i,j)
3910  * The terms P(i,j) are computed by dynamic programming.
3911  * An earlier version was flawed in that ignored the special case 1
3912  * and tried to replace the tail of the computation of outerSum
3913  * by a geometric series, but the base of the geometric series
3914  * was not accurately estimated in some cases.
3915  */
3916 
3917 Nlm_FloatHi
BlastKarlinLHtoK(BLAST_ScoreFreqPtr sfp,Nlm_FloatHi lambda,Nlm_FloatHi H)3918 BlastKarlinLHtoK(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi    lambda, Nlm_FloatHi H)
3919 {
3920     /*The next array stores the probabilities of getting each possible
3921       score in an alignment of fixed length; the array is shifted
3922       during part of the computation, so that
3923       entry 0 is for score 0.  */
3924     Nlm_FloatHi     PNTR alignmentScoreProbabilities = NULL;
3925     BLAST_Score     low;    /* Lowest score (must be negative) */
3926     BLAST_Score     high;   /* Highest score (must be positive) */
3927     BLAST_Score     range;  /* range of scores, computed as high - low*/
3928     Nlm_FloatHi     K;      /* local copy of K  to return*/
3929     Int4            i;   /*loop index*/
3930     Int4            iterCounter; /*counter on iterations*/
3931     BLAST_Score     divisor; /*candidate divisor of all scores with
3932                                non-zero probabilities*/
3933     /*highest and lowest possible alignment scores for current length*/
3934     BLAST_Score   lowAlignmentScore, highAlignmentScore;
3935     BLAST_Score   first, last; /*loop indices for dynamic program*/
3936     register Nlm_FloatHi    innerSum;
3937     Nlm_FloatHi    oldsum, oldsum2;  /* values of innerSum on previous
3938                                         iterations*/
3939     Nlm_FloatHi     outerSum;        /* holds sum over j of (innerSum
3940                                         for iteration j/j)*/
3941 
3942     Nlm_FloatHi    score_avg; /*average score*/
3943     /*first term to use in the closed form for the case where
3944       high == 1 or low == -1, but not both*/
3945     Nlm_FloatHi     firstTermClosedForm;  /*usually store H/lambda*/
3946     Int4            iterlimit; /*upper limit on iterations*/
3947     Nlm_FloatHi     sumlimit; /*lower limit on contributions
3948                                 to sum over scores*/
3949 
3950     /*array of score probabilities reindexed so that low is at index 0*/
3951     Nlm_FloatHi     PNTR probArrayStartLow;
3952 
3953     /*pointers used in dynamic program*/
3954     Nlm_FloatHi     PNTR ptrP, PNTR ptr1, PNTR ptr2, PNTR ptr1e;
3955     Nlm_FloatHi     expMinusLambda; /*e^^(-Lambda) */
3956 
3957     if (lambda <= 0. || H <= 0.) {
3958         /* Theory dictates that H and lambda must be positive, so
3959          * return -1 to indicate an error */
3960         return -1.;
3961     }
3962 
3963     /*Karlin-Altschul theory works only if the expected score
3964       is negative*/
3965     if (sfp->score_avg >= 0.0) {
3966         return -1.;
3967     }
3968 
3969     low   = sfp->obs_min;
3970     high  = sfp->obs_max;
3971     range = high - low;
3972 
3973     probArrayStartLow = &sfp->sprob[low];
3974     /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
3975        Karlin&Altschul (1990) */
3976     for (i = 1, divisor = -low; i <= range && divisor > 1; ++i) {
3977         if (probArrayStartLow[i] != 0.0)
3978             divisor = Nlm_Gcd(divisor, i);
3979     }
3980 
3981     high   /= divisor;
3982     low    /= divisor;
3983     lambda *= divisor;
3984 
3985     range = high - low;
3986 
3987     firstTermClosedForm = H/lambda;
3988     expMinusLambda      = exp((Nlm_FloatHi) -lambda);
3989 
3990     if (low == -1 && high == 1) {
3991         K = (sfp->sprob[low*divisor] - sfp->sprob[high*divisor]) *
3992             (sfp->sprob[low*divisor] - sfp->sprob[high*divisor]) /
3993             sfp->sprob[low*divisor];
3994         return(K);
3995     }
3996 
3997     if (low == -1 || high == 1) {
3998         if (high != 1) {
3999             score_avg = sfp->score_avg / divisor;
4000             firstTermClosedForm
4001                 = (score_avg * score_avg) / firstTermClosedForm;
4002         }
4003         return firstTermClosedForm * (1.0 - expMinusLambda);
4004     }
4005 
4006     sumlimit  = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
4007     iterlimit = BLAST_KARLIN_K_ITER_MAX;
4008 
4009     if (DIMOFP0 > DIMOFP0_MAX) {
4010         return -1.;
4011     }
4012     alignmentScoreProbabilities =
4013         (Nlm_FloatHi PNTR)MemNew(DIMOFP0 *
4014                                  sizeof(*alignmentScoreProbabilities));
4015     if (alignmentScoreProbabilities == NULL)
4016         return -1.;
4017 
4018     outerSum = 0.;
4019     lowAlignmentScore = highAlignmentScore = 0;
4020     alignmentScoreProbabilities[0] = innerSum = oldsum = oldsum2 = 1.;
4021 
4022     for (iterCounter = 0;
4023          ((iterCounter < iterlimit) && (innerSum > sumlimit));
4024          outerSum += innerSum /= ++iterCounter) {
4025         first = last = range;
4026         lowAlignmentScore  += low;
4027         highAlignmentScore += high;
4028         /*dynamic program to compute P(i,j)*/
4029         for (ptrP = alignmentScoreProbabilities +
4030                  (highAlignmentScore-lowAlignmentScore);
4031              ptrP >= alignmentScoreProbabilities;
4032              *ptrP-- =innerSum) {
4033             ptr1  = ptrP - first;
4034             ptr1e = ptrP - last;
4035             ptr2  = probArrayStartLow + first;
4036             for (innerSum = 0.; ptr1 >= ptr1e; )
4037                 innerSum += *ptr1--  *  *ptr2++;
4038             if (first)
4039                 --first;
4040             if (ptrP - alignmentScoreProbabilities <= range)
4041                 --last;
4042         }
4043         /* Horner's rule */
4044         innerSum = *++ptrP;
4045         for( i = lowAlignmentScore + 1; i < 0; i++ ) {
4046             innerSum = *++ptrP + innerSum * expMinusLambda;
4047         }
4048         innerSum *= expMinusLambda;
4049 
4050         for (; i <= highAlignmentScore; ++i)
4051             innerSum += *++ptrP;
4052         oldsum2 = oldsum;
4053         oldsum  = innerSum;
4054     }
4055 
4056 #ifdef ADD_GEOMETRIC_TERMS_TO_K
4057     /*old code assumed that the later terms in sum were
4058       asymptotically comparable to those of a geometric
4059       progression, and tried to speed up convergence by
4060       guessing the estimated ratio between sucessive terms
4061       and using the explicit terms of a geometric progression
4062       to speed up convergence. However, the assumption does not
4063       always hold, and convergenece of the above code is fast
4064       enough in practice*/
4065     /* Terms of geometric progression added for correction */
4066     {
4067         Nlm_FloatHi     ratio;  /* fraction used to generate the
4068                                    geometric progression */
4069 
4070         ratio = oldsum / oldsum2;
4071         if (ratio >= (1.0 - sumlimit*0.001)) {
4072             K = -1.;
4073             if (alignmentScoreProbabilities != NULL)
4074                 MemFree(alignmentScoreProbabilities);
4075             return K;
4076         }
4077         sumlimit *= 0.01;
4078         while (innerSum > sumlimit) {
4079             oldsum   *= ratio;
4080             outerSum += innerSum = oldsum / ++iterCounter;
4081         }
4082     }
4083 #endif
4084 
4085     if (expMinusLambda <  smallLambdaThreshold ) {
4086         K = -exp((double)-2.0*outerSum) /
4087             (firstTermClosedForm*(expMinusLambda - 1.0));
4088     } else {
4089         K = -exp((double)-2.0*outerSum) /
4090             (firstTermClosedForm*Expm1(-(double)lambda));
4091     }
4092 
4093     if (alignmentScoreProbabilities != NULL)
4094         MemFree(alignmentScoreProbabilities);
4095 
4096     return K;
4097 }
4098 
4099 
4100 /**
4101  * Find positive solution to
4102  *
4103  *     sum_{i=low}^{high} exp(i lambda) * probs[i] = 1.
4104  *
4105  * Note that this solution does not exist unless the average score is
4106  * negative and the largest score that occurs with nonzero probability
4107  * is positive.
4108  *
4109  * @param probs         probabilities of a score occurring
4110  * @param d             the gcd of the possible scores. This equals 1 if
4111  *                      the scores are not a lattice
4112  * @param low           the lowest possible score that occurs with
4113  *                      nonzero probability
4114  * @param high          the highest possible score that occurs with
4115  *                      nonzero probability.
4116  * @param lambda0       an initial guess for lambda
4117  * @param tolx          the tolerance to which lambda must be computed
4118  * @param itmax         the maximum number of times the function may be
4119  *                      evaluated
4120  * @param maxNewton     the maximum permissible number of Newton
4121  *                      iterations; after that the computation will proceed
4122  *                      by bisection.
4123  * @param *itn          the number of iterations needed to compute Lambda,
4124  *                      or itmax if Lambda could not be computed.
4125  *
4126  * Let phi(lambda) =  sum_{i=low}^{high} exp(i lambda) - 1. Then
4127  * phi(lambda) may be written
4128  *
4129  *     phi(lamdba) = exp(u lambda) f( exp(-lambda) )
4130  *
4131  * where f(x) is a polynomial that has exactly two zeros, one at x = 1
4132  * and one at x = exp(-lamdba).  It is simpler to solve this problem
4133  * in x = exp(-lambda) than it is to solve it in lambda, because we
4134  * know that for x, a solution lies in [0,1], and because Newton's
4135  * method is generally more stable and efficient for polynomials than
4136  * it is for exponentials.
4137  *
4138  * For the most part, this function is a standard safeguarded Newton
4139  * iteration: define an interval of uncertainty [a,b] with f(a) > 0
4140  * and f(b) < 0 (except for the initial value b = 1, where f(b) = 0);
4141  * evaluate the function and use the sign of that value to shrink the
4142  * interval of uncertainty; compute a Newton step; and if the Newton
4143  * step suggests a point outside the interval of uncertainty or fails
4144  * to decrease the function sufficiently, then bisect.  There are
4145  * three further details needed to understand the algorithm:
4146  *
4147  * 1)  If y the unique solution in [0,1], then f is positive to the left of
4148  *     y, and negative to the right.  Therefore, we may determine whether
4149  *     the Newton step -f(x)/f'(x) is moving toward, or away from, y by
4150  *     examining the sign of f'(x).  If f'(x) >= 0, we bisect instead
4151  *     of taking the Newton step.
4152  * 2)  There is a neighborhood around x = 1 for which f'(x) >= 0, so
4153  *     (1) prevents convergence to x = 1 (and for a similar reason
4154  *     prevents convergence to x = 0, if the function is incorrectly
4155  *     called with probs[high] == 0).
4156  * 3)  Conditions like  fabs(p) < lambda_tolerance * x * (1-x) are used in
4157  *     convergence criteria because these values translate to a bound
4158  *     on the relative error in lambda.  This is proved in the
4159  *     "Blast Scoring Parameters" document that accompanies the BLAST
4160  *     code.
4161  *
4162  * The iteration on f(x) is robust and doesn't overflow; defining a
4163  * robust safeguarded Newton iteration on phi(lambda) that cannot
4164  * converge to lambda = 0 and that is protected against overflow is
4165  * more difficult.  So (despite the length of this comment) the Newton
4166  * iteration on f(x) is the simpler solution.
4167  */
4168 
4169 static Nlm_FloatHi
NlmKarlinLambdaNR(Nlm_FloatHi PNTR probs,BLAST_Score d,BLAST_Score low,BLAST_Score high,Nlm_FloatHi lambda0,Nlm_FloatHi tolx,int itmax,int maxNewton,int * itn)4170 NlmKarlinLambdaNR(Nlm_FloatHi PNTR probs, BLAST_Score d, BLAST_Score low,
4171                   BLAST_Score high, Nlm_FloatHi lambda0, Nlm_FloatHi tolx,
4172                   int itmax, int maxNewton, int * itn)
4173 {
4174   int k;
4175   Nlm_FloatHi x0, x, a = 0, b = 1;
4176   Nlm_FloatHi f = 4;  /* Larger than any possible value of the poly in [0,1] */
4177   int isNewton = 0; /* we haven't yet taken a Newton step. */
4178 
4179   assert( d > 0 );
4180 
4181 	x0 = exp( -lambda0 );
4182   x = ( 0 < x0 && x0 < 1 ) ? x0 : .5;
4183 
4184   for( k = 0; k < itmax; k++ ) { /* all iteration indices k */
4185     int i;
4186     Nlm_FloatHi g, fold = f;
4187     int wasNewton = isNewton; /* If true, then the previous step was a */
4188                               /* Newton step */
4189     isNewton  = 0;            /* Assume that this step is not */
4190 
4191     /* Horner's rule for evaluating a polynomial and its derivative */
4192     g = 0;
4193     f = probs[low];
4194     for( i = low + d; i < 0; i += d ) {
4195       g = x * g + f;
4196       f = f * x + probs[i];
4197     }
4198     g = x * g + f;
4199     f = f * x + probs[0] - 1;
4200     for( i = d; i <= high; i += d ) {
4201       g = x * g + f;
4202       f = f * x + probs[i];
4203     }
4204     /* End Horner's rule */
4205 
4206     if( f > 0 ) {
4207       a = x; /* move the left endpoint */
4208     } else if( f < 0 ) {
4209       b = x; /* move the right endpoint */
4210     } else { /* f == 0 */
4211       break; /* x is an exact solution */
4212     }
4213     if( b - a < 2 * a * ( 1 - b ) * tolx ) {
4214       /* The midpoint of the interval converged */
4215       x = (a + b) / 2; break;
4216     }
4217 
4218     if( k >= maxNewton ||
4219         /* If convergence of Newton's method appears to be failing; or */
4220 				( wasNewton && fabs( f ) > .9 * fabs(fold) ) ||
4221         /* if the previous iteration was a Newton step but didn't decrease
4222          * f sufficiently; or */
4223         g >= 0
4224         /* if a Newton step will move us away from the desired solution */
4225         ) { /* then */
4226       /* bisect */
4227       x = (a + b)/2;
4228     } else {
4229       /* try a Newton step */
4230       double p = - f/g;
4231       double y = x + p;
4232       if( y <= a || y >= b ) { /* The proposed iterate is not in (a,b) */
4233         x = (a + b)/2;
4234       } else { /* The proposed iterate is in (a,b). Accept it. */
4235         isNewton = 1;
4236         x = y;
4237         if( fabs( p ) < tolx * x * (1-x) ) break; /* Converged */
4238       } /* else the proposed iterate is in (a,b) */
4239     } /* else try a Newton step. */
4240   } /* end for all iteration indices k */
4241 	*itn = k;
4242   return -log(x)/d;
4243 }
4244 
4245 
4246 Nlm_FloatHi
BlastKarlinLambdaNR(BLAST_ScoreFreqPtr sfp)4247 BlastKarlinLambdaNR(BLAST_ScoreFreqPtr sfp)
4248 {
4249 	BLAST_Score	low;			/* Lowest score (must be negative)  */
4250 	BLAST_Score	high;			/* Highest score (must be positive) */
4251 	int		itn;
4252 	BLAST_Score	i, d;
4253 	Nlm_FloatHi PNTR	sprob;
4254 	Nlm_FloatHi	returnValue;
4255 
4256 	low = sfp->obs_min;
4257 	high = sfp->obs_max;
4258 	if (sfp->score_avg >= 0.) {	/* Expected score must be negative */
4259 		return -1.0;
4260 	}
4261 	if (BlastScoreChk(low, high) != 0) return -1.;
4262 
4263 	sprob = sfp->sprob;
4264 	/* Find greatest common divisor of all scores */
4265 	for (i = 1, d = -low; i <= high-low && d > 1; ++i) {
4266 		if (sprob[i+low] != 0.0) {
4267 			d = Nlm_Gcd(d, i);
4268 		}
4269 	}
4270 	returnValue =
4271 		NlmKarlinLambdaNR( sprob, d, low, high,
4272 											 BLAST_KARLIN_LAMBDA0_DEFAULT,
4273 											 BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT,
4274 											 20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
4275 
4276 
4277 	return returnValue;
4278 }
4279 
4280 Nlm_FloatHi LIBCALL
impalaKarlinLambdaNR(BLAST_ScoreFreqPtr sfp,Nlm_FloatHi initialLambda)4281 impalaKarlinLambdaNR(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi initialLambda)
4282 {
4283 	Nlm_FloatHi returnValue;
4284 	int itn;
4285 	Nlm_FloatHi PNTR	sprob = sfp->sprob;
4286 
4287 	if (sfp->score_avg >= 0.) {	/* Expected score must be negative */
4288 		return -1.0;
4289 	}
4290 	{
4291 		Boolean foundPositive = FALSE;
4292 		BLAST_Score	j;
4293 		for(j = 1; j <=sfp->obs_max; j++) {
4294 			if (sprob[j] > 0.0) {
4295 				foundPositive = TRUE;
4296 				break;
4297 			}
4298 		}
4299 		if (!foundPositive) return(-1);
4300 	}
4301 
4302 	returnValue =
4303 		NlmKarlinLambdaNR( sprob, 1, sfp->obs_min, sfp->obs_max,
4304 			                 initialLambda, BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT,
4305 											 20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
4306 
4307 	return returnValue;
4308 }
4309 
4310 
4311 
4312 /* Compute a divisor used to weight the evalue of a collection of
4313  * "nsegs" distinct alignments.  These divisors are used to compensate
4314  * for the effect of choosing the best among multiple collections of
4315  * alignments.  See
4316  *
4317  * Stephen F. Altschul. Evaluating the statitical significance of
4318  * multiple distinct local alignments. In Suhai, editior, Theoretical
4319  * and Computational Methods in Genome Research, pages 1-14. Plenum
4320  * Press, New York, 1997.
4321  *
4322  * The "decayrate" parameter of this routine is a value in the
4323  * interval (0,1). Typical values of decayrate are .1 and .5. */
4324 
4325 Nlm_FloatHi LIBCALL
BlastGapDecayDivisor(Nlm_FloatHi decayrate,unsigned nsegs)4326 BlastGapDecayDivisor(Nlm_FloatHi decayrate, unsigned nsegs )
4327 {
4328     return (1. - decayrate) * Nlm_Powi(decayrate, nsegs - 1);
4329 }
4330 
4331 /*
4332 	BlastCutoffs
4333 
4334 	Calculate the cutoff score, S, and the highest expected score.
4335 
4336 	WRG (later modified by TLM).
4337 */
4338 Int2 LIBCALL
BlastCutoffs(BLAST_ScorePtr S,Nlm_FloatHi PNTR E,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp,Nlm_Boolean dodecay,Nlm_FloatHi gap_decay_rate)4339 BlastCutoffs(BLAST_ScorePtr S, /* cutoff score */
4340 	Nlm_FloatHi PNTR E, /* expected no. of HSPs scoring at or above S */
4341 	BLAST_KarlinBlkPtr kbp,
4342 	Nlm_FloatHi searchsp, /* size of search space. */
4343 	Nlm_Boolean dodecay,  /* TRUE ==> use gapdecay feature */
4344   Nlm_FloatHi gap_decay_rate )
4345 {
4346 	BLAST_Score	s = *S, es;
4347 	Nlm_FloatHi	e = *E, esave;
4348 	Boolean		s_changed = FALSE;
4349 
4350 	if (kbp->Lambda == -1. || kbp->K == -1. || kbp->H == -1.)
4351 		return 1;
4352 
4353 	/*
4354 	Calculate a cutoff score, S, from the Expected
4355 	(or desired) number of reported HSPs, E.
4356 	*/
4357 	es = 1;
4358 	esave = e;
4359 	if (e > 0.)
4360 	{
4361         if (dodecay) {
4362             /* Invert the adjustment to the e-value that will be applied
4363              * to compensate for the effect of choosing the best among
4364              * multiple alignments */
4365             if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
4366                 e *= BlastGapDecayDivisor(gap_decay_rate, 1);
4367             }
4368         }
4369 		es = BlastKarlinEtoS_simple(e, kbp, searchsp);
4370 	}
4371 	/*
4372 	Pick the larger cutoff score between the user's choice
4373 	and that calculated from the value of E.
4374 	*/
4375 	if (es > s) {
4376 		s_changed = TRUE;
4377 		*S = s = es;
4378 	}
4379 
4380 	/*
4381 		Re-calculate E from the cutoff score, if E going in was too high
4382 	*/
4383 	if (esave <= 0. || !s_changed)
4384 	{
4385 		e = BlastKarlinStoE_simple(s, kbp, searchsp);
4386         if (dodecay) {
4387             /* Weight the e-value to compensate for the effect of
4388              * choosing the best of more than one collection of
4389              * distinct alignments */
4390             if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
4391                 e /= BlastGapDecayDivisor(gap_decay_rate, 1);
4392             }
4393         }
4394 		*E = e;
4395 	}
4396 
4397 	return 0;
4398 }
4399 
4400 /*
4401 	BlastKarlinEtoS() -- given an Expect value, return the associated cutoff score
4402 
4403 	Error return value is BLAST_SCORE_MIN
4404 */
4405 BLAST_Score LIBCALL
BlastKarlinEtoS(Nlm_FloatHi E,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi qlen,Nlm_FloatHi dblen)4406 BlastKarlinEtoS(Nlm_FloatHi	E,	/* Expect value */
4407 	BLAST_KarlinBlkPtr	kbp,
4408 	Nlm_FloatHi	qlen,	/* length of query sequence */
4409 	Nlm_FloatHi	dblen)	/* length of database */
4410 {
4411 	return BlastKarlinEtoS_simple(E, kbp, qlen*dblen);
4412 }
4413 
4414 
4415 /* Smallest float that might not cause a floating point exception in
4416 	S = (BLAST_Score) (ceil( log((Nlm_FloatHi)(K * searchsp / E)) / Lambda ));
4417 below.
4418 */
4419 #define BLASTKAR_SMALL_FLOAT 1.0e-297
4420 BLAST_Score LIBCALL
BlastKarlinEtoS_simple(Nlm_FloatHi E,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp)4421 BlastKarlinEtoS_simple(Nlm_FloatHi	E,	/* Expect value */
4422 	BLAST_KarlinBlkPtr	kbp,
4423 	Nlm_FloatHi	searchsp)	/* size of search space */
4424 {
4425 
4426 	Nlm_FloatHi	Lambda, K, H; /* parameters for Karlin statistics */
4427 	BLAST_Score	S;
4428 
4429 	Lambda = kbp->Lambda;
4430 	K = kbp->K;
4431 	H = kbp->H;
4432 	if (Lambda < 0. || K < 0. || H < 0.)
4433 	{
4434 		return BLAST_SCORE_MIN;
4435 	}
4436 
4437 	E = MAX(E, BLASTKAR_SMALL_FLOAT);
4438 
4439 	S = (BLAST_Score) (ceil( log((Nlm_FloatHi)(K * searchsp / E)) / Lambda ));
4440 	return S;
4441 }
4442 
4443 /*
4444 	BlastKarlinStoP
4445 
4446 	Calculate the probability (as opposed to expectation)
4447 	of achieving a particular score.
4448 
4449 	On error, return value is -1. (same as BlastKarlinStoE()).
4450 */
4451 Nlm_FloatHi LIBCALL
BlastKarlinStoP(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi qlen,Nlm_FloatHi dblen)4452 BlastKarlinStoP(BLAST_Score S,
4453 		BLAST_KarlinBlkPtr kbp,
4454 		Nlm_FloatHi	qlen,	/* length of query sequence */
4455 		Nlm_FloatHi	dblen)	/* length of database */
4456 {
4457 	return BlastKarlinStoP_simple(S, kbp, qlen*dblen);
4458 }
4459 
4460 Nlm_FloatHi LIBCALL
BlastKarlinStoP_simple(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp)4461 BlastKarlinStoP_simple(BLAST_Score S,
4462 		BLAST_KarlinBlkPtr kbp,
4463 		Nlm_FloatHi	searchsp)	/* size of search space. */
4464 {
4465 	Nlm_FloatHi	x, p;
4466 
4467 	x = BlastKarlinStoE_simple(S, kbp, searchsp);
4468 	if (x == -1.)
4469 		return x;
4470 	p = BlastKarlinEtoP(x);
4471 
4472 	return p;
4473 }
4474 
4475 /*
4476 	BlastKarlinStoE() -- given a score, return the associated Expect value
4477 
4478 	Error return value is -1.
4479 */
4480 Nlm_FloatHi LIBCALL
BlastKarlinStoE(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi qlen,Nlm_FloatHi dblen)4481 BlastKarlinStoE(BLAST_Score S,
4482 		BLAST_KarlinBlkPtr kbp,
4483 		Nlm_FloatHi	qlen,	/* length of query sequence */
4484 		Nlm_FloatHi	dblen)	/* length of database */
4485 {
4486 	return BlastKarlinStoE_simple(S, kbp, qlen*dblen);
4487 }
4488 
4489 Nlm_FloatHi LIBCALL
BlastKarlinStoE_simple(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp)4490 BlastKarlinStoE_simple(BLAST_Score S,
4491 		BLAST_KarlinBlkPtr kbp,
4492 		Nlm_FloatHi	searchsp)	/* size of search space. */
4493 {
4494 	Nlm_FloatHi	Lambda, K, H; /* parameters for Karlin statistics */
4495 
4496 	Lambda = kbp->Lambda;
4497 	K = kbp->K;
4498 	H = kbp->H;
4499 	if (Lambda < 0. || K < 0. || H < 0.) {
4500 		return -1.;
4501 	}
4502 
4503 	return searchsp * exp((Nlm_FloatHi)(-Lambda * S) + kbp->logK);
4504 }
4505 
4506 /*
4507 BlastKarlinPtoE -- convert a P-value to an Expect value
4508 When using BlastKarlinPtoE in the context of a database search,
4509 the returned E-value should be multiplied by the effective
4510 length of the database and divided by the effective lnegth of
4511 the subject.
4512 */
4513 Nlm_FloatHi LIBCALL
BlastKarlinPtoE(Nlm_FloatHi p)4514 BlastKarlinPtoE(Nlm_FloatHi p)
4515 {
4516         if (p < 0. || p > 1.0)
4517 	{
4518                 return INT4_MIN;
4519         }
4520 
4521 	if (p == 1)
4522 		return INT4_MAX;
4523 
4524         return -Nlm_Log1p(-p);
4525 }
4526 
4527 /*
4528 BlastKarlinEtoP -- convert an Expect value to a P-value
4529 When using BlastKarlinEtoP in the context of a database search,
4530 the input paramter  E-value should be divided by the effective
4531 length of the database and multiplied by the effective lnegth of
4532 the subject, before BlastKarlinEtoP is called.
4533 */
4534 Nlm_FloatHi LIBCALL
BlastKarlinEtoP(Nlm_FloatHi x)4535 BlastKarlinEtoP(Nlm_FloatHi x)
4536 {
4537 	return -Nlm_Expm1((Nlm_FloatHi)-x);
4538 }
4539 
4540 /*
4541 	BlastKarlinStoLen()
4542 
4543 	Given a score, return the length expected for an HSP of that score
4544 */
4545 Nlm_FloatHi LIBCALL
BlastKarlinStoLen(BLAST_KarlinBlkPtr kbp,BLAST_Score S)4546 BlastKarlinStoLen(BLAST_KarlinBlkPtr kbp, BLAST_Score S)
4547 {
4548 	return kbp->Lambda * S / kbp->H;
4549 }
4550 
4551 static Nlm_FloatHi	tab2[] = { /* table for r == 2 */
4552 0.01669,  0.0249,   0.03683,  0.05390,  0.07794,  0.1111,   0.1559,   0.2146,
4553 0.2890,   0.3794,   0.4836,   0.5965,   0.7092,   0.8114,   0.8931,   0.9490,
4554 0.9806,   0.9944,   0.9989
4555 		};
4556 
4557 static Nlm_FloatHi	tab3[] = { /* table for r == 3 */
4558 0.9806,   0.9944,   0.9989,   0.0001682,0.0002542,0.0003829,0.0005745,0.0008587,
4559 0.001278, 0.001893, 0.002789, 0.004088, 0.005958, 0.008627, 0.01240,  0.01770,
4560 0.02505,  0.03514,  0.04880,  0.06704,  0.09103,  0.1220,   0.1612,   0.2097,
4561 0.2682,   0.3368,   0.4145,   0.4994,   0.5881,   0.6765,   0.7596,   0.8326,
4562 0.8922,   0.9367,   0.9667,   0.9846,   0.9939,   0.9980
4563 		};
4564 
4565 static Nlm_FloatHi	tab4[] = { /* table for r == 4 */
4566 2.658e-07,4.064e-07,6.203e-07,9.450e-07,1.437e-06,2.181e-06,3.302e-06,4.990e-06,
4567 7.524e-06,1.132e-05,1.698e-05,2.541e-05,3.791e-05,5.641e-05,8.368e-05,0.0001237,
4568 0.0001823,0.0002677,0.0003915,0.0005704,0.0008275,0.001195, 0.001718, 0.002457,
4569 0.003494, 0.004942, 0.006948, 0.009702, 0.01346,  0.01853,  0.02532,  0.03431,
4570 0.04607,  0.06128,  0.08068,  0.1051,   0.1352,   0.1719,   0.2157,   0.2669,
4571 0.3254,   0.3906,   0.4612,   0.5355,   0.6110,   0.6849,   0.7544,   0.8168,
4572 0.8699,   0.9127,   0.9451,   0.9679,   0.9827,   0.9915,   0.9963
4573 		};
4574 
4575 static Nlm_FloatHi PNTR table[] = { tab2, tab3, tab4 };
4576 static short tabsize[] = { DIM(tab2)-1, DIM(tab3)-1, DIM(tab4)-1 };
4577 
4578 static Nlm_FloatHi LIBCALL f PROTO((Nlm_FloatHi,Nlm_VoidPtr));
4579 static Nlm_FloatHi LIBCALL g PROTO((Nlm_FloatHi,Nlm_VoidPtr));
4580 
4581 
4582 /*
4583     Estimate the Sum P-value by calculation or interpolation, as appropriate.
4584 	Approx. 2-1/2 digits accuracy minimum throughout the range of r, s.
4585 	r = number of segments
4586 	s = total score (in nats), adjusted by -r*log(KN)
4587 */
4588 Nlm_FloatHi LIBCALL
BlastSumP(Int4 r,Nlm_FloatHi s)4589 BlastSumP(Int4 r, Nlm_FloatHi s)
4590 {
4591 	Int4		i, r1, r2;
4592 	Nlm_FloatHi	a;
4593 
4594 	if (r == 1)
4595 		return -Nlm_Expm1(-exp(-s));
4596 
4597 	if (r <= 4) {
4598 		if (r < 1)
4599 			return 0.;
4600 		r1 = r - 1;
4601 		if (s >= r*r+r1) {
4602 			a = Nlm_LnGammaInt(r+1);
4603 			return r * exp(r1*log(s)-s-a-a);
4604 		}
4605 		if (s > -2*r) {
4606 			/* interpolate */
4607 			i = (Int4) (a = s+s+(4*r));
4608 			a -= i;
4609 			i = tabsize[r2 = r - 2] - i;
4610 			return a*table[r2][i-1] + (1.-a)*table[r2][i];
4611 		}
4612 		return 1.;
4613 	}
4614 
4615 	return BlastSumPCalc(r, s);
4616 }
4617 
4618 /*
4619     BlastSumPCalc
4620 
4621     Evaluate the following Nlm_FloatHi integral, where r = number of segments
4622     and s = the adjusted score in nats:
4623 
4624                     (r-2)         oo           oo
4625      Prob(r,s) =   r              -            -   (r-2)
4626                  -------------   |   exp(-y)  |   x   exp(-exp(x - y/r)) dx dy
4627                  (r-1)! (r-2)!  U            U
4628                                 s            0
4629 */
4630 static Nlm_FloatHi
BlastSumPCalc(int r,Nlm_FloatHi s)4631 BlastSumPCalc(int r, Nlm_FloatHi s)
4632 {
4633 	int		r1, itmin;
4634 	Nlm_FloatHi	t, d, epsilon;
4635 	Nlm_FloatHi	est_mean, mean, stddev, stddev4;
4636 	Nlm_FloatHi	xr, xr1, xr2, logr;
4637 	Nlm_FloatHi	args[6];
4638 
4639 	epsilon = BLAST_SUMP_EPSILON_DEFAULT; /* accuracy for SumP calcs. */
4640 
4641 	if (r == 1) {
4642 		if (s > 8.)
4643 			return exp(-s);
4644 		return -Nlm_Expm1(-exp(-s));
4645 	}
4646 	if (r < 1)
4647 		return 0.;
4648 
4649 	xr = r;
4650 
4651 	if (r < 8) {
4652 		if (s <= -2.3*xr)
4653 			return 1.;
4654 	}
4655 	else if (r < 15) {
4656 			if (s <= -2.5*xr)
4657 				return 1.;
4658 	}
4659 	else if (r < 27) {
4660 			if (s <= -3.0*xr)
4661 				return 1.;
4662 	}
4663 	else if (r < 51) {
4664 			if (s <= -3.4*xr)
4665 				return 1.;
4666 	}
4667 	else if (r < 101) {
4668 			if (s <= -4.0*xr)
4669 				return 1.;
4670 	}
4671 
4672 	/* stddev in the limit of infinite r, but quite good for even small r */
4673 	stddev = sqrt(xr);
4674 	stddev4 = 4.*stddev;
4675 	xr1 = r1 = r - 1;
4676 
4677 	if (r > 100) {
4678 		/* Calculate lower bound on the mean using inequality log(r) <= r */
4679 		est_mean = -xr * xr1;
4680 		if (s <= est_mean - stddev4)
4681 			return 1.;
4682 	}
4683 
4684 	/* mean is rather close to the mode, and the mean is readily calculated */
4685 	/* mean in the limit of infinite r, but quite good for even small r */
4686 	logr = log(xr);
4687 	mean = xr * (1. - logr) - 0.5;
4688 	if (s <= mean - stddev4)
4689 		return 1.;
4690 
4691 	if (s >= mean) {
4692 		t = s + 6.*stddev;
4693 		itmin = 1;
4694 	}
4695 	else {
4696 		t = mean + 6.*stddev;
4697 		itmin = 2;
4698 	}
4699 
4700 #define ARG_R args[0]
4701 #define ARG_R2 args[1]
4702 #define ARG_ADJ1 args[2]
4703 #define ARG_ADJ2 args[3]
4704 #define ARG_SDIVR args[4]
4705 #define ARG_EPS args[5]
4706 
4707 	ARG_R = xr;
4708 	ARG_R2 = xr2 = r - 2;
4709 	ARG_ADJ1 = xr2*logr - Nlm_LnGammaInt(r1) - Nlm_LnGammaInt(r);
4710 	ARG_EPS = epsilon;
4711 
4712 	do {
4713 		d = Nlm_RombergIntegrate(g, args, s, t, epsilon, 0, itmin);
4714 #ifdef BLASTKAR_HUGE_VAL
4715 		if (d == BLASTKAR_HUGE_VAL)
4716 			return d;
4717 #endif
4718 	} while (s < mean && d < 0.4 && itmin++ < 4);
4719 
4720 	return (d < 1. ? d : 1.);
4721 }
4722 
4723 static Nlm_FloatHi LIBCALL
g(Nlm_FloatHi s,Nlm_VoidPtr vp)4724 g(Nlm_FloatHi	s, Nlm_VoidPtr	vp)
4725 {
4726 	register Nlm_FloatHi PNTR	args = vp;
4727 	Nlm_FloatHi	mx;
4728 
4729 	ARG_ADJ2 = ARG_ADJ1 - s;
4730 	ARG_SDIVR = s / ARG_R;	/* = s / r */
4731 	mx = (s > 0. ? ARG_SDIVR + 3. : 3.);
4732 	return Nlm_RombergIntegrate(f, vp, 0., mx, ARG_EPS, 0, 1);
4733 }
4734 
4735 static Nlm_FloatHi LIBCALL
f(Nlm_FloatHi x,Nlm_VoidPtr vp)4736 f(Nlm_FloatHi	x, Nlm_VoidPtr	vp)
4737 {
4738 	register Nlm_FloatHi PNTR	args = vp;
4739 	register Nlm_FloatHi	y;
4740 
4741 	y = exp(x - ARG_SDIVR);
4742 #ifdef BLASTKAR_HUGE_VAL
4743 	if (y == BLASTKAR_HUGE_VAL)
4744 		return 0.;
4745 #endif
4746 	if (ARG_R2 == 0.)
4747 		return exp(ARG_ADJ2 - y);
4748 	if (x == 0.)
4749 		return 0.;
4750 	return exp(ARG_R2*log(x) + ARG_ADJ2 - y);
4751 }
4752 
4753 /*
4754     Calculates the e-value for alignments with "small" gaps (typically
4755     under fifty residues/basepairs) following ideas of Stephen Altschul's.
4756 */
4757 
4758 Nlm_FloatHi LIBCALL
BlastSmallGapSumE(Int4 starting_points,Int2 num,Nlm_FloatHi xsum,Int4 query_length,Int4 subject_length,Int8 dblen_eff,Nlm_FloatHi weight_divisor)4759 BlastSmallGapSumE(
4760     Int4 starting_points,       /* the number of starting points
4761                                  * permitted between adjacent
4762                                  * alignments;
4763                                  * max_overlap + max_gap + 1 */
4764     Int2 num,                   /* the number of distinct alignments in this
4765                                  * collection */
4766     Nlm_FloatHi xsum,           /* the sum of the scores of these alignments
4767                                  * each weighted by an appropriate value of
4768                                  * Lambda and logK */
4769     Int4 query_length,          /* the effective len of the query seq */
4770     Int4 subject_length,        /* the effective len of the database seq */
4771     Int8 dblen_eff,             /* the effective len of the database */
4772     Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4773                                  * when multiple collections of alignments
4774                                  * are being considered by the calling
4775                                  * routine */
4776 {
4777     Nlm_FloatHi search_space;   /* The effective size of the search space */
4778     Nlm_FloatHi sum_e;          /* The e-value of this set of alignments */
4779 
4780     if(num == 1) {
4781         search_space = (Nlm_FloatHi) query_length * (Nlm_FloatHi)dblen_eff;
4782 
4783         sum_e = search_space * exp(-xsum);
4784     } else {
4785         Nlm_FloatHi sum_p;      /* The p-value of this set of alignments */
4786 
4787         search_space = (Nlm_FloatHi)subject_length * (Nlm_FloatHi)query_length;
4788 
4789         xsum -=
4790           log(search_space) + 2 * (num-1)*log((Nlm_FloatHi)starting_points);
4791         xsum -= LnFactorial((Nlm_FloatHi) num);
4792 
4793         sum_p = BlastSumP(num, xsum);
4794         sum_e = BlastKarlinPtoE(sum_p) *
4795             ((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4796     }
4797     if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4798         sum_e = INT4_MAX;
4799     }
4800 
4801     return sum_e;
4802 }
4803 
4804 
4805 /*
4806     Calculates the e-value of a collection multiple distinct
4807     alignments with asymmetric gaps between the alignments. The gaps
4808     in one (protein) sequence are typically small (like in
4809     BlastSmallGapSumE) gap an the gaps in the other (translated DNA)
4810     sequence are possibly large (up to 4000 bp.)  This routine is used
4811     for linking HSPs representing exons in the DNA sequence that are
4812     separated by introns.
4813 */
4814 
4815 Nlm_FloatHi LIBCALL
BlastUnevenGapSumE(Int4 query_start_points,Int4 subject_start_points,Int2 num,Nlm_FloatHi xsum,Int4 query_length,Int4 subject_length,Int8 dblen_eff,Nlm_FloatHi weight_divisor)4816 BlastUnevenGapSumE(
4817     Int4 query_start_points,    /* the number of starting points in
4818                                  * the query sequence permitted
4819                                  * between adjacent alignments;
4820                                  * max_overlap + max_gap + 1 */
4821     Int4 subject_start_points,  /* the number of starting points in
4822                                  * the subject sequence permitted
4823                                  * between adjacent alignments */
4824     Int2 num,                   /* the number of distinct alignments in this
4825                                  * collection */
4826     Nlm_FloatHi xsum,           /* the sum of the scores of these alignments
4827                                  * each weighted by an appropriate value of
4828                                  * Lambda and logK */
4829     Int4 query_length,          /* the effective len of the query seq */
4830     Int4 subject_length,        /* the effective len of the database seq */
4831     Int8 dblen_eff,             /* the effective len of the database */
4832     Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4833                                  * when multiple collections of alignments
4834                                  * are being considered by the calling
4835                                  * routine */
4836 {
4837     Nlm_FloatHi search_space;   /* The effective size of the search space */
4838     Nlm_FloatHi sum_e;          /* The e-value of this set of alignments */
4839 
4840     if( num == 1 ) {
4841         search_space = (Nlm_FloatHi)query_length * (Nlm_FloatHi)dblen_eff;
4842 
4843         sum_e = search_space * exp(-xsum);
4844     } else {
4845         Nlm_FloatHi sum_p;        /* The p-value of this set of alignments */
4846 
4847         search_space = (Nlm_FloatHi)subject_length * (Nlm_FloatHi)query_length;
4848 
4849         xsum -= log(search_space) +
4850             (num-1)*(log((Nlm_FloatHi) query_start_points) +
4851                      log((Nlm_FloatHi) subject_start_points));
4852         xsum -= LnFactorial((Nlm_FloatHi) num);
4853 
4854         sum_p = BlastSumP(num, xsum);
4855 
4856         sum_e = BlastKarlinPtoE(sum_p) *
4857             ((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4858     }
4859     if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4860         sum_e = INT4_MAX;
4861     }
4862 
4863     return sum_e;
4864 }
4865 
4866 /*
4867     Calculates the e-value if a collection of distinct alignments with
4868     arbitrarily large gaps between the alignments
4869 */
4870 
4871 Nlm_FloatHi LIBCALL
BlastLargeGapSumE(Int2 num,Nlm_FloatHi xsum,Int4 query_length,Int4 subject_length,Int8 dblen_eff,Nlm_FloatHi weight_divisor)4872 BlastLargeGapSumE(
4873     Int2 num,                   /* the number of distinct alignments in this
4874                                  * collection */
4875     Nlm_FloatHi xsum,           /* the sum of the scores of these alignments
4876                                  * each weighted by an appropriate value of
4877                                  * Lambda and logK */
4878     Int4 query_length,          /* the effective len of the query seq */
4879     Int4 subject_length,        /* the effective len of the database seq */
4880     Int8 dblen_eff,             /* the effective len of the database */
4881     Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4882                                  * when multiple collections of alignments
4883                                  * are being considered by the calling
4884                                  * routine */
4885 {
4886     Nlm_FloatHi sum_p;          /* The p-value of this set of alignments */
4887     Nlm_FloatHi sum_e;          /* The e-value of this set of alignments */
4888 
4889 /* The next two variables are for compatability with Warren's code. */
4890     Nlm_FloatHi lcl_subject_length;     /* query_length as a float */
4891     Nlm_FloatHi lcl_query_length;       /* subject_length as a float */
4892 
4893     lcl_query_length = (Nlm_FloatHi) query_length;
4894     lcl_subject_length = (Nlm_FloatHi) subject_length;
4895 
4896     if( num == 1 ) {
4897         sum_e = lcl_query_length * (Nlm_FloatHi) dblen_eff * exp(-xsum);
4898     } else {
4899         xsum -= num*log(lcl_subject_length*lcl_query_length)
4900             - LnFactorial((Nlm_FloatHi) num);
4901 
4902         sum_p = BlastSumP(num, xsum);
4903 
4904         sum_e = BlastKarlinPtoE(sum_p) *
4905             ((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4906     }
4907     if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4908         sum_e = INT4_MAX;
4909     }
4910 
4911     return sum_e;
4912 }
4913 
4914 
4915 /********************************************************************
4916 *
4917 *	The following function, from Stephen Altschul, calculates a
4918 *	pseudoscore from a p-vlaue and n, the product of the database
4919 *	and query lengths.
4920 *	double	p;		 p-value
4921 *	double	n;		 search space
4922 *********************************************************************/
4923 /* Move the following constant into blast.h??, or only the last one. */
4924 #define		PSCALE 20.0
4925 #define 	PSEUDO_SCORE_MAX 32767
4926 #define 	SYBASE_MIN 1.0e-300
4927 
4928 Int2
ConvertPtoPseudoS(Nlm_FloatHi p,Nlm_FloatHi n)4929 ConvertPtoPseudoS(Nlm_FloatHi p, Nlm_FloatHi n)
4930 {
4931 	Int2	s;
4932 	Nlm_FloatHi	E;
4933 
4934 /* If p is 1.0, then E is very large and E/n is about one. */
4935 	if (p > 0.99)
4936 		return 0.5;
4937 /* If p is very small, the highest score should be returned. */
4938 	else if (p < SYBASE_MIN)
4939 		return PSEUDO_SCORE_MAX;
4940 
4941 /* E = -ln(1-p); the following calculates it. */
4942         E = -Nlm_Log1p(-p);
4943 
4944 	s= ConvertEtoPseudoS (E, n);
4945 
4946 	return s;
4947 }
4948 
4949 /*******************************************************************
4950 *
4951 *	This function calculates a pseudo-score from an E value.
4952 *	The searchsp is the product of the query and database
4953 *	lengths.  As the E value is directly related to the search
4954 *	space, this effectively scales the database size out of
4955 *	the calculation of the pseudo-score.
4956 *******************************************************************/
4957 Int2
ConvertEtoPseudoS(Nlm_FloatHi E,Nlm_FloatHi searchsp)4958 ConvertEtoPseudoS(Nlm_FloatHi E, Nlm_FloatHi searchsp)
4959 {
4960 	Int2	s;
4961 
4962 /* If E is very small, the highest score should be returned. */
4963 	if (E < SYBASE_MIN)
4964 		return PSEUDO_SCORE_MAX;
4965 
4966 	if (E > searchsp)
4967 		return 0;
4968 
4969 /* 0.5 is added to make sure this is rounded up, not down. */
4970 	s= 0.5-PSCALE*log(E/searchsp);
4971 
4972 	return s;
4973 }
4974 
4975 
4976 
4977 /*
4978 Given a pseudoscore, a subroutine for calculating an E-value for a comparison
4979 of size n (the product of the sequence length) is:
4980 */
4981 
4982 Nlm_FloatHi
ConvertPseudoStoE(Int2 s,Nlm_FloatHi n)4983 ConvertPseudoStoE(Int2 s, Nlm_FloatHi n)
4984 {
4985 	return n*exp(-s/PSCALE);
4986 }
4987 
4988 /****************************************************************************
4989 *
4990 *	This function attaches a new ScorePtr to the "*old" one passed
4991 *	in.  If *old is NULL, then *old is set equal to the new ptr.
4992 *	The new pointer (NOT the head of the chain) is returned.
4993 *
4994 *	If the value is an int, then set prob to zero and pass the value in
4995 *	as score; if the value is a Nlm_FloatHi, pass it in as prob.
4996 *
4997 *	The type of score is stored in an Object-id.str
4998 ****************************************************************************/
4999 
5000 ScorePtr
MakeBlastScore(ScorePtr PNTR old,CharPtr scoretype,Nlm_FloatHi prob,Int4 score)5001 MakeBlastScore (ScorePtr PNTR old, CharPtr scoretype, Nlm_FloatHi prob, Int4 score)
5002 
5003 {
5004 	ScorePtr	scrp, scrp1;
5005 
5006 	scrp = ScoreNew();
5007 
5008 	if (score)
5009 	{
5010 		scrp->choice = 1;
5011 		scrp->value.intvalue = score;
5012 	}
5013 	else
5014 	{
5015 		scrp->choice = 2;
5016 		scrp->value.realvalue = (Nlm_FloatHi) prob;
5017 	}
5018 
5019 	scrp->id = ObjectIdNew();
5020 	scrp->id->str = StringSave(scoretype);
5021 
5022 
5023 	if (*old == NULL)
5024 		*old = scrp;
5025 	else
5026 	{
5027 		for (scrp1=*old; scrp1->next; scrp1=scrp1->next)
5028 			;
5029 		scrp1->next = scrp;
5030 	}
5031 
5032 
5033 	return scrp;
5034 }
5035 
5036 #ifndef TX_MATRIX_SIZE
5037 #define TX_MATRIX_SIZE 128
5038 #endif
5039 
BlastMatrixToTxMatrix(BLAST_MatrixPtr blast_matrix)5040 Int4Ptr PNTR LIBCALL BlastMatrixToTxMatrix(BLAST_MatrixPtr blast_matrix)
5041 {
5042    Uint1 i, j, index1, index2;
5043    Int4Ptr PNTR matrix = blast_matrix->original_matrix;
5044    Int4Ptr PNTR txmatrix;
5045    SeqMapTablePtr smtp;
5046    SeqCodeTablePtr sctp;
5047 
5048    if (!blast_matrix->is_prot || matrix == NULL)
5049       return NULL;
5050 
5051    sctp = SeqCodeTableFindObj(Seq_code_ncbistdaa);
5052    smtp = SeqMapTableFind(Seq_code_ncbieaa, Seq_code_ncbistdaa);
5053 
5054    txmatrix = Malloc(TX_MATRIX_SIZE*sizeof(Int4Ptr));
5055 
5056    for (i=0; i<TX_MATRIX_SIZE; i++) {
5057       txmatrix[i] = Malloc(TX_MATRIX_SIZE*sizeof(Int4));
5058       for (j=0; j<TX_MATRIX_SIZE; j++)
5059          txmatrix[i][j] = BLAST_SCORE_MIN;
5060    }
5061 
5062    for (i=sctp->start_at; i < sctp->start_at + sctp->num; i++) {
5063       for (j=sctp->start_at; j < sctp->start_at + sctp->num; j++) {
5064          index1 = SeqMapTableConvert(smtp, i);
5065          index2 = SeqMapTableConvert(smtp, j);
5066          txmatrix[index1][index2] = matrix[i][j];
5067       }
5068    }
5069 
5070    return txmatrix;
5071 }
5072 
TxMatrixDestruct(Int4Ptr PNTR txmatrix)5073 Int4Ptr PNTR LIBCALL TxMatrixDestruct(Int4Ptr PNTR txmatrix)
5074 {
5075    Int2 i;
5076 
5077    if (txmatrix == NULL)
5078       return NULL;
5079 
5080    for (i=0; i<TX_MATRIX_SIZE; i++)
5081       MemFree(txmatrix[i]);
5082 
5083    MemFree(txmatrix);
5084    return NULL;
5085 }
5086 
5087 
5088 /**
5089  * Computes the adjustment to the lengths of the query and database sequences
5090  * that is used to compensate for edge effects when computing evalues.
5091  *
5092  * The length adjustment is an integer-valued approximation to the fixed
5093  * point of the function
5094  *
5095  *    f(ell) = beta +
5096  *               (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
5097  *
5098  * where m is the query length n is the length of the database and N is the
5099  * number of sequences in the database. The values beta, alpha, lambda and
5100  * K are statistical, Karlin-Altschul parameters.
5101  *
5102  * The value of the length adjustment computed by this routine, A,
5103  * will always be an integer smaller than the fixed point of
5104  * f(ell). Usually, it will be the largest such integer.  However, the
5105  * computed length adjustment, A, will also be so small that
5106  *
5107  *    K * (m - A) * (n - N * A) > min(m,n).
5108  *
5109  * Moreover, an iterative method is used to compute A, and under
5110  * unusual circumstances the iterative method may not converge.
5111  *
5112  * @param K      the statistical parameter K
5113  * @param logK   the natural logarithm of K
5114  * @param alpha_d_lambda    the ratio of the statistical parameters
5115  *                          alpha and lambda (for ungapped alignments, the
5116  *                          value 1/H should be used)
5117  * @param beta              the statistical parameter beta (for ungapped
5118  *                          alignments, beta == 0)
5119  * @param query_length      the length of the query sequence
5120  * @param db_length         the length of the database
5121  * @param db_num_seq        the number of sequences in the database
5122  * @param length_adjustment the computed value of the length adjustment [out]
5123  *
5124  * @return   0 if length_adjustment is known to be the largest integer less
5125  *           than the fixed point of f(ell); 1 otherwise.
5126  */
5127 Int4
BlastComputeLengthAdjustment(Nlm_FloatHi K,Nlm_FloatHi logK,Nlm_FloatHi alpha_d_lambda,Nlm_FloatHi beta,Int4 query_length,Int8 db_length,Int4 db_num_seqs,Int4 * length_adjustment)5128 BlastComputeLengthAdjustment(Nlm_FloatHi K,
5129                              Nlm_FloatHi logK,
5130                              Nlm_FloatHi alpha_d_lambda,
5131                              Nlm_FloatHi beta,
5132                              Int4 query_length,
5133                              Int8 db_length,
5134                              Int4 db_num_seqs,
5135                              Int4 * length_adjustment)
5136 {
5137     Int4 i;                     /* iteration index */
5138     const Int4 maxits = 20;     /* maximum allowed iterations */
5139     Nlm_FloatHi m = query_length, n = db_length, N = db_num_seqs;
5140 
5141     Nlm_FloatHi ell;            /* A float value of the length adjustment */
5142     Nlm_FloatHi ss;             /* effective size of the search space */
5143     Nlm_FloatHi ell_min = 0, ell_max;   /* At each iteration i,
5144                                          * ell_min <= ell <= ell_max. */
5145     Boolean converged    = FALSE;       /* True if the iteration converged */
5146     Nlm_FloatHi ell_next = 0;   /* Value the variable ell takes at iteration
5147                                  * i + 1 */
5148     /* Choose ell_max to be the largest nonnegative value that satisfies
5149      *
5150      *    K * (m - ell) * (n - N * ell) > max(m,n)
5151      *
5152      * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
5153     { /* scope of a, mb, and c, the coefficients in the quadratic formula
5154        * (the variable mb is -b) */
5155         Nlm_FloatHi a  = N;
5156         Nlm_FloatHi mb = m * N + n;
5157         Nlm_FloatHi c  = n * m - MAX(m, n) / K;
5158 
5159         if(c < 0) {
5160             *length_adjustment = 0;
5161             return 1;
5162         } else {
5163             ell_max = 2 * c / (mb + sqrt(mb * mb - 4 * a * c));
5164         }
5165     } /* end scope of a, mb and c */
5166 
5167     for(i = 1; i <= maxits; i++) {      /* for all iteration indices */
5168         Nlm_FloatHi ell_bar;    /* proposed next value of ell */
5169         ell      = ell_next;
5170         ss       = (m - ell) * (n - N * ell);
5171         ell_bar  = alpha_d_lambda * (logK + log(ss)) + beta;
5172         if(ell_bar >= ell) { /* ell is no bigger than the true fixed point */
5173             ell_min = ell;
5174             if(ell_bar - ell_min <= 1.0) {
5175                 converged = TRUE;
5176                 break;
5177             }
5178             if(ell_min == ell_max) { /* There are no more points to check */
5179                 break;
5180             }
5181         } else { /* else ell is greater than the true fixed point */
5182             ell_max = ell;
5183         }
5184         if(ell_min <= ell_bar && ell_bar <= ell_max) {
5185           /* ell_bar is in range. Accept it */
5186             ell_next = ell_bar;
5187         } else { /* else ell_bar is not in range. Reject it */
5188             ell_next = (i == 1) ? ell_max : (ell_min + ell_max) / 2;
5189         }
5190     } /* end for all iteration indices */
5191     if(converged) { /* the iteration converged */
5192         /* If ell_fixed is the (unknown) true fixed point, then we
5193          * wish to set (*length_adjustment) to floor(ell_fixed).  We
5194          * assume that floor(ell_min) = floor(ell_fixed) */
5195         *length_adjustment = (Int4) ell_min;
5196         /* But verify that ceil(ell_min) != floor(ell_fixed) */
5197         ell = ceil(ell_min);
5198         if( ell <= ell_max ) {
5199           ss = (m - ell) * (n - N * ell);
5200           if(alpha_d_lambda * (logK + log(ss)) + beta >= ell) {
5201             /* ceil(ell_min) == floor(ell_fixed) */
5202             *length_adjustment = (Int4) ell;
5203           }
5204         }
5205     } else { /* else the iteration did not converge. */
5206         /* Use the best value seen so far */
5207         *length_adjustment = (Int4) ell_min;
5208     }
5209 
5210     return converged ? 0 : 1;
5211 }
5212