1 static char const rcsid[] = "$Id: blastkar.c,v 6.115 2006/12/14 17:08:19 madden Exp $";
2
3 /* ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================*/
26 /*****************************************************************************
27
28 File name: blastkar.c
29
30 Author: Tom Madden
31
32 Contents: Functions to calculate BLAST probabilities etc.
33
34 Detailed Contents:
35
36 - allocate and deallocate structures used by BLAST to calculate
37 probabilities etc.
38
39 - calculate residue frequencies for query and "average" database.
40
41 - read in matrix.
42
43 - calculate sum-p from a collection of HSP's, for both the case
44 of a "small" gap and a "large" gap, when give a total score and the
45 number of HSP's.
46
47 - calculate expect values for p-values.
48
49 - calculate pseuod-scores from p-values.
50
51 ******************************************************************************
52 * $Revision: 6.115 $
53 * $Log: blastkar.c,v $
54 * Revision 6.115 2006/12/14 17:08:19 madden
55 * Fix BLAST_MatrixDestruct to not use hard-coded alphabet size of 26 (from Mike Gertz)
56 *
57 * Revision 6.114 2006/10/02 12:36:01 madden
58 * Comment out BLOSUM62_20
59 *
60 * Revision 6.113 2006/04/07 13:46:24 madden
61 * Improved the comment for NlmKarlinLambdaNR. Reformatted the
62 * function prototype to fit in 80 characters. (from Mike Gertz).
63 *
64 * Revision 6.112 2005/10/31 14:16:10 madden
65 * Add support for reward/penalty of 1/-5, 3/-4, and 3/-2
66 *
67 * Revision 6.111 2005/10/14 16:32:34 madden
68 * Add preliminary support for vecscreen parameters
69 *
70 * Revision 6.110 2005/10/12 19:21:03 madden
71 * Fix bug in s_GetNuclValuesArray
72 *
73 * Revision 6.109 2005/10/06 12:51:39 madden
74 * Add code to produce correct statistics for gapped blastn, changes include:
75 *
76 * 1.) series of array_of_8 structs with precalculated data
77 * 2.) function s_GetNuclValuesArray
78 * 3.) function BlastKarlinGetNuclAlphaBeta
79 * 4.) function BlastKarlinBlkNuclGappedCalc
80 *
81 * Revision 6.108 2005/08/09 14:14:46 dondosha
82 * From A. Shaffer: added comments to clarify usage of BlastKarlinEtoP and BlastKarlinPtoE
83 *
84 * Revision 6.107 2005/07/28 14:57:09 coulouri
85 * remove dead code
86 *
87 * Revision 6.106 2005/07/27 17:48:57 coulouri
88 * remove hardcoded paths
89 *
90 * Revision 6.105 2005/04/27 17:20:45 papadopo
91 * copy X scores to U scores when building score matrix
92 *
93 * Revision 6.104 2005/04/20 19:02:15 lavr
94 * +<assert.h>
95 *
96 * Revision 6.103 2005/01/31 17:02:03 dondosha
97 * Fixed bug in BlastKarlinLHtoK for blastn with penalty == -reward
98 *
99 * Revision 6.102 2004/09/28 16:04:19 papadopo
100 * From Michael Gertz:
101 * 1. Pass the effective database size into BlastSmallGapSumE,
102 * BlastLargeGapSumE and BlastUnevenGapSumE. The routines use this
103 * value in a simplified formula to compute the e-value of singleton sets.
104 * 2. Caused all routines for calculating the significance of multiple
105 * distinct alignments (BlastSmallGapSumE, BlastLargeGapSumE and
106 * BlastUnevenGapSumE) to use
107 *
108 * sum_{i in linked_set} (\lambda_i s_i - \ln K_i)
109 *
110 * as the weighted sum score, where (\lambda_i, K_i) are taken from
111 * the appropriate query context.
112 *
113 * Revision 6.101 2004/06/07 20:03:23 coulouri
114 * use floating point constants for comparisons with floating point variables
115 *
116 * Revision 6.100 2004/04/28 14:36:00 madden
117 * Changes from Mike Gertz:
118 * - I created the new routine BlastGapDecayDivisor that computes a
119 * divisor used to weight the evalue of a collection of distinct
120 * alignments.
121 * - I removed BlastGapDecay and BlastGapDecayInverse which had become
122 * redundant.
123 * - I modified the BlastCutoffs routine so that it uses the value
124 * returned by BlastGapDecayDivisor to weight evalues.
125 * - I modified BlastSmallGapSumE, BlastLargeGapSumE and
126 * BlastUnevenGapSumE no longer refer to the gap_prob parameter.
127 * Replaced the gap_decay_rate parameter of each of these routines with
128 * a weight_divisor parameter. Added documentation.
129 *
130 * Revision 6.99 2004/04/23 13:49:43 madden
131 * Cleaned up ifndef in BlastKarlinLHtoK
132 *
133 * Revision 6.98 2004/04/23 13:19:53 madden
134 * Rewrote BlastKarlinLHtoK to do the following and more:
135 * 1. fix a bug whereby the wrong formula was used when high score == 1
136 * and low score == -1;
137 * 2. fix a methodological error of truncating the first sum
138 * and trying to make it converge quickly by adding terms
139 * of a geometric progression, even though the geometric progression
140 * estimate is not correct in all cases;
141 * the old adjustment code is left in for historical purposes but
142 * #ifdef'd out
143 * 3. Eliminate the Boolean bi_modal_score variable. The old test that
144 * set the value of bi_modal_score would frequently fail to choose the
145 * correct value due to rounding error.
146 * 4. changed numerous local variable names to make them more meaningful;
147 * 5. added substantial comments to explain what the procedure
148 * is doing and what each variable represents
149 *
150 * Revision 6.97 2004/04/01 13:43:08 lavr
151 * Spell "occurred", "occurrence", and "occurring"
152 *
153 * Revision 6.96 2004/03/31 17:58:51 papadopo
154 * Mike Gertz' changes for length adjustment calculations
155 *
156 * Revision 6.95 2003/12/12 16:00:34 madden
157 * Add gap_decay_rate to BlastCutoffs, remove BlastCutoffs_simple, protection against overflow, removal of defunct _real variables (all from Mike Gertz)
158 *
159 * Revision 6.94 2003/11/30 03:36:38 camacho
160 * Fix compilation error
161 *
162 * Revision 6.93 2003/11/28 22:39:40 camacho
163 * +static keyword to BlastKarlinLtoH
164 *
165 * Revision 6.92 2003/11/28 15:16:38 camacho
166 * Combine newkar.c's contents with blastkar.c
167 *
168 * Revision 6.91 2003/11/26 19:08:10 madden
169 * simplified BlastKarlinLtoH and BlastKarlinLHtoK and provided better protection against overflow, new function NlmKarlinLambdaNR (all from Mike Gertz)
170 *
171 *
172 * Revision 6.90 2003/06/30 20:01:32 dondosha
173 * Correction in logic of finding matrices by BLASTMAT environment variable
174 *
175 * Revision 6.89 2003/05/30 17:25:36 coulouri
176 * add rcsid
177 *
178 * Revision 6.88 2003/05/06 18:54:02 dondosha
179 * Set all gap/residue scores in blastn matrix to INT4_MIN/2
180 *
181 * Revision 6.87 2003/03/07 22:33:25 bealer
182 * - Fix UMR when alphabet_type is not set.
183 *
184 * Revision 6.86 2003/02/27 19:07:56 madden
185 * Add functions PrintMatrixMessage and PrintAllowedValuesMessage
186 *
187 * Revision 6.85 2003/02/26 18:23:49 madden
188 * Add functions BlastKarlinkGapBlkFill and BlastKarlinReportAllowedValues, call from BlastKarlinBlkGappedCalcEx
189 *
190 * Revision 6.84 2002/10/24 22:52:14 dondosha
191 * When checking config file for matrices path, allow aa or nt subdirectories too
192 *
193 * Revision 6.83 2002/07/22 20:10:12 dondosha
194 * Correction: previous change did not work for proteins
195 *
196 * Revision 6.82 2002/07/19 18:34:58 dondosha
197 * Ignore bits higher than 4 when computing frequencies - needed for megablast
198 *
199 * Revision 6.81 2002/05/17 20:30:37 madden
200 * Add comments on adding new matrix values
201 *
202 * Revision 6.80 2002/04/09 18:44:19 madden
203 * Do not return if status of BlastScoreBlkMatCreate is non-zero
204 *
205 * Revision 6.79 2002/02/26 22:25:21 dondosha
206 * Return error as soon as it is found that matrix name is not supported
207 *
208 * Revision 6.78 2001/12/13 14:30:49 madden
209 * Add BLASTKAR_SMALL_FLOAT to prevent floating point exception for very small floats
210 *
211 * Revision 6.77 2001/09/05 20:32:21 dondosha
212 * Fixed uninitialized variable bug
213 *
214 * Revision 6.76 2001/08/23 21:19:05 dondosha
215 * Improvements for lambda and K computation when all scores are multiples of a common factor
216 *
217 * Revision 6.75 2001/02/20 18:31:28 egorov
218 * Added protection agains freeing zero pointer
219 *
220 * Revision 6.74 2001/01/29 16:11:34 madden
221 * Added BLOSUM80 values for 25,2
222 *
223 * Revision 6.73 2000/12/28 16:23:24 madden
224 * Function getAlphaBeta from AS
225 *
226 * Revision 6.72 2000/12/26 17:46:20 madden
227 * Add function BlastKarlinGetMatrixValuesEx2 to return alpha and beta
228 *
229 * Revision 6.71 2000/11/27 16:04:34 dondosha
230 * Check if original_matrix not NULL before destructing it
231 *
232 * Revision 6.70 2000/11/24 22:07:32 shavirin
233 * Added new function BlastResFreqFree().
234 *
235 * Revision 6.69 2000/11/24 21:44:21 shavirin
236 * Fixed memory leak in the function BLAST_MatrixDestruct() in case of
237 * PSI Blast.
238 *
239 * Revision 6.68 2000/11/22 15:32:39 madden
240 * Remove unneeded line
241 *
242 * Revision 6.67 2000/11/21 19:06:03 madden
243 * Set best value for blosum62_20
244 *
245 * Revision 6.66 2000/11/21 18:45:56 madden
246 * New parameter values from S. Altschul for matrices
247 *
248 * Revision 6.65 2000/11/03 17:15:13 madden
249 * Add 13,2 for blosum80
250 *
251 * Revision 6.64 2000/10/25 16:39:46 madden
252 * Add protection in BlastMatrixToTxMatrix for NULL matrix
253 *
254 * Revision 6.63 2000/10/23 15:52:18 dondosha
255 * Bug in previous revision: INT4_MIN is too small to be a matrix value
256 *
257 * Revision 6.62 2000/10/20 21:51:09 dondosha
258 * Set matrix[15][] values to INT4_MIN for blastn to prevent crossing boundary between strands
259 *
260 * Revision 6.61 2000/09/27 21:28:40 dondosha
261 * Copy square matrix in BLAST_MatrixFill to the original_matrix member of BLAST_Matrix
262 *
263 * Revision 6.60 2000/09/27 19:29:04 dondosha
264 * Check boolean parameter in BLAST_MatrixFill to use position based matrix
265 *
266 * Revision 6.59 2000/09/15 18:48:16 madden
267 * Added new blosum62_20 values
268 *
269 * Revision 6.58 2000/09/13 16:16:52 dondosha
270 * Added LIBCALL to TxMatrix functions declarations
271 *
272 * Revision 6.57 2000/09/13 15:50:28 dondosha
273 * Use SeqMapTable functions for conversion of search matrix to txalign-style matrix
274 *
275 * Revision 6.56 2000/09/12 16:03:50 dondosha
276 * Added functions to create and destroy matrix used in txalign
277 *
278 * Revision 6.55 2000/09/11 20:49:32 madden
279 * New values for BLOSUM62_20 from Stephen Altschul
280 *
281 * Revision 6.54 2000/08/31 15:45:07 dondosha
282 * Added function BlastUnevenGapSumE for sum evalue computation with different gap size on two sequences
283 *
284 * Revision 6.53 2000/08/28 13:38:31 shavirin
285 * Fixed bug in the function BLAST_MatrixFill() detected by Alejandro.
286 *
287 * Revision 6.52 2000/08/23 18:50:01 madden
288 * Changes to support decline-to-align checking
289 *
290 * Revision 6.51 2000/08/22 12:58:55 madden
291 * Add new BLOSUM62_20 values
292 *
293 * Revision 6.50 2000/08/04 15:49:25 sicotte
294 * added BlastScoreBlkMatCreateEx(reward,penalty) and BlastKarlinGetDefaultMatrixValues as external functions
295 *
296 * Revision 6.49 2000/08/04 15:13:29 dondosha
297 * Initialize posFreqs to NULL in BLAST_MatrixFill
298 *
299 * Revision 6.48 2000/08/04 14:43:58 shavirin
300 * Changed values for data corresponding to BLOSUM62_20A and
301 * BLOSUM62_20B matrixes.
302 *
303 * Revision 6.47 2000/08/03 22:23:13 shavirin
304 * Added initialization of the posFreqs array in the function BLAST_MatrixFill()
305 *
306 * Revision 6.46 2000/07/24 18:46:37 shavirin
307 * Added stat parameters for the matrixes BLOSUM62_20a and BLOSUM62_20b
308 *
309 * Revision 6.45 2000/07/24 17:32:42 shavirin
310 * Fixed values for size of matrix in the function BlastScoreBlkMaxScoreSet()
311 *
312 * Revision 6.44 2000/07/20 20:46:30 madden
313 * New values for blosum62_20
314 *
315 * Revision 6.43 2000/06/12 21:37:33 shavirin
316 * Adjusted blosum62_values{} and blosum80_values{}.
317 *
318 * Revision 6.42 2000/06/02 16:20:54 shavirin
319 * Fixed minor bug in function BLAST_MatrixFill
320 *
321 * Revision 6.41 2000/05/26 17:29:54 shavirin
322 * Added array of pos frequencies into BLAST_Matrix structure and it's
323 * handling.
324 *
325 * Revision 6.40 2000/04/17 20:41:37 madden
326 * Added BLAST_MatrixFetch
327 *
328 * Revision 6.39 2000/03/27 20:28:19 shavirin
329 * Added mulithred-safe protection to the function BlastScoreBlkMatRead()
330 *
331 * Revision 6.38 2000/02/24 16:39:03 shavirin
332 * Added check for existence of matrix file in BlastScoreBlkMatFill().
333 *
334 * Revision 6.37 2000/01/11 21:23:13 shavirin
335 * Increased size of index from Int2 to Int4 in BlastPSIMaxScoreGet() function
336 *
337 * Revision 6.36 1999/12/22 21:06:34 shavirin
338 * Added new function BlastPSIMaxScoreGet().
339 *
340 * Revision 6.35 1999/12/16 19:17:22 egorov
341 * Code cleanup
342 *
343 * Revision 6.34 1999/11/29 14:17:46 egorov
344 * Use LnFactorial instead of log(Factorial)
345 *
346 * Revision 6.33 1999/11/23 21:38:40 egorov
347 * Nlm_Factorial(num) causes overflow if num>170; return DBL_MAX if num>170.
348 * It does not influence program's logic because this large number of HSPs is bogus anyway.
349 *
350 * Revision 6.32 1999/08/20 14:42:17 madden
351 * Changed Robinson frequencies per Stephen A. request
352 *
353 * Revision 6.31 1999/08/05 13:13:34 madden
354 * Improved help messages for permitted matrices and gap values
355 *
356 * Revision 6.30 1999/07/30 13:25:51 shavirin
357 * Fixed bug in the function BLAST_MatrixFill which created wrong matrix size.
358 *
359 * Revision 6.29 1998/12/31 18:17:04 madden
360 * Added strand option
361 *
362 * Revision 6.28 1998/09/28 12:28:50 madden
363 * Protection for a DEC Alpha problem with HUGE_VAL
364 *
365 * Revision 6.27 1998/09/24 15:26:36 egorov
366 * Fix lint complaints
367 *
368 * Revision 6.26 1998/08/11 12:57:23 madden
369 * Changed importance of BlastKarlinBlkGappedCalc error message
370 *
371 * Revision 6.25 1998/07/17 15:39:57 madden
372 * Changes for Effective search space.
373 *
374 * Revision 6.24 1998/06/12 15:52:50 madden
375 * Fixed warnings
376 *
377 * Revision 6.23 1998/06/02 21:21:20 madden
378 * Changes for DNA matrices
379 *
380 * Revision 6.22 1998/05/03 17:56:45 madden
381 * Fixed memory leaks
382 *
383 * Revision 6.21 1998/04/24 21:51:40 madden
384 * Check return value on BlastKarlinBlkCalc
385 *
386 * Revision 6.20 1998/04/24 19:27:54 madden
387 * Added BlastKarlinBlkStandardCalcEx for ideal KarlinBlk
388 *
389 * Revision 6.19 1998/04/10 20:57:55 madden
390 * Added data for BLOSUM80 and BLOSUM45
391 *
392 * Revision 6.18 1998/04/10 15:05:48 madden
393 * Added pref_flags return by value to BlastKarlinGetMatrixValues
394 *
395 * Revision 6.17 1998/04/02 21:12:29 madden
396 * Added infinite values to the arrays returned by BlastKarlinGetMatrixValues
397 *
398 * Revision 6.16 1998/03/16 17:41:56 madden
399 * Fixed leaks
400 *
401 * Revision 6.15 1998/03/09 17:15:01 madden
402 * Added BlastKarlinGetMatrixValues
403 *
404 * Revision 6.14 1998/02/26 22:34:32 madden
405 * Changes for 16 bit windows
406 *
407 * Revision 6.13 1998/02/06 18:28:05 madden
408 * Added functions to produce pseudo-scores from p and e values
409 *
410 * Revision 6.12 1997/12/15 21:55:44 madden
411 * Added new parameters for BLOSUM62_20
412 *
413 * Revision 6.11 1997/12/11 22:21:02 madden
414 * Removed unused variables
415 *
416 * Revision 6.10 1997/12/10 22:41:58 madden
417 * proper casting done
418 *
419 * Revision 6.9 1997/12/01 22:07:55 madden
420 * Added missing values, fixed UMR
421 *
422 * Revision 6.8 1997/11/14 21:29:58 madden
423 * Stephens new parameters
424 *
425 * Revision 6.7 1997/11/14 17:14:52 madden
426 * Added Karlin parameter to matrix structure
427 *
428 * Revision 6.6 1997/11/07 22:27:29 madden
429 * Fix in BLAST_MatrixFill
430 *
431 * Revision 6.5 1997/11/07 00:48:07 madden
432 * Added defintitions and functions for BLAST_Matrix
433 *
434 * Revision 6.4 1997/10/30 15:41:01 madden
435 * Casts and fixes for DEC alpha
436 *
437 * Revision 6.3 1997/10/22 21:46:59 madden
438 * Added BLOSUM90, changed to better BLOSUM62 values
439 *
440 * Revision 6.2 1997/10/09 19:47:35 madden
441 * changes for 1/20th bit BLOSUM62 matrices
442 *
443 * Revision 6.1 1997/10/06 17:59:17 madden
444 * Added checks to BLAST_ScoreBlkDestruct
445 *
446 * Revision 6.0 1997/08/25 18:52:37 madden
447 * Revision changed to 6.0
448 *
449 * Revision 1.39 1997/07/18 20:56:13 madden
450 * Added more BLOSUM50 values
451 *
452 * Revision 1.38 1997/07/14 20:11:17 madden
453 * Removed unused variables
454 *
455 * Revision 1.37 1997/07/14 15:30:59 madden
456 * Changed call to BlastKarlinBlkGappedCalc
457 *
458 * Revision 1.36 1997/06/23 20:48:37 madden
459 * Added support for BLOSUM50 and PAM250 matrices
460 *
461 * Revision 1.35 1997/06/23 17:53:07 madden
462 * BlastKarlinBlkGappedCalc checks for valid values if KarlinBlkPtr NULL
463 *
464 * Revision 1.34 1997/05/01 15:53:13 madden
465 * Addition of extra KarlinBlk's for psi-blast
466 *
467 * Revision 1.33 1997/04/22 16:36:49 madden
468 * Added Robinson amino acid frequencies.
469 *
470 * Revision 1.32 1997/04/03 19:48:13 madden
471 * Changes to use effective database length instead of the length of each
472 * sequence in statistical calculations.
473 *
474 * Revision 1.31 1997/03/07 22:24:43 madden
475 * Closed File* for matrix after use.
476 *
477 * Revision 1.30 1997/02/20 21:33:51 madden
478 * Added FindPath call to blastkar.c to look for matrix.
479 *
480 * Revision 1.29 1997/02/06 23:15:43 madden
481 * Added additional gap extension and opening penalties.
482 *
483 * Revision 1.28 1997/02/05 19:54:59 madden
484 * *** empty log message ***
485 *
486 * Revision 1.27 1997/01/16 22:58:08 madden
487 * Secured BlastScoreFreqDestruct against NULL pointers.
488 *
489 * Revision 1.26 1997/01/13 17:15:29 madden
490 * Save matrix name for blastn type matrices.
491 *
492 * Revision 1.25 1996/12/11 17:59:42 madden
493 * Fixed purify leaks.
494 *
495 * Revision 1.24 1996/12/10 17:30:59 madden
496 * Changed statistics for gapped blastp
497 *
498 * Revision 1.23 1996/12/03 19:13:47 madden
499 * Made BlastRes functions non-static.
500 *
501 * Revision 1.22 1996/11/27 16:39:11 madden
502 * Added NULLB to end of array, purify nit.
503 *
504 * Revision 1.21 1996/11/26 20:38:01 madden
505 * Removed unused variable.
506 *
507 * Revision 1.20 1996/11/26 19:55:25 madden
508 * Added check for matrices in standard places.
509 *
510 * Revision 1.19 1996/11/18 19:32:09 madden
511 * Fixed uninitialized variable found by CodeWarrior.
512 *
513 * Revision 1.18 1996/11/18 14:49:26 madden
514 * Rewrote BlastScoreSetAmbigRes to take multiple ambig. chars.
515 *
516 * Revision 1.17 1996/11/08 21:45:03 madden
517 * Fix for blastn matrices.
518 *
519 * Revision 1.16 1996/11/07 22:31:15 madden
520 * Corrected Nucl. matrix for ambiguity characters.
521 *
522 * Revision 1.15 1996/10/04 20:12:26 madden
523 * Fixed memory leaks found by purify.
524 *
525 * Revision 1.14 1996/10/03 20:49:29 madden
526 * Added function BlastKarlinBlkStandardCalc to calculate standard
527 * Karlin parameters for blastx and tblastx.
528 *
529 * Revision 1.13 1996/10/03 18:04:48 madden
530 * Each context (or frame) now uses "proper" Karlin parameters.
531 *
532 * Revision 1.12 1996/10/03 13:07:55 madden
533 * Added return statement.
534 *
535 * Revision 1.11 1996/10/02 20:00:38 madden
536 * Calc. different Karlin parameters for each frame.
537 *
538 * Revision 1.10 1996/10/01 21:24:02 madden
539 * Corrected statistics for partial blastn matching.
540 *
541 * Revision 1.9 1996/09/30 21:56:12 madden
542 * Replaced query alphabet of ncbi2na with blastna alphabet.
543 *
544 * Revision 1.8 1996/09/11 20:08:55 madden
545 * *** empty log message ***
546 *
547 * Revision 1.6 1996/09/11 19:14:09 madden
548 * Changes to blastn matrix.
549 *
550 * Revision 1.5 1996/09/10 19:40:35 madden
551 * Added function BlastScoreBlkMatCreate for blastn matrices.
552 *
553 * Revision 1.4 1996/08/23 19:07:01 madden
554 * Adjust changes for NT warning to give correct results.
555 *
556 * Revision 1.3 1996/08/23 15:32:30 shavirin
557 * Fixed a lot of NT compiler warnings about type mismatch
558 *
559 * Revision 1.2 1996/08/22 20:38:11 madden
560 * Changed all doubles and floats to Nlm_FloatHi.
561 *
562 * Revision 1.1 1996/08/05 19:47:42 madden
563 * Initial revision
564 *
565 * Revision 1.26 1996/06/21 15:23:54 madden
566 * Corelibed BlastSumP.
567 *
568 * Revision 1.25 1996/06/21 15:14:55 madden
569 * Made some functions static, added LIBCALL to others.
570 *
571 * Revision 1.24 1996/06/20 16:52:23 madden
572 * Changed "pow" to "Nlm_Powi".
573 *
574 * Revision 1.23 1996/06/11 15:42:46 madden
575 * Replaced strncasempc with toolbox function.
576 *
577 * Revision 1.22 1996/05/29 12:44:04 madden
578 * Added function BlastRepresentativeResidues.
579 *
580 * Revision 1.21 1996/05/22 20:38:05 madden
581 * *** empty log message ***
582 *
583 * Revision 1.20 1996/05/22 20:21:23 madden
584 * removed unused variables.
585 *
586 * Revision 1.19 1996/05/20 21:18:51 madden
587 * Added BLASTMatrixStructure.
588 *
589 * Revision 1.18 1996/05/16 19:50:15 madden
590 * Added documentation block.
591 *
592 * Revision 1.17 1996/05/14 21:30:37 madden
593 * *** empty log message ***
594 *
595 * Revision 1.16 1996/04/16 15:33:41 madden
596 * Changes for backward-compatability with old blast.
597 *
598 * Revision 1.15 1996/03/25 16:34:19 madden
599 * Changes to mimic old statistics.
600 *
601 * Revision 1.14 1996/03/01 19:40:37 madden
602 * Added factorial correction to SmallGapSum.
603 *
604 * Revision 1.13 1996/02/28 21:38:08 madden
605 * *** empty log message ***
606 *
607 * Revision 1.12 1996/02/15 23:20:01 madden
608 * Added query length to a number of calls.
609 *
610 * Revision 1.12 1996/02/15 23:20:01 madden
611 * Added query length to a number of calls.
612 *
613 * Revision 1.11 1996/02/09 13:51:08 madden
614 * Change to prevent log sign error.
615 *
616 * Revision 1.10 1996/01/22 22:33:06 madden
617 * Changed effective length, fixed (improper) calculation of sum_e.
618 *
619 * Revision 1.9 1996/01/17 23:18:41 madden
620 * Added BlastScoreSetAmbigRes.
621 * ci -l blastkar.h
622 *
623 * Revision 1.8 1996/01/17 17:01:19 madden
624 * Fixed BlastSmallGapSumE and BlastLargeGapSumE.
625 *
626 * Revision 1.7 1996/01/17 13:46:36 madden
627 * Added function BlastKarlinPtoE.
628 *
629 * Revision 1.6 1996/01/06 18:58:07 madden
630 * Added BlastSumP.
631 *
632 * Revision 1.5 1995/12/28 21:26:16 madden
633 * *** empty log message ***
634 *
635 * Revision 1.4 1995/12/26 23:04:46 madden
636 * removed GetBLAST0Matrix, Added "cutoff" functions.
637 *
638 * Revision 1.3 1995/12/26 20:27:47 madden
639 * Replaced BLAST_ScoreMat wiht BLAST_ScorePtr PNTR.
640 *
641 * Revision 1.2 1995/12/26 14:25:56 madden
642 * *** empty log message ***
643 *
644 * Revision 1.1 1995/12/21 23:07:35 madden
645 * Initial revision
646 *
647 * */
648 #include <ncbi.h>
649 #include <assert.h>
650 #include <ncbimath.h>
651 #include <objcode.h>
652 #include <objseq.h>
653 #include <sequtil.h>
654 #include <blastkar.h>
655 #include <blastpri.h>
656
657 static Int2 BlastScoreFreqCalc PROTO((BLAST_ScoreBlkPtr sbp, BLAST_ScoreFreqPtr sfp, BLAST_ResFreqPtr rfp1, BLAST_ResFreqPtr rfp2));
658
659 /* OSF1 apparently doesn't like this. */
660 #if defined(HUGE_VAL) && !defined(OS_UNIX_OSF1)
661 #define BLASTKAR_HUGE_VAL HUGE_VAL
662 #else
663 #define BLASTKAR_HUGE_VAL 1.e30
664 #endif
665
666
667 /* Allocates and Deallocates the two-dimensional matrix. */
668 static BLASTMatrixStructurePtr BlastMatrixAllocate PROTO((Int2 alphabet_size));
669 static BLASTMatrixStructurePtr BlastMatrixDestruct PROTO((BLASTMatrixStructurePtr matrix_struct));
670
671 /* performs sump calculation, used by BlastSumPStd */
672 static Nlm_FloatHi BlastSumPCalc PROTO((int r, Nlm_FloatHi s));
673
674 #define COMMENT_CHR '#'
675 #define TOKSTR " \t\n\r"
676 #define BLAST_MAX_ALPHABET 40 /* ncbistdaa is only 26, this should be enough */
677
678 /*
679 How many of the first bases are not ambiguous
680 (it's four, of course).
681 */
682 #define NUMBER_NON_AMBIG_BP 4
683 /*
684 translates between ncbi4na and blastna. the first four elements
685 of this array match ncbi2na.
686 */
687
688 Uint1 ncbi4na_to_blastna[] = {
689 15,/* Gap, 0 */
690 0, /* A, 1 */
691 1, /* C, 2 */
692 6, /* M, 3 */
693 2, /* G, 4 */
694 4, /* R, 5 */
695 9, /* S, 6 */
696 13, /* V, 7 */
697 3, /* T, 8 */
698 8, /* W, 9 */
699 5, /* Y, 10 */
700 12, /* H, 11 */
701 7, /* K, 12 */
702 11, /* D, 13 */
703 10, /* B, 14 */
704 14 /* N, 15 */
705 };
706
707 Uint1 blastna_to_ncbi4na[] = { 1, /* A, 0 */
708 2, /* C, 1 */
709 4, /* G, 2 */
710 8, /* T, 3 */
711 5, /* R, 4 */
712 10, /* Y, 5 */
713 3, /* M, 6 */
714 12, /* K, 7 */
715 9, /* W, 8 */
716 6, /* S, 9 */
717 14, /* B, 10 */
718 13, /* D, 11 */
719 11, /* H, 12 */
720 7, /* V, 13 */
721 15, /* N, 14 */
722 0, /* Gap, 15 */
723 };
724
725 /* Used in BlastKarlinBlkGappedCalc */
726 typedef FloatHi array_of_8[8];
727
728 /* Used to temporarily store matrix values for retrieval. */
729
730 typedef struct _matrix_info {
731 CharPtr name; /* name of matrix (e.g., BLOSUM90). */
732 array_of_8 *values; /* The values (below). */
733 Int4 *prefs; /* Preferences for display. */
734 Int4 max_number_values; /* number of values (e.g., BLOSUM90_VALUES_MAX). */
735 } MatrixInfo, PNTR MatrixInfoPtr;
736
737
738 /**************************************************************************************
739
740 How the statistical parameters for the matrices are stored:
741 -----------------------------------------------------------
742 They parameters are stored in a two-dimensional array FloatHi (i.e.,
743 doubles, which has as it's first dimensions the number of different
744 gap existence and extension combinations and as it's second dimension 8.
745 The eight different columns specify:
746
747 1.) gap existence penalty (INT2_MAX denotes infinite).
748 2.) gap extension penalty (INT2_MAX denotes infinite).
749 3.) decline to align penalty (INT2_MAX denotes infinite).
750 4.) Lambda
751 5.) K
752 6.) H
753 7.) alpha
754 8.) beta
755
756 (Items 4-8 are explained in:
757 Altschul SF, Bundschuh R, Olsen R, Hwa T.
758 The estimation of statistical parameters for local alignment score distributions.
759 Nucleic Acids Res. 2001 Jan 15;29(2):351-61.).
760
761 Take BLOSUM45 (below) as an example. Currently (5/17/02) there are
762 14 different allowed combinations (specified by "#define BLOSUM45_VALUES_MAX 14").
763 The first row in the array "blosum45_values" has INT2_MAX (i.e., 32767) for gap
764 existence, extension, and decline-to-align penalties. For all practical purposes
765 this value is large enough to be infinite, so the alignments will be ungapped.
766 BLAST may also use this value (INT2_MAX) as a signal to skip gapping, so a
767 different value should not be used if the intent is to have gapless extensions.
768 The next row is for the gap existence penalty 13 and the extension penalty 3.
769 The decline-to-align penalty is only supported in a few cases, so it is normally
770 set to INT2_MAX.
771
772
773 How to add a new matrix to blastkar.c:
774 --------------------------------------
775 To add a new matrix to blastkar.c it is necessary to complete
776 four steps. As an example consider adding the matrix
777 called TESTMATRIX
778
779 1.) add a define specifying how many different existence and extensions
780 penalties are allowed, so it would be necessary to add the line:
781
782 #define TESTMATRIX_VALUES_MAX 14
783
784 if 14 values were to be allowed.
785
786 2.) add a two-dimensional array to contain the statistical parameters:
787
788 static Nlm_FloatHi testmatrix_values[TESTMATRIX_VALUES_MAX][8] ={ ...
789
790 3.) add a "prefs" array that should hint about the "optimal"
791 gap existence and extension penalties:
792
793 static Int4 testmatrix_prefs[TESTMATRIX_VALUES_MAX] = {
794 BLAST_MATRIX_NOMINAL,
795 ...
796 };
797
798 4.) Go to the function BlastLoadMatrixValues (in this file) and
799 add two lines before the return at the end of the function:
800
801 matrix_info = MatrixInfoNew("TESTMATRIX", testmatrix_values, testmatrix_prefs, TESTMATRIX_VALUES_MAX);
802 ValNodeAddPointer(&retval, 0, matrix_info);
803
804
805
806 ***************************************************************************************/
807
808
809
810
811
812 #define BLOSUM45_VALUES_MAX 14
813 static Nlm_FloatHi blosum45_values[BLOSUM45_VALUES_MAX][8] = {
814 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.2291, 0.0924, 0.2514, 0.9113, -5.7},
815 {13, 3, (Nlm_FloatHi) INT2_MAX, 0.207, 0.049, 0.14, 1.5, -22},
816 {12, 3, (Nlm_FloatHi) INT2_MAX, 0.199, 0.039, 0.11, 1.8, -34},
817 {11, 3, (Nlm_FloatHi) INT2_MAX, 0.190, 0.031, 0.095, 2.0, -38},
818 {10, 3, (Nlm_FloatHi) INT2_MAX, 0.179, 0.023, 0.075, 2.4, -51},
819 {16, 2, (Nlm_FloatHi) INT2_MAX, 0.210, 0.051, 0.14, 1.5, -24},
820 {15, 2, (Nlm_FloatHi) INT2_MAX, 0.203, 0.041, 0.12, 1.7, -31},
821 {14, 2, (Nlm_FloatHi) INT2_MAX, 0.195, 0.032, 0.10, 1.9, -36},
822 {13, 2, (Nlm_FloatHi) INT2_MAX, 0.185, 0.024, 0.084, 2.2, -45},
823 {12, 2, (Nlm_FloatHi) INT2_MAX, 0.171, 0.016, 0.061, 2.8, -65},
824 {19, 1, (Nlm_FloatHi) INT2_MAX, 0.205, 0.040, 0.11, 1.9, -43},
825 {18, 1, (Nlm_FloatHi) INT2_MAX, 0.198, 0.032, 0.10, 2.0, -43},
826 {17, 1, (Nlm_FloatHi) INT2_MAX, 0.189, 0.024, 0.079, 2.4, -57},
827 {16, 1, (Nlm_FloatHi) INT2_MAX, 0.176, 0.016, 0.063, 2.8, -67},
828 };
829
830 static Int4 blosum45_prefs[BLOSUM45_VALUES_MAX] = {
831 BLAST_MATRIX_NOMINAL,
832 BLAST_MATRIX_NOMINAL,
833 BLAST_MATRIX_NOMINAL,
834 BLAST_MATRIX_NOMINAL,
835 BLAST_MATRIX_NOMINAL,
836 BLAST_MATRIX_NOMINAL,
837 BLAST_MATRIX_NOMINAL,
838 BLAST_MATRIX_BEST,
839 BLAST_MATRIX_NOMINAL,
840 BLAST_MATRIX_NOMINAL,
841 BLAST_MATRIX_NOMINAL,
842 BLAST_MATRIX_NOMINAL,
843 BLAST_MATRIX_NOMINAL,
844 BLAST_MATRIX_NOMINAL
845 };
846
847
848 #define BLOSUM50_VALUES_MAX 16
849 static Nlm_FloatHi blosum50_values[BLOSUM50_VALUES_MAX][8] = {
850 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.2318, 0.112, 0.3362, 0.6895, -4.0},
851 {13, 3, (Nlm_FloatHi) INT2_MAX, 0.212, 0.063, 0.19, 1.1, -16},
852 {12, 3, (Nlm_FloatHi) INT2_MAX, 0.206, 0.055, 0.17, 1.2, -18},
853 {11, 3, (Nlm_FloatHi) INT2_MAX, 0.197, 0.042, 0.14, 1.4, -25},
854 {10, 3, (Nlm_FloatHi) INT2_MAX, 0.186, 0.031, 0.11, 1.7, -34},
855 {9, 3, (Nlm_FloatHi) INT2_MAX, 0.172, 0.022, 0.082, 2.1, -48},
856 {16, 2, (Nlm_FloatHi) INT2_MAX, 0.215, 0.066, 0.20, 1.05, -15},
857 {15, 2, (Nlm_FloatHi) INT2_MAX, 0.210, 0.058, 0.17, 1.2, -20},
858 {14, 2, (Nlm_FloatHi) INT2_MAX, 0.202, 0.045, 0.14, 1.4, -27},
859 {13, 2, (Nlm_FloatHi) INT2_MAX, 0.193, 0.035, 0.12, 1.6, -32},
860 {12, 2, (Nlm_FloatHi) INT2_MAX, 0.181, 0.025, 0.095, 1.9, -41},
861 {19, 1, (Nlm_FloatHi) INT2_MAX, 0.212, 0.057, 0.18, 1.2, -21},
862 {18, 1, (Nlm_FloatHi) INT2_MAX, 0.207, 0.050, 0.15, 1.4, -28},
863 {17, 1, (Nlm_FloatHi) INT2_MAX, 0.198, 0.037, 0.12, 1.6, -33},
864 {16, 1, (Nlm_FloatHi) INT2_MAX, 0.186, 0.025, 0.10, 1.9, -42},
865 {15, 1, (Nlm_FloatHi) INT2_MAX, 0.171, 0.015, 0.063, 2.7, -76},
866 };
867
868 static Int4 blosum50_prefs[BLOSUM50_VALUES_MAX] = {
869 BLAST_MATRIX_NOMINAL,
870 BLAST_MATRIX_NOMINAL,
871 BLAST_MATRIX_NOMINAL,
872 BLAST_MATRIX_NOMINAL,
873 BLAST_MATRIX_NOMINAL,
874 BLAST_MATRIX_NOMINAL,
875 BLAST_MATRIX_NOMINAL,
876 BLAST_MATRIX_NOMINAL,
877 BLAST_MATRIX_NOMINAL,
878 BLAST_MATRIX_BEST,
879 BLAST_MATRIX_NOMINAL,
880 BLAST_MATRIX_NOMINAL,
881 BLAST_MATRIX_NOMINAL,
882 BLAST_MATRIX_NOMINAL,
883 BLAST_MATRIX_NOMINAL,
884 BLAST_MATRIX_NOMINAL
885 };
886
887 #define BLOSUM62_VALUES_MAX 12
888 static Nlm_FloatHi blosum62_values[BLOSUM62_VALUES_MAX][8] = {
889 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3176, 0.134, 0.4012, 0.7916, -3.2},
890 {11, 2, (Nlm_FloatHi) INT2_MAX, 0.297, 0.082, 0.27, 1.1, -10},
891 {10, 2, (Nlm_FloatHi) INT2_MAX, 0.291, 0.075, 0.23, 1.3, -15},
892 {9, 2, (Nlm_FloatHi) INT2_MAX, 0.279, 0.058, 0.19, 1.5, -19},
893 {8, 2, (Nlm_FloatHi) INT2_MAX, 0.264, 0.045, 0.15, 1.8, -26},
894 {7, 2, (Nlm_FloatHi) INT2_MAX, 0.239, 0.027, 0.10, 2.5, -46},
895 {6, 2, (Nlm_FloatHi) INT2_MAX, 0.201, 0.012, 0.061, 3.3, -58},
896 {13, 1, (Nlm_FloatHi) INT2_MAX, 0.292, 0.071, 0.23, 1.2, -11},
897 {12, 1, (Nlm_FloatHi) INT2_MAX, 0.283, 0.059, 0.19, 1.5, -19},
898 {11, 1, (Nlm_FloatHi) INT2_MAX, 0.267, 0.041, 0.14, 1.9, -30},
899 {10, 1, (Nlm_FloatHi) INT2_MAX, 0.243, 0.024, 0.10, 2.5, -44},
900 {9, 1, (Nlm_FloatHi) INT2_MAX, 0.206, 0.010, 0.052, 4.0, -87},
901 };
902
903 static Int4 blosum62_prefs[BLOSUM62_VALUES_MAX] = {
904 BLAST_MATRIX_NOMINAL,
905 BLAST_MATRIX_NOMINAL,
906 BLAST_MATRIX_NOMINAL,
907 BLAST_MATRIX_NOMINAL,
908 BLAST_MATRIX_NOMINAL,
909 BLAST_MATRIX_NOMINAL,
910 BLAST_MATRIX_NOMINAL,
911 BLAST_MATRIX_NOMINAL,
912 BLAST_MATRIX_NOMINAL,
913 BLAST_MATRIX_BEST,
914 BLAST_MATRIX_NOMINAL,
915 BLAST_MATRIX_NOMINAL,
916 };
917
918
919 #define BLOSUM80_VALUES_MAX 10
920 static Nlm_FloatHi blosum80_values[BLOSUM80_VALUES_MAX][8] = {
921 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3430, 0.177, 0.6568, 0.5222, -1.6},
922 {25, 2, (Nlm_FloatHi) INT2_MAX, 0.342, 0.17, 0.66, 0.52, -1.6},
923 {13, 2, (Nlm_FloatHi) INT2_MAX, 0.336, 0.15, 0.57, 0.59, -3},
924 {9, 2, (Nlm_FloatHi) INT2_MAX, 0.319, 0.11, 0.42, 0.76, -6},
925 {8, 2, (Nlm_FloatHi) INT2_MAX, 0.308, 0.090, 0.35, 0.89, -9},
926 {7, 2, (Nlm_FloatHi) INT2_MAX, 0.293, 0.070, 0.27, 1.1, -14},
927 {6, 2, (Nlm_FloatHi) INT2_MAX, 0.268, 0.045, 0.19, 1.4, -19},
928 {11, 1, (Nlm_FloatHi) INT2_MAX, 0.314, 0.095, 0.35, 0.90, -9},
929 {10, 1, (Nlm_FloatHi) INT2_MAX, 0.299, 0.071, 0.27, 1.1, -14},
930 {9, 1, (Nlm_FloatHi) INT2_MAX, 0.279, 0.048, 0.20, 1.4, -19},
931 };
932
933 static Int4 blosum80_prefs[BLOSUM80_VALUES_MAX] = {
934 BLAST_MATRIX_NOMINAL,
935 BLAST_MATRIX_NOMINAL,
936 BLAST_MATRIX_NOMINAL,
937 BLAST_MATRIX_NOMINAL,
938 BLAST_MATRIX_NOMINAL,
939 BLAST_MATRIX_NOMINAL,
940 BLAST_MATRIX_NOMINAL,
941 BLAST_MATRIX_BEST,
942 BLAST_MATRIX_NOMINAL
943 };
944
945 #define BLOSUM90_VALUES_MAX 8
946 static Nlm_FloatHi blosum90_values[BLOSUM90_VALUES_MAX][8] = {
947 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3346, 0.190, 0.7547, 0.4434, -1.4},
948 {9, 2, (Nlm_FloatHi) INT2_MAX, 0.310, 0.12, 0.46, 0.67, -6},
949 {8, 2, (Nlm_FloatHi) INT2_MAX, 0.300, 0.099, 0.39, 0.76, -7},
950 {7, 2, (Nlm_FloatHi) INT2_MAX, 0.283, 0.072, 0.30, 0.93, -11},
951 {6, 2, (Nlm_FloatHi) INT2_MAX, 0.259, 0.048, 0.22, 1.2, -16},
952 {11, 1, (Nlm_FloatHi) INT2_MAX, 0.302, 0.093, 0.39, 0.78, -8},
953 {10, 1, (Nlm_FloatHi) INT2_MAX, 0.290, 0.075, 0.28, 1.04, -15},
954 {9, 1, (Nlm_FloatHi) INT2_MAX, 0.265, 0.044, 0.20, 1.3, -19},
955 };
956
957 static Int4 blosum90_prefs[BLOSUM90_VALUES_MAX] = {
958 BLAST_MATRIX_NOMINAL,
959 BLAST_MATRIX_NOMINAL,
960 BLAST_MATRIX_NOMINAL,
961 BLAST_MATRIX_NOMINAL,
962 BLAST_MATRIX_NOMINAL,
963 BLAST_MATRIX_NOMINAL,
964 BLAST_MATRIX_BEST,
965 BLAST_MATRIX_NOMINAL
966 };
967
968 #define PAM250_VALUES_MAX 16
969 static Nlm_FloatHi pam250_values[PAM250_VALUES_MAX][8] = {
970 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.2252, 0.0868, 0.2223, 0.98, -5.0},
971 {15, 3, (Nlm_FloatHi) INT2_MAX, 0.205, 0.049, 0.13, 1.6, -23},
972 {14, 3, (Nlm_FloatHi) INT2_MAX, 0.200, 0.043, 0.12, 1.7, -26},
973 {13, 3, (Nlm_FloatHi) INT2_MAX, 0.194, 0.036, 0.10, 1.9, -31},
974 {12, 3, (Nlm_FloatHi) INT2_MAX, 0.186, 0.029, 0.085, 2.2, -41},
975 {11, 3, (Nlm_FloatHi) INT2_MAX, 0.174, 0.020, 0.070, 2.5, -48},
976 {17, 2, (Nlm_FloatHi) INT2_MAX, 0.204, 0.047, 0.12, 1.7, -28},
977 {16, 2, (Nlm_FloatHi) INT2_MAX, 0.198, 0.038, 0.11, 1.8, -29},
978 {15, 2, (Nlm_FloatHi) INT2_MAX, 0.191, 0.031, 0.087, 2.2, -44},
979 {14, 2, (Nlm_FloatHi) INT2_MAX, 0.182, 0.024, 0.073, 2.5, -53},
980 {13, 2, (Nlm_FloatHi) INT2_MAX, 0.171, 0.017, 0.059, 2.9, -64},
981 {21, 1, (Nlm_FloatHi) INT2_MAX, 0.205, 0.045, 0.11, 1.8, -34},
982 {20, 1, (Nlm_FloatHi) INT2_MAX, 0.199, 0.037, 0.10, 1.9, -35},
983 {19, 1, (Nlm_FloatHi) INT2_MAX, 0.192, 0.029, 0.083, 2.3, -52},
984 {18, 1, (Nlm_FloatHi) INT2_MAX, 0.183, 0.021, 0.070, 2.6, -60},
985 {17, 1, (Nlm_FloatHi) INT2_MAX, 0.171, 0.014, 0.052, 3.3, -86},
986 };
987
988 static Int4 pam250_prefs[PAM250_VALUES_MAX] = {
989 BLAST_MATRIX_NOMINAL,
990 BLAST_MATRIX_NOMINAL,
991 BLAST_MATRIX_NOMINAL,
992 BLAST_MATRIX_NOMINAL,
993 BLAST_MATRIX_NOMINAL,
994 BLAST_MATRIX_NOMINAL,
995 BLAST_MATRIX_NOMINAL,
996 BLAST_MATRIX_NOMINAL,
997 BLAST_MATRIX_BEST,
998 BLAST_MATRIX_NOMINAL,
999 BLAST_MATRIX_NOMINAL,
1000 BLAST_MATRIX_NOMINAL,
1001 BLAST_MATRIX_NOMINAL,
1002 BLAST_MATRIX_NOMINAL,
1003 BLAST_MATRIX_NOMINAL,
1004 BLAST_MATRIX_NOMINAL
1005 };
1006
1007 #define PAM30_VALUES_MAX 7
1008 static Nlm_FloatHi pam30_values[PAM30_VALUES_MAX][8] = {
1009 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3400, 0.283, 1.754, 0.1938, -0.3},
1010 {7, 2, (Nlm_FloatHi) INT2_MAX, 0.305, 0.15, 0.87, 0.35, -3},
1011 {6, 2, (Nlm_FloatHi) INT2_MAX, 0.287, 0.11, 0.68, 0.42, -4},
1012 {5, 2, (Nlm_FloatHi) INT2_MAX, 0.264, 0.079, 0.45, 0.59, -7},
1013 {10, 1, (Nlm_FloatHi) INT2_MAX, 0.309, 0.15, 0.88, 0.35, -3},
1014 {9, 1, (Nlm_FloatHi) INT2_MAX, 0.294, 0.11, 0.61, 0.48, -6},
1015 {8, 1, (Nlm_FloatHi) INT2_MAX, 0.270, 0.072, 0.40, 0.68, -10},
1016 };
1017
1018 static Int4 pam30_prefs[PAM30_VALUES_MAX] = {
1019 BLAST_MATRIX_NOMINAL,
1020 BLAST_MATRIX_NOMINAL,
1021 BLAST_MATRIX_NOMINAL,
1022 BLAST_MATRIX_NOMINAL,
1023 BLAST_MATRIX_NOMINAL,
1024 BLAST_MATRIX_BEST,
1025 BLAST_MATRIX_NOMINAL,
1026 };
1027
1028
1029 #define PAM70_VALUES_MAX 7
1030 static Nlm_FloatHi pam70_values[PAM70_VALUES_MAX][8] = {
1031 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.3345, 0.229, 1.029, 0.3250, -0.7},
1032 {8, 2, (Nlm_FloatHi) INT2_MAX, 0.301, 0.12, 0.54, 0.56, -5},
1033 {7, 2, (Nlm_FloatHi) INT2_MAX, 0.286, 0.093, 0.43, 0.67, -7},
1034 {6, 2, (Nlm_FloatHi) INT2_MAX, 0.264, 0.064, 0.29, 0.90, -12},
1035 {11, 1, (Nlm_FloatHi) INT2_MAX, 0.305, 0.12, 0.52, 0.59, -6},
1036 {10, 1, (Nlm_FloatHi) INT2_MAX, 0.291, 0.091, 0.41, 0.71, -9},
1037 {9, 1, (Nlm_FloatHi) INT2_MAX, 0.270, 0.060, 0.28, 0.97, -14},
1038 };
1039
1040 static Int4 pam70_prefs[PAM70_VALUES_MAX] = {
1041 BLAST_MATRIX_NOMINAL,
1042 BLAST_MATRIX_NOMINAL,
1043 BLAST_MATRIX_NOMINAL,
1044 BLAST_MATRIX_NOMINAL,
1045 BLAST_MATRIX_NOMINAL,
1046 BLAST_MATRIX_BEST,
1047 BLAST_MATRIX_NOMINAL
1048 };
1049
1050
1051
1052 #define BLOSUM62_20_VALUES_MAX 65
1053 static Nlm_FloatHi blosum62_20_values[BLOSUM62_20_VALUES_MAX][8] = {
1054 {(Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, (Nlm_FloatHi) INT2_MAX, 0.03391, 0.125, 0.4544, 0.07462, -3.2},
1055 {100, 12, (Nlm_FloatHi) INT2_MAX, 0.0300, 0.056, 0.21, 0.14, -15},
1056 {95, 12, (Nlm_FloatHi) INT2_MAX, 0.0291, 0.047, 0.18, 0.16, -20},
1057 {90, 12, (Nlm_FloatHi) INT2_MAX, 0.0280, 0.038, 0.15, 0.19, -28},
1058 {85, 12, (Nlm_FloatHi) INT2_MAX, 0.0267, 0.030, 0.13, 0.21, -31},
1059 {80, 12, (Nlm_FloatHi) INT2_MAX, 0.0250, 0.021, 0.10, 0.25, -39},
1060 {105, 11, (Nlm_FloatHi) INT2_MAX, 0.0301, 0.056, 0.22, 0.14, -16},
1061 {100, 11, (Nlm_FloatHi) INT2_MAX, 0.0294, 0.049, 0.20, 0.15, -17},
1062 {95, 11, (Nlm_FloatHi) INT2_MAX, 0.0285, 0.042, 0.16, 0.18, -25},
1063 {90, 11, (Nlm_FloatHi) INT2_MAX, 0.0271, 0.031, 0.14, 0.20, -28},
1064 {85, 11, (Nlm_FloatHi) INT2_MAX, 0.0256, 0.023, 0.10, 0.26, -46},
1065 {115, 10, (Nlm_FloatHi) INT2_MAX, 0.0308, 0.062, 0.22, 0.14, -20},
1066 {110, 10, (Nlm_FloatHi) INT2_MAX, 0.0302, 0.056, 0.19, 0.16, -26},
1067 {105, 10, (Nlm_FloatHi) INT2_MAX, 0.0296, 0.050, 0.17, 0.17, -27},
1068 {100, 10, (Nlm_FloatHi) INT2_MAX, 0.0286, 0.041, 0.15, 0.19, -32},
1069 {95, 10, (Nlm_FloatHi) INT2_MAX, 0.0272, 0.030, 0.13, 0.21, -35},
1070 {90, 10, (Nlm_FloatHi) INT2_MAX, 0.0257, 0.022, 0.11, 0.24, -40},
1071 {85, 10, (Nlm_FloatHi) INT2_MAX, 0.0242, 0.017, 0.083, 0.29, -51},
1072 {115, 9, (Nlm_FloatHi) INT2_MAX, 0.0306, 0.061, 0.24, 0.13, -14},
1073 {110, 9, (Nlm_FloatHi) INT2_MAX, 0.0299, 0.053, 0.19, 0.16, -23},
1074 {105, 9, (Nlm_FloatHi) INT2_MAX, 0.0289, 0.043, 0.17, 0.17, -23},
1075 {100, 9, (Nlm_FloatHi) INT2_MAX, 0.0279, 0.036, 0.14, 0.20, -31},
1076 {95, 9, (Nlm_FloatHi) INT2_MAX, 0.0266, 0.028, 0.12, 0.23, -37},
1077 {120, 8, (Nlm_FloatHi) INT2_MAX, 0.0307, 0.062, 0.22, 0.14, -18},
1078 {115, 8, (Nlm_FloatHi) INT2_MAX, 0.0300, 0.053, 0.20, 0.15, -19},
1079 {110, 8, (Nlm_FloatHi) INT2_MAX, 0.0292, 0.046, 0.17, 0.17, -23},
1080 {105, 8, (Nlm_FloatHi) INT2_MAX, 0.0280, 0.035, 0.14, 0.20, -31},
1081 {100, 8, (Nlm_FloatHi) INT2_MAX, 0.0266, 0.026, 0.12, 0.23, -37},
1082 {125, 7, (Nlm_FloatHi) INT2_MAX, 0.0306, 0.058, 0.22, 0.14, -18},
1083 {120, 7, (Nlm_FloatHi) INT2_MAX, 0.0300, 0.052, 0.19, 0.16, -23},
1084 {115, 7, (Nlm_FloatHi) INT2_MAX, 0.0292, 0.044, 0.17, 0.17, -24},
1085 {110, 7, (Nlm_FloatHi) INT2_MAX, 0.0279, 0.032, 0.14, 0.20, -31},
1086 {105, 7, (Nlm_FloatHi) INT2_MAX, 0.0267, 0.026, 0.11, 0.24, -41},
1087 {120,10,5, 0.0298, 0.049, 0.19, 0.16, -21},
1088 {115,10,5, 0.0290, 0.042, 0.16, 0.18, -25},
1089 {110,10,5, 0.0279, 0.033, 0.13, 0.21, -32},
1090 {105,10,5, 0.0264, 0.024, 0.10, 0.26, -46},
1091 {100,10,5, 0.0250, 0.018, 0.081, 0.31, -56},
1092 {125,10,4, 0.0301, 0.053, 0.18, 0.17, -25},
1093 {120,10,4, 0.0292, 0.043, 0.15, 0.20, -33},
1094 {115,10,4, 0.0282, 0.035, 0.13, 0.22, -36},
1095 {110,10,4, 0.0270, 0.027, 0.11, 0.25, -41},
1096 {105,10,4, 0.0254, 0.020, 0.079, 0.32, -60},
1097 {130,10,3, 0.0300, 0.051, 0.17, 0.18, -27},
1098 {125,10,3, 0.0290, 0.040, 0.13, 0.22, -38},
1099 {120,10,3, 0.0278, 0.030, 0.11, 0.25, -44},
1100 {115,10,3, 0.0267, 0.025, 0.092, 0.29, -52},
1101 {110,10,3, 0.0252, 0.018, 0.070, 0.36, -70},
1102 {135,10,2, 0.0292, 0.040, 0.13, 0.22, -35},
1103 {130,10,2, 0.0283, 0.034, 0.10, 0.28, -51},
1104 {125,10,2, 0.0269, 0.024, 0.077, 0.35, -71},
1105 {120,10,2, 0.0253, 0.017, 0.059, 0.43, -90},
1106 {115,10,2, 0.0234, 0.011, 0.043, 0.55, -121},
1107 {100,14,3, 0.0258, 0.023, 0.087, 0.33, -59},
1108 {105,13,3, 0.0263, 0.024, 0.085, 0.31, -57},
1109 {110,12,3, 0.0271, 0.028, 0.093, 0.29, -54},
1110 {115,11,3, 0.0275, 0.030, 0.10, 0.27, -49},
1111 {125,9,3, 0.0283, 0.034, 0.12, 0.23, -38},
1112 {130,8,3, 0.0287, 0.037, 0.12, 0.23, -40},
1113 {125,7,3, 0.0287, 0.036, 0.12, 0.24, -44},
1114 {140,6,3, 0.0285, 0.033, 0.12, 0.23, -40},
1115 {105,14,3, 0.0270, 0.028, 0.10, 0.27, -46},
1116 {110,13,3, 0.0279, 0.034, 0.10, 0.27, -50},
1117 {115,12,3, 0.0282, 0.035, 0.12, 0.24, -42},
1118 {120,11,3, 0.0286, 0.037, 0.12, 0.24, -44},
1119 };
1120
1121 static Int4 blosum62_20_prefs[BLOSUM62_20_VALUES_MAX] = {
1122 BLAST_MATRIX_NOMINAL,
1123 BLAST_MATRIX_NOMINAL,
1124 BLAST_MATRIX_NOMINAL,
1125 BLAST_MATRIX_NOMINAL,
1126 BLAST_MATRIX_NOMINAL,
1127 BLAST_MATRIX_NOMINAL,
1128 BLAST_MATRIX_NOMINAL,
1129 BLAST_MATRIX_NOMINAL,
1130 BLAST_MATRIX_NOMINAL,
1131 BLAST_MATRIX_NOMINAL,
1132 BLAST_MATRIX_NOMINAL,
1133 BLAST_MATRIX_NOMINAL,
1134 BLAST_MATRIX_NOMINAL,
1135 BLAST_MATRIX_NOMINAL,
1136 BLAST_MATRIX_NOMINAL,
1137 BLAST_MATRIX_NOMINAL,
1138 BLAST_MATRIX_NOMINAL,
1139 BLAST_MATRIX_NOMINAL,
1140 BLAST_MATRIX_NOMINAL,
1141 BLAST_MATRIX_NOMINAL,
1142 BLAST_MATRIX_NOMINAL,
1143 BLAST_MATRIX_NOMINAL,
1144 BLAST_MATRIX_NOMINAL,
1145 BLAST_MATRIX_NOMINAL,
1146 BLAST_MATRIX_NOMINAL,
1147 BLAST_MATRIX_NOMINAL,
1148 BLAST_MATRIX_NOMINAL,
1149 BLAST_MATRIX_NOMINAL,
1150 BLAST_MATRIX_NOMINAL,
1151 BLAST_MATRIX_NOMINAL,
1152 BLAST_MATRIX_NOMINAL,
1153 BLAST_MATRIX_NOMINAL,
1154 BLAST_MATRIX_NOMINAL,
1155 BLAST_MATRIX_NOMINAL,
1156 BLAST_MATRIX_NOMINAL,
1157 BLAST_MATRIX_NOMINAL,
1158 BLAST_MATRIX_NOMINAL,
1159 BLAST_MATRIX_NOMINAL,
1160 BLAST_MATRIX_NOMINAL,
1161 BLAST_MATRIX_NOMINAL,
1162 BLAST_MATRIX_NOMINAL,
1163 BLAST_MATRIX_NOMINAL,
1164 BLAST_MATRIX_NOMINAL,
1165 BLAST_MATRIX_NOMINAL,
1166 BLAST_MATRIX_NOMINAL,
1167 BLAST_MATRIX_BEST,
1168 BLAST_MATRIX_NOMINAL,
1169 BLAST_MATRIX_NOMINAL,
1170 BLAST_MATRIX_NOMINAL,
1171 BLAST_MATRIX_NOMINAL,
1172 BLAST_MATRIX_NOMINAL,
1173 BLAST_MATRIX_NOMINAL,
1174 BLAST_MATRIX_NOMINAL,
1175 BLAST_MATRIX_NOMINAL,
1176 BLAST_MATRIX_NOMINAL,
1177 BLAST_MATRIX_NOMINAL,
1178 BLAST_MATRIX_NOMINAL,
1179 BLAST_MATRIX_NOMINAL,
1180 BLAST_MATRIX_NOMINAL,
1181 BLAST_MATRIX_NOMINAL,
1182 BLAST_MATRIX_NOMINAL,
1183 BLAST_MATRIX_NOMINAL,
1184 BLAST_MATRIX_NOMINAL,
1185 BLAST_MATRIX_NOMINAL,
1186 BLAST_MATRIX_NOMINAL
1187 };
1188
1189 /*
1190 Allocates memory for the BLAST_ScoreBlkPtr.
1191 */
1192
1193 BLAST_ScoreBlkPtr LIBCALL
BLAST_ScoreBlkNew(Uint1 alphabet,Int2 number_of_contexts)1194 BLAST_ScoreBlkNew(Uint1 alphabet, Int2 number_of_contexts)
1195
1196 {
1197 BLAST_ScoreBlkPtr sbp;
1198 SeqCodeTablePtr sctp;
1199
1200 if (alphabet != BLASTNA_SEQ_CODE)
1201 {
1202 sctp = SeqCodeTableFindObj(alphabet);
1203 if (sctp == NULL)
1204 return NULL;
1205 }
1206
1207 sbp = (BLAST_ScoreBlkPtr) MemNew(sizeof(BLAST_ScoreBlk));
1208
1209 if (sbp != NULL)
1210 {
1211 sbp->alphabet_code = alphabet;
1212 if (alphabet != BLASTNA_SEQ_CODE)
1213 sbp->alphabet_size = sctp->num;
1214 else
1215 sbp->alphabet_size = BLASTNA_SIZE;
1216
1217 /* set the alphabet type (protein or not). */
1218 switch (alphabet) {
1219 case Seq_code_iupacaa:
1220 case Seq_code_ncbi8aa:
1221 case Seq_code_ncbieaa:
1222 case Seq_code_ncbipaa:
1223 case Seq_code_iupacaa3:
1224 return NULL;
1225
1226 case Seq_code_ncbistdaa:
1227 sbp->protein_alphabet = TRUE;
1228 break;
1229 case BLASTNA_SEQ_CODE:
1230 sbp->protein_alphabet = FALSE;
1231 break;
1232 default:
1233 break;
1234 }
1235
1236 sbp->matrix_struct = BlastMatrixAllocate(sbp->alphabet_size);
1237 if (sbp->matrix_struct == NULL)
1238 {
1239 sbp = BLAST_ScoreBlkDestruct(sbp);
1240 return sbp;
1241 }
1242 sbp->matrix = sbp->matrix_struct->matrix;
1243 sbp->maxscore = (BLAST_ScorePtr) MemNew(BLAST_MATRIX_SIZE*sizeof(BLAST_Score));
1244 sbp->number_of_contexts = number_of_contexts;
1245 sbp->sfp = MemNew(sbp->number_of_contexts*sizeof(BLAST_ScoreFreqPtr));
1246 sbp->kbp_std = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1247 sbp->kbp_gap_std = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1248 sbp->kbp_psi = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1249 sbp->kbp_gap_psi = MemNew(sbp->number_of_contexts*sizeof(BLAST_KarlinBlkPtr));
1250 }
1251
1252 return sbp;
1253 }
1254
1255 BLAST_ScoreBlkPtr LIBCALL
BLAST_ScoreBlkDestruct(BLAST_ScoreBlkPtr sbp)1256 BLAST_ScoreBlkDestruct(BLAST_ScoreBlkPtr sbp)
1257
1258 {
1259 Int4 index, rows;
1260 if (sbp == NULL)
1261 return NULL;
1262
1263 for (index=0; index<sbp->number_of_contexts; index++) {
1264 if (sbp->sfp)
1265 sbp->sfp[index] = BlastScoreFreqDestruct(sbp->sfp[index]);
1266 if (sbp->kbp_std)
1267 sbp->kbp_std[index] = BlastKarlinBlkDestruct(sbp->kbp_std[index]);
1268 if (sbp->kbp_gap_std)
1269 sbp->kbp_gap_std[index] = BlastKarlinBlkDestruct(sbp->kbp_gap_std[index]);
1270 if (sbp->kbp_psi)
1271 sbp->kbp_psi[index] = BlastKarlinBlkDestruct(sbp->kbp_psi[index]);
1272 if (sbp->kbp_gap_psi)
1273 sbp->kbp_gap_psi[index] = BlastKarlinBlkDestruct(sbp->kbp_gap_psi[index]);
1274 }
1275 if (sbp->kbp_ideal)
1276 sbp->kbp_ideal = BlastKarlinBlkDestruct(sbp->kbp_ideal);
1277 sbp->sfp = MemFree(sbp->sfp);
1278 sbp->kbp_std = MemFree(sbp->kbp_std);
1279 sbp->kbp_psi = MemFree(sbp->kbp_psi);
1280 sbp->kbp_gap_std = MemFree(sbp->kbp_gap_std);
1281 sbp->kbp_gap_psi = MemFree(sbp->kbp_gap_psi);
1282 sbp->matrix_struct = BlastMatrixDestruct(sbp->matrix_struct);
1283 sbp->maxscore = MemFree(sbp->maxscore);
1284 sbp->comments = ValNodeFreeData(sbp->comments);
1285 sbp->name = MemFree(sbp->name);
1286 sbp->ambiguous_res = MemFree(sbp->ambiguous_res);
1287
1288 /* Removing posMatrix and posFreqs if any */
1289 rows = sbp->query_length + 1;
1290 if(sbp->posMatrix != NULL) {
1291 for (index=0; index < rows; index++) {
1292 MemFree(sbp->posMatrix[index]);
1293 }
1294 MemFree(sbp->posMatrix);
1295 }
1296
1297 if(sbp->posFreqs != NULL) {
1298 for (index = 0; index < rows; index++) {
1299 MemFree(sbp->posFreqs[index]);
1300 }
1301 MemFree(sbp->posFreqs);
1302 }
1303
1304 sbp = MemFree(sbp);
1305
1306 return sbp;
1307 }
1308
1309 /*
1310 Set the ambiguous residue (e.g, 'N', 'X') in the BLAST_ScoreBlkPtr.
1311 Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first.
1312 */
1313 Int2 LIBCALL
BlastScoreSetAmbigRes(BLAST_ScoreBlkPtr sbp,Char ambiguous_res)1314 BlastScoreSetAmbigRes(BLAST_ScoreBlkPtr sbp, Char ambiguous_res)
1315
1316 {
1317 Int2 index;
1318 SeqMapTablePtr smtp;
1319 Uint1Ptr ambig_buffer;
1320
1321 if (sbp == NULL)
1322 return 1;
1323
1324 if (sbp->ambig_occupy >= sbp->ambig_size)
1325 {
1326 sbp->ambig_size += 5;
1327 ambig_buffer = MemNew(sbp->ambig_size*sizeof(Uint1));
1328 for (index=0; index<sbp->ambig_occupy; index++)
1329 {
1330 ambig_buffer[index] = sbp->ambiguous_res[index];
1331 }
1332 sbp->ambiguous_res = MemFree(sbp->ambiguous_res);
1333 sbp->ambiguous_res = ambig_buffer;
1334 }
1335
1336 if (sbp->alphabet_code == Seq_code_ncbistdaa)
1337 {
1338 smtp = SeqMapTableFind(sbp->alphabet_code, Seq_code_ncbieaa);
1339 sbp->ambiguous_res[sbp->ambig_occupy] = SeqMapTableConvert(smtp, ambiguous_res);
1340 }
1341 else if (sbp->alphabet_code == BLASTNA_SEQ_CODE)
1342 {
1343 smtp = SeqMapTableFind(Seq_code_ncbi4na, Seq_code_iupacna);
1344 sbp->ambiguous_res[sbp->ambig_occupy] = ncbi4na_to_blastna[SeqMapTableConvert(smtp, ambiguous_res)];
1345 }
1346 else if (sbp->alphabet_code == Seq_code_ncbi4na)
1347 {
1348 smtp = SeqMapTableFind(Seq_code_ncbi4na, Seq_code_iupacna);
1349 sbp->ambiguous_res[sbp->ambig_occupy] = SeqMapTableConvert(smtp, ambiguous_res);
1350 }
1351
1352 (sbp->ambig_occupy)++;
1353
1354
1355 return 0;
1356 }
1357
1358 /*
1359 Deallocates all data associated with the BLAST_MatrixPtr.
1360 */
1361 BLAST_MatrixPtr LIBCALL
BLAST_MatrixDestruct(BLAST_MatrixPtr blast_matrix)1362 BLAST_MatrixDestruct(BLAST_MatrixPtr blast_matrix)
1363
1364 {
1365 Int4 index;
1366
1367 if (blast_matrix == NULL)
1368 return NULL;
1369
1370 /* We may have 2 different matrices in there */
1371
1372 if(blast_matrix->original_matrix &&
1373 blast_matrix->original_matrix != blast_matrix->matrix) {
1374 /* blast_matrix->original_matrix is a square matrix with the
1375 same number of columns as blast_matrix->matrix. Therefore
1376 blast_matrix->original_matrix has blast_matrix->columns
1377 rows. */
1378 for (index=0; index < blast_matrix->columns; index++) {
1379 MemFree(blast_matrix->original_matrix[index]);
1380 }
1381 MemFree(blast_matrix->original_matrix);
1382 }
1383
1384 blast_matrix->name = MemFree(blast_matrix->name);
1385 if (blast_matrix->matrix) {
1386 for (index=0; index<blast_matrix->rows; index++) {
1387 MemFree(blast_matrix->matrix[index]);
1388 }
1389 MemFree(blast_matrix->matrix);
1390 }
1391
1392 if(blast_matrix->posFreqs != NULL) {
1393 for (index = 0; index < blast_matrix->rows; index++) {
1394 MemFree(blast_matrix->posFreqs[index]);
1395 }
1396 MemFree(blast_matrix->posFreqs);
1397 }
1398
1399 MemFree(blast_matrix);
1400
1401 return NULL;
1402 }
1403
1404
1405 /*
1406 Allocates and fills the BLAST_MatrixPtr.
1407 positionBased indicates that the posMatrix
1408 on BLAST_ScoreBlkPtr should be used rather
1409 than the 'normal' matrix.
1410 */
1411 BLAST_MatrixPtr LIBCALL
BLAST_MatrixFill(BLAST_ScoreBlkPtr sbp,Boolean positionBased)1412 BLAST_MatrixFill(BLAST_ScoreBlkPtr sbp, Boolean positionBased)
1413
1414 {
1415 BLAST_MatrixPtr blast_matrix;
1416 FloatHi karlinK = 0.0;
1417 Int4 index1, index2, dim1, dim2;
1418 Int4Ptr PNTR matrix, PNTR original_matrix;
1419 Nlm_FloatHi **posFreqs = NULL;
1420
1421 if (sbp == NULL)
1422 return NULL;
1423
1424 blast_matrix = (BLAST_MatrixPtr) MemNew(sizeof(BLAST_Matrix));
1425
1426 dim1 = sbp->alphabet_size;
1427 dim2 = sbp->alphabet_size;
1428 original_matrix = sbp->matrix;
1429
1430 if (sbp->kbp_gap_psi[0])
1431 karlinK = sbp->kbp_gap_psi[0]->K;
1432
1433 matrix = MemNew(dim1*sizeof(Int4Ptr));
1434 for (index1=0; index1<dim1; index1++) {
1435 matrix[index1] = MemNew(dim2*sizeof(Int4));
1436 for (index2=0; index2<dim2; index2++) {
1437 matrix[index1][index2] = original_matrix[index1][index2];
1438 }
1439 }
1440
1441 blast_matrix->original_matrix = matrix;
1442
1443 /* For PSI BLAST blast_matrix->matrix will be position based */
1444 if (sbp->posMatrix) {
1445 dim1 = sbp->query_length + 1;
1446 dim2 = sbp->alphabet_size;
1447 original_matrix = sbp->posMatrix;
1448 matrix = MemNew(dim1*sizeof(Int4Ptr));
1449 for (index1=0; index1<dim1; index1++) {
1450 matrix[index1] = MemNew(dim2*sizeof(Int4));
1451 for (index2=0; index2<dim2; index2++) {
1452 matrix[index1][index2] = original_matrix[index1][index2];
1453 }
1454 }
1455 }
1456 blast_matrix->matrix = matrix;
1457
1458 /* Copying posFreqs to the BLAST_Matrix */
1459 if ((sbp->posFreqs != NULL) && (sbp->posMatrix != NULL)) {
1460 posFreqs = MemNew(dim1*sizeof(Nlm_FloatHi *));
1461 for (index1 = 0; index1 < dim1; index1++) {
1462 posFreqs[index1] = MemNew(dim2*sizeof(Nlm_FloatHi));
1463 for (index2=0; index2 < dim2; index2++) {
1464 posFreqs[index1][index2] = sbp->posFreqs[index1][index2];
1465 }
1466 }
1467 }
1468
1469 blast_matrix->is_prot = sbp->protein_alphabet;
1470 blast_matrix->name = StringSave(sbp->name);
1471 blast_matrix->rows = dim1;
1472 blast_matrix->columns = dim2;
1473 blast_matrix->matrix = matrix;
1474 blast_matrix->posFreqs = posFreqs;
1475 blast_matrix->karlinK = karlinK;
1476
1477 return blast_matrix;
1478 }
1479
1480 /*
1481 Fill in the matrix for blastn using the penaly and rewards
1482
1483 The query sequence alphabet is blastna, the subject sequence
1484 is ncbi2na. The alphabet blastna is defined in blastkar.h
1485 and the first four elements of blastna are identical to ncbi2na.
1486
1487 The query is in the first index, the subject is the second.
1488 if matrix==NULL, it is allocated and returned.
1489 */
1490
BlastScoreBlkMatCreateEx(BLAST_ScorePtr PNTR matrix,BLAST_Score penalty,BLAST_Score reward)1491 BLAST_ScorePtr PNTR LIBCALL BlastScoreBlkMatCreateEx(BLAST_ScorePtr PNTR matrix,BLAST_Score penalty, BLAST_Score reward)
1492 {
1493
1494 Int2 index1, index2, degen;
1495 Int2 degeneracy[BLASTNA_SIZE+1];
1496
1497 if(!matrix) {
1498 BLASTMatrixStructurePtr matrix_struct;
1499 matrix_struct =BlastMatrixAllocate((Int2) BLASTNA_SIZE);
1500 matrix = matrix_struct->matrix;
1501 }
1502 for (index1 = 0; index1<BLASTNA_SIZE; index1++) /* blastna */
1503 for (index2 = 0; index2<BLASTNA_SIZE; index2++) /* blastna */
1504 matrix[index1][index2] = 0;
1505
1506 /* In blastna the 1st four bases are A, C, G, and T, exactly as it is ncbi2na. */
1507 /* ncbi4na gives them the value 1, 2, 4, and 8. */
1508
1509 /* Set the first four bases to degen. one */
1510 for (index1=0; index1<NUMBER_NON_AMBIG_BP; index1++)
1511 degeneracy[index1] = 1;
1512
1513 for (index1=NUMBER_NON_AMBIG_BP; index1<BLASTNA_SIZE; index1++) /* blastna */
1514 {
1515 degen=0;
1516 for (index2=0; index2<NUMBER_NON_AMBIG_BP; index2++) /* ncbi2na */
1517 {
1518 if (blastna_to_ncbi4na[index1] & blastna_to_ncbi4na[index2])
1519 degen++;
1520 }
1521 degeneracy[index1] = degen;
1522 }
1523
1524
1525 for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
1526 {
1527 for (index2=index1; index2<BLASTNA_SIZE; index2++) /* blastna */
1528 {
1529 if (blastna_to_ncbi4na[index1] & blastna_to_ncbi4na[index2])
1530 { /* round up for positive scores, down for negatives. */
1531 matrix[index1][index2] = Nlm_Nint( (double) ((degeneracy[index2]-1)*penalty + reward))/degeneracy[index2];
1532 if (index1 != index2)
1533 {
1534 matrix[index2][index1] = matrix[index1][index2];
1535 }
1536 }
1537 else
1538 {
1539 matrix[index1][index2] = penalty;
1540 matrix[index2][index1] = penalty;
1541 }
1542 }
1543 }
1544
1545 /* The value of 15 is a gap, which is a sentinel between strands in
1546 the ungapped extension algorithm */
1547 for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
1548 matrix[BLASTNA_SIZE-1][index1] = INT4_MIN / 2;
1549 for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
1550 matrix[index1][BLASTNA_SIZE-1] = INT4_MIN / 2;
1551
1552 return matrix;
1553 }
1554 /*
1555 Fill in the matrix for blastn using the penaly and rewards on
1556 the BLAST_ScoreBlkPtr.
1557
1558 The query sequence alphabet is blastna, the subject sequence
1559 is ncbi2na. The alphabet blastna is defined in blastkar.h
1560 and the first four elements of blastna are identical to ncbi2na.
1561
1562 The query is in the first index, the subject is the second.
1563 */
1564
1565
BlastScoreBlkMatCreate(BLAST_ScoreBlkPtr sbp)1566 static Int2 BlastScoreBlkMatCreate(BLAST_ScoreBlkPtr sbp)
1567 {
1568 Char buffer[25];
1569
1570 sbp->matrix = BlastScoreBlkMatCreateEx(sbp->matrix,sbp->penalty, sbp->reward);
1571 sbp->mat_dim1 = BLASTNA_SIZE;
1572 sbp->mat_dim2 = BLASTNA_SIZE;
1573
1574 sprintf(buffer, "blastn matrix:%ld %ld", (long) sbp->reward, (long) sbp->penalty);
1575 sbp->name = StringSave(buffer);
1576
1577 return 0;
1578 }
1579
1580
1581
1582
1583
1584 /*
1585 This function fills in the BLAST_ScoreBlk structure.
1586 Tasks are:
1587 -read in the matrix
1588 -set maxscore
1589 */
1590
1591 Int2 LIBCALL
BlastScoreBlkMatFill(BLAST_ScoreBlkPtr sbp,CharPtr matrix)1592 BlastScoreBlkMatFill(BLAST_ScoreBlkPtr sbp, CharPtr matrix)
1593
1594 {
1595 Char string[PATH_MAX] = "", alphabet_type[3] = "";
1596 CharPtr matrix_dir = NULL;
1597 Int2 status = 0;
1598 FILE *fp = NULL;
1599
1600 if (sbp->read_in_matrix) {
1601 /* Convert matrix name to upper case. */
1602 matrix = Nlm_StrUpper(matrix);
1603
1604 sbp->name = StringSave(matrix); /* Save the name of the matrix. */
1605
1606 /* 1. Try local directory */
1607 if(FileLength(matrix) > 0)
1608 fp = FileOpen(matrix, "r");
1609
1610 /* 2. Try configuration file */
1611 if (fp == NULL) {
1612 if (sbp->protein_alphabet)
1613 Nlm_StringNCpy(alphabet_type, "aa", 2);
1614 else
1615 Nlm_StringNCpy(alphabet_type, "nt", 2);
1616 alphabet_type[2] = NULLB;
1617
1618 if(FindPath("ncbi", "ncbi", "data", string, PATH_MAX)) {
1619 matrix_dir = StringSave(string);
1620 sprintf(string, "%s%s", matrix_dir, matrix);
1621 if(FileLength(string) > 0) {
1622 fp = FileOpen(string, "r");
1623 } else {
1624 sprintf(string, "%s%s%s%s", matrix_dir,
1625 alphabet_type, DIRDELIMSTR, matrix);
1626 if(FileLength(string) > 0)
1627 fp = FileOpen(string, "r");
1628 }
1629 matrix_dir = MemFree(matrix_dir);
1630 }
1631 }
1632 /* Trying to use local "data" directory */
1633
1634 if(fp == NULL) {
1635 sprintf(string, "data%s%s", DIRDELIMSTR, matrix);
1636 if(FileLength(string) > 0)
1637 fp = FileOpen(string, "r");
1638 }
1639
1640 /* Get the matrix locations from the environment for UNIX. */
1641 if (fp == NULL) {
1642
1643 matrix_dir = getenv("BLASTMAT");
1644 if (matrix_dir != NULL) {
1645 sprintf(string, "%s%s%s%s%s", matrix_dir, DIRDELIMSTR,alphabet_type, DIRDELIMSTR, matrix);
1646 }
1647
1648 if(FileLength(string) > 0)
1649 fp = FileOpen(string, "r");
1650
1651 /* Try again without "aa" or "nt" */
1652 if (fp == NULL) {
1653 if (matrix_dir != NULL) {
1654 sprintf(string, "%s%s%s", matrix_dir, DIRDELIMSTR, matrix);
1655 }
1656
1657 if(FileLength(string) > 0)
1658 fp = FileOpen(string, "r");
1659 }
1660 }
1661
1662 if (fp == NULL) {
1663 ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", matrix);
1664 return 4;
1665 }
1666
1667 if((status=BlastScoreBlkMatRead(sbp, fp)) != 0) {
1668 FileClose(fp);
1669 return status;
1670 }
1671 FileClose(fp);
1672 } else {
1673 if((status=BlastScoreBlkMatCreate(sbp)) != 0)
1674 return status;
1675 }
1676
1677 if((status=BlastScoreBlkMaxScoreSet(sbp)) != 0)
1678 return status;
1679
1680 return status;
1681 }
1682
1683 /*
1684 Return the specified matrix. Do this by setting up the ScoreBlkPtr
1685 and then fetching the matrix from disk.
1686 */
1687
1688 BLAST_MatrixPtr LIBCALL
BLAST_MatrixFetch(CharPtr matrix_name)1689 BLAST_MatrixFetch(CharPtr matrix_name)
1690
1691 {
1692 BLAST_MatrixPtr matrix;
1693 BLAST_ScoreBlkPtr sbp;
1694
1695 if (matrix_name == NULL)
1696 return NULL;
1697
1698 sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa, 1);
1699
1700 /* Read in for protein. */
1701 sbp->read_in_matrix = TRUE;
1702
1703 BlastScoreBlkMatFill(sbp, matrix_name);
1704
1705 matrix = BLAST_MatrixFill(sbp, FALSE);
1706
1707 sbp = BLAST_ScoreBlkDestruct(sbp);
1708
1709 return matrix;
1710 }
1711
1712
1713 /*
1714 Calculate the Karlin parameters. This function should be called once
1715 for each context, or frame translated.
1716
1717 The rfp and stdrfp are calculated for each context, this should be
1718 fixed.
1719 */
1720
1721 Int2 LIBCALL
BlastScoreBlkFill(BLAST_ScoreBlkPtr sbp,CharPtr query,Int4 query_length,Int2 context_number)1722 BlastScoreBlkFill(BLAST_ScoreBlkPtr sbp, CharPtr query, Int4 query_length, Int2 context_number)
1723 {
1724 BLAST_ResFreqPtr rfp, stdrfp;
1725 Int2 retval=0;
1726
1727 rfp = BlastResFreqNew(sbp);
1728 stdrfp = BlastResFreqNew(sbp);
1729 BlastResFreqStdComp(sbp, stdrfp);
1730 BlastResFreqString(sbp, rfp, query, query_length);
1731 sbp->sfp[context_number] = BlastScoreFreqNew(sbp->loscore, sbp->hiscore);
1732 BlastScoreFreqCalc(sbp, sbp->sfp[context_number], rfp, stdrfp);
1733 sbp->kbp_std[context_number] = BlastKarlinBlkCreate();
1734 retval = BlastKarlinBlkCalc(sbp->kbp_std[context_number], sbp->sfp[context_number]);
1735 if (retval)
1736 {
1737 rfp = BlastResFreqDestruct(rfp);
1738 stdrfp = BlastResFreqDestruct(stdrfp);
1739 return retval;
1740 }
1741 sbp->kbp_psi[context_number] = BlastKarlinBlkCreate();
1742 retval = BlastKarlinBlkCalc(sbp->kbp_psi[context_number], sbp->sfp[context_number]);
1743 rfp = BlastResFreqDestruct(rfp);
1744 stdrfp = BlastResFreqDestruct(stdrfp);
1745
1746 return retval;
1747 }
1748
1749 /*
1750 Calculates the standard Karlin parameters. This is used
1751 if the query is translated and the calculated (real) Karlin
1752 parameters are bad, as they're calculated for non-coding regions.
1753 */
1754
1755 BLAST_KarlinBlkPtr LIBCALL
BlastKarlinBlkStandardCalcEx(BLAST_ScoreBlkPtr sbp)1756 BlastKarlinBlkStandardCalcEx(BLAST_ScoreBlkPtr sbp)
1757
1758 {
1759 BLAST_KarlinBlkPtr kbp_ideal;
1760 BLAST_ResFreqPtr stdrfp;
1761 BLAST_ScoreFreqPtr sfp;
1762
1763 stdrfp = BlastResFreqNew(sbp);
1764 BlastResFreqStdComp(sbp, stdrfp);
1765 sfp = BlastScoreFreqNew(sbp->loscore, sbp->hiscore);
1766 BlastScoreFreqCalc(sbp, sfp, stdrfp, stdrfp);
1767 kbp_ideal = BlastKarlinBlkCreate();
1768 BlastKarlinBlkCalc(kbp_ideal, sfp);
1769 stdrfp = BlastResFreqDestruct(stdrfp);
1770
1771 sfp = BlastScoreFreqDestruct(sfp);
1772
1773 return kbp_ideal;
1774 }
1775
1776 Int2 LIBCALL
BlastKarlinBlkStandardCalc(BLAST_ScoreBlkPtr sbp,Int2 context_start,Int2 context_end)1777 BlastKarlinBlkStandardCalc(BLAST_ScoreBlkPtr sbp, Int2 context_start, Int2 context_end)
1778 {
1779
1780 BLAST_KarlinBlkPtr kbp_ideal, kbp;
1781 Int2 index;
1782
1783 kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
1784 /* Replace the calculated values with ideal ones for blastx, tblastx. */
1785 for (index=context_start; index<=context_end; index++)
1786 {
1787 kbp = sbp->kbp[index];
1788 if (kbp->Lambda >= kbp_ideal->Lambda)
1789 {
1790 kbp->Lambda = kbp_ideal->Lambda;
1791 kbp->K = kbp_ideal->K;
1792 kbp->logK = kbp_ideal->logK;
1793 kbp->H = kbp_ideal->H;
1794 }
1795 }
1796 kbp_ideal = BlastKarlinBlkDestruct(kbp_ideal);
1797
1798 return 0;
1799 }
1800
1801 /*
1802 Creates the Karlin Block.
1803 */
1804
1805 BLAST_KarlinBlkPtr LIBCALL
BlastKarlinBlkCreate(void)1806 BlastKarlinBlkCreate(void)
1807
1808 {
1809 BLAST_KarlinBlkPtr kbp;
1810
1811 kbp = (BLAST_KarlinBlkPtr) MemNew(sizeof(BLAST_KarlinBlk));
1812
1813 return kbp;
1814 }
1815
1816 /*
1817 Deallocates the Karlin Block.
1818 */
1819
1820 BLAST_KarlinBlkPtr LIBCALL
BlastKarlinBlkDestruct(BLAST_KarlinBlkPtr kbp)1821 BlastKarlinBlkDestruct(BLAST_KarlinBlkPtr kbp)
1822
1823 {
1824 kbp = MemFree(kbp);
1825
1826 return kbp;
1827 }
1828
1829
1830 /*
1831 Read in the matrix from the FILE *fp.
1832
1833 This function ASSUMES that the matrices are in the ncbistdaa
1834 format. BLAST should be able to use any alphabet, though it
1835 is expected that ncbistdaa will be used.
1836 */
1837
1838 static Char ASCII_TO_BLASTNA_CONVERT[128]={
1839 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1840 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1841 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1842 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1843 15, 0,10, 1,11,15,15, 2,12,15,15, 7,15, 6,14,15,
1844 15,15, 4, 9, 3,15,13, 8,15, 5,15,15,15,15,15,15,
1845 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
1846 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
1847
1848 #define X_CODE 21
1849 #define U_CODE 24
1850
1851 Int2 LIBCALL
BlastScoreBlkMatRead(BLAST_ScoreBlkPtr sbp,FILE * fp)1852 BlastScoreBlkMatRead(BLAST_ScoreBlkPtr sbp, FILE *fp)
1853 {
1854 Char buf[512+3];
1855 Char temp[512];
1856 CharPtr cp, lp;
1857 Char ch;
1858 BLAST_ScorePtr PNTR matrix;
1859 BLAST_ScorePtr m;
1860 BLAST_Score score;
1861 Int2 a1cnt = 0, a2cnt = 0;
1862 Char a1chars[BLAST_MAX_ALPHABET], a2chars[BLAST_MAX_ALPHABET];
1863 long lineno = 0;
1864 Nlm_FloatHi xscore;
1865 register int index1, index2, total;
1866 SeqCodeTablePtr sctp;
1867 SeqMapTablePtr smtp=NULL;
1868 Int2 status;
1869 static TNlmMutex read_matrix_mutex;
1870
1871 NlmMutexInit(&read_matrix_mutex);
1872 NlmMutexLock(read_matrix_mutex);
1873
1874 matrix = sbp->matrix;
1875
1876 if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
1877 sctp = SeqCodeTableFindObj(sbp->alphabet_code);
1878 if(sctp == NULL) {
1879 NlmMutexUnlock(read_matrix_mutex);
1880 return 1;
1881 }
1882
1883 total = sctp->start_at + sctp->num;
1884 for (index1 = sctp->start_at; index1 < total; index1++)
1885 for (index2 = sctp->start_at; index2 < total; index2++)
1886 matrix[index1][index2] = BLAST_SCORE_MIN;
1887
1888 if (sbp->alphabet_code != Seq_code_ncbieaa) {
1889 smtp = SeqMapTableFind(sbp->alphabet_code, Seq_code_ncbieaa);
1890 if (smtp == NULL) {
1891 NlmMutexUnlock(read_matrix_mutex);
1892 return 1;
1893 }
1894 }
1895 } else {
1896 /* Fill-in all the defaults ambiguity and normal codes */
1897 status=BlastScoreBlkMatCreate(sbp);
1898 if(status != 0)
1899 {
1900 NlmMutexUnlock(read_matrix_mutex);
1901 return status;
1902 }
1903 }
1904
1905 /* Read the residue names for the second alphabet */
1906 while (Nlm_FileGets(buf, sizeof(buf), fp) != NULL) {
1907 ++lineno;
1908 if (Nlm_StrChr(buf, '\n') == NULL) {
1909 NlmMutexUnlock(read_matrix_mutex);
1910 return 2;
1911 }
1912
1913 if (buf[0] == COMMENT_CHR) {
1914 /* save the comment line in a linked list */
1915 *Nlm_StrChr(buf, '\n') = NULLB;
1916 ValNodeCopyStr(&sbp->comments, 0, buf+1);
1917 continue;
1918 }
1919 if ((cp = Nlm_StrChr(buf, COMMENT_CHR)) != NULL)
1920 *cp = NULLB;
1921 lp = (CharPtr)Nlm_StrTok(buf, TOKSTR);
1922 if (lp == NULL) /* skip blank lines */
1923 continue;
1924 while (lp != NULL) {
1925
1926 if (smtp)
1927 ch = SeqMapTableConvert(smtp, *lp);
1928 else {
1929 if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
1930 ch = *lp;
1931 } else {
1932 ch = ASCII_TO_BLASTNA_CONVERT[toupper(*lp)];
1933 }
1934 }
1935 a2chars[a2cnt++] = ch;
1936 lp = (CharPtr)Nlm_StrTok(NULL, TOKSTR);
1937 }
1938
1939 break; /* Exit loop after reading one line. */
1940 }
1941
1942 if (a2cnt <= 1) {
1943 NlmMutexUnlock(read_matrix_mutex);
1944 return 2;
1945 }
1946
1947 if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
1948 sbp->mat_dim2 = a2cnt;
1949 }
1950 while (Nlm_FileGets(buf, sizeof(buf), fp) != NULL) {
1951 ++lineno;
1952 if ((cp = Nlm_StrChr(buf, '\n')) == NULL) {
1953 NlmMutexUnlock(read_matrix_mutex);
1954 return 2;
1955 }
1956 if ((cp = Nlm_StrChr(buf, COMMENT_CHR)) != NULL)
1957 *cp = NULLB;
1958 if ((lp = (CharPtr)Nlm_StrTok(buf, TOKSTR)) == NULL)
1959 continue;
1960 ch = *lp;
1961 cp = (CharPtr) lp;
1962 if ((cp = Nlm_StrTok(NULL, TOKSTR)) == NULL) {
1963 NlmMutexUnlock(read_matrix_mutex);
1964 return 2;
1965 }
1966 if (a1cnt >= DIM(a1chars)) {
1967 NlmMutexUnlock(read_matrix_mutex);
1968 return 2;
1969 }
1970
1971 if (smtp) {
1972 ch = SeqMapTableConvert(smtp, ch);
1973
1974 } else {
1975 if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
1976 ch = ASCII_TO_BLASTNA_CONVERT[toupper(ch)];
1977 }
1978 }
1979 a1chars[a1cnt++] = ch;
1980 m = &matrix[(int)ch][0];
1981 index2 = 0;
1982 while (cp != NULL) {
1983 if (index2 >= a2cnt) {
1984 NlmMutexUnlock(read_matrix_mutex);
1985 return 2;
1986 }
1987 Nlm_StrCpy(temp, cp);
1988
1989 if (Nlm_StrICmp(temp, "na") == 0) {
1990 score = BLAST_SCORE_1MIN;
1991 } else {
1992 if (sscanf(temp, "%lg", &xscore) != 1) {
1993 NlmMutexUnlock(read_matrix_mutex);
1994 return 2;
1995 }
1996 /*xscore = MAX(xscore, BLAST_SCORE_1MIN);*/
1997 if (xscore > BLAST_SCORE_1MAX || xscore < BLAST_SCORE_1MIN) {
1998 NlmMutexUnlock(read_matrix_mutex);
1999 return 2;
2000 }
2001 xscore += (xscore >= 0. ? 0.5 : -0.5);
2002 score = (BLAST_Score)xscore;
2003 }
2004
2005 m[(int)a2chars[index2++]] = score;
2006
2007 cp = Nlm_StrTok(NULL, TOKSTR);
2008 }
2009 }
2010
2011 if (a1cnt <= 1) {
2012 NlmMutexUnlock(read_matrix_mutex);
2013 return 2;
2014 }
2015
2016 if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
2017 sbp->mat_dim1 = a1cnt;
2018
2019 /* For protein matrices, copy the X scores to the
2020 U scores. Keeping the U scores as BLAST_SCORE_MIN
2021 means that U never aligns with another letter,
2022 even another U */
2023 for (index1 = 0; index1 < a2cnt; index1++) {
2024 matrix[U_CODE][index1] = matrix[X_CODE][index1];
2025 matrix[index1][U_CODE] = matrix[index1][X_CODE];
2026 }
2027 }
2028
2029 NlmMutexUnlock(read_matrix_mutex);
2030 return 0;
2031 }
2032
2033 Int2 LIBCALL
BlastScoreBlkMaxScoreSet(BLAST_ScoreBlkPtr sbp)2034 BlastScoreBlkMaxScoreSet(BLAST_ScoreBlkPtr sbp)
2035 {
2036 BLAST_Score score, maxscore;
2037 BLAST_ScorePtr PNTR matrix;
2038 Int2 index1, index2;
2039
2040 sbp->loscore = BLAST_SCORE_1MAX;
2041 sbp->hiscore = BLAST_SCORE_1MIN;
2042 matrix = sbp->matrix;
2043 for (index1=0; index1<sbp->alphabet_size; index1++)
2044 {
2045 maxscore=BLAST_SCORE_MIN;
2046 for (index2=0; index2<sbp->alphabet_size; index2++)
2047 {
2048 score = matrix[index1][index2];
2049 if (score <= BLAST_SCORE_MIN || score >= BLAST_SCORE_MAX)
2050 continue;
2051 if (score > maxscore)
2052 {
2053 maxscore = score;
2054 }
2055 if (sbp->loscore > score)
2056 sbp->loscore = score;
2057 if (sbp->hiscore < score)
2058 sbp->hiscore = score;
2059 }
2060 sbp->maxscore[index1] = maxscore;
2061 }
2062 /* If the lo/hi-scores are BLAST_SCORE_MIN/BLAST_SCORE_MAX, (i.e., for
2063 gaps), then use other scores. */
2064
2065 if (sbp->loscore < BLAST_SCORE_1MIN)
2066 sbp->loscore = BLAST_SCORE_1MIN;
2067 if (sbp->hiscore > BLAST_SCORE_1MAX)
2068 sbp->hiscore = BLAST_SCORE_1MAX;
2069
2070 return 0;
2071 }
2072
2073 /* maxscore for PSI Blast depends not on the residue, but on the position
2074 in the posMatrix, so maxscore array has size of the length of posMatrix */
2075 /* SSH */
BlastPSIMaxScoreGet(BLAST_ScorePtr PNTR posMatrix,Int4 start,Int4 length)2076 BLAST_ScorePtr BlastPSIMaxScoreGet(BLAST_ScorePtr PNTR posMatrix,
2077 Int4 start, Int4 length)
2078 {
2079 BLAST_Score score, maxscore;
2080 BLAST_ScorePtr maxscore_pos;
2081 Int4 index1, index2;
2082
2083 if(posMatrix == NULL)
2084 return NULL;
2085
2086 maxscore_pos = MemNew(length * sizeof(BLAST_Score));
2087
2088 for (index1 = start; index1 < length; index1++) {
2089 maxscore = BLAST_SCORE_MIN;
2090 for (index2 = 0; index2 < PSI_ALPHABET_SIZE; index2++) {
2091 score = posMatrix[index1][index2];
2092 if (score <= BLAST_SCORE_MIN || score >= BLAST_SCORE_MAX)
2093 continue;
2094 if (score > maxscore) {
2095 maxscore = score;
2096 }
2097 }
2098
2099 maxscore_pos[index1] = maxscore;
2100 }
2101 return maxscore_pos;
2102 }
2103
2104 static BLASTMatrixStructurePtr
BlastMatrixAllocate(Int2 alphabet_size)2105 BlastMatrixAllocate(Int2 alphabet_size)
2106
2107 {
2108 BLASTMatrixStructurePtr matrix_struct;
2109 Int2 index;
2110
2111 if (alphabet_size <= 0 || alphabet_size >= BLAST_MATRIX_SIZE)
2112 return NULL;
2113
2114 matrix_struct = (BLASTMatrixStructurePtr) MemNew(sizeof(BLASTMatrixStructure));
2115
2116 if (matrix_struct == NULL)
2117 return NULL;
2118
2119 for (index=0; index<BLAST_MATRIX_SIZE-1; index++)
2120 {
2121 matrix_struct->matrix[index] = matrix_struct->long_matrix + index*BLAST_MATRIX_SIZE;
2122 }
2123
2124 return matrix_struct;
2125 }
2126
2127 static BLASTMatrixStructurePtr
BlastMatrixDestruct(BLASTMatrixStructurePtr matrix_struct)2128 BlastMatrixDestruct(BLASTMatrixStructurePtr matrix_struct)
2129
2130 {
2131
2132 if (matrix_struct == NULL)
2133 return NULL;
2134
2135 matrix_struct = MemFree(matrix_struct);
2136
2137 return matrix_struct;
2138 }
2139
2140 /*
2141 Allocated the BLAST_ResCompPtr for a given alphabet. Only the
2142 alphabets ncbistdaa and ncbi4na should be used by BLAST.
2143 */
2144
2145 BLAST_ResCompPtr LIBCALL
BlastResCompNew(BLAST_ScoreBlkPtr sbp)2146 BlastResCompNew(BLAST_ScoreBlkPtr sbp)
2147 {
2148 BLAST_ResCompPtr rcp;
2149
2150 rcp = (BLAST_ResCompPtr) MemNew(sizeof(BLAST_ResComp));
2151 if (rcp == NULL)
2152 return NULL;
2153
2154 rcp->alphabet_code = sbp->alphabet_code;
2155
2156 /* comp0 has zero offset, comp starts at sctp->start_at, only one
2157 array is allocated. */
2158 rcp->comp0 = (Int4Ptr) MemNew(BLAST_MATRIX_SIZE*sizeof(Int4));
2159 if (rcp->comp0 == NULL)
2160 {
2161 rcp = BlastResCompDestruct(rcp);
2162 return rcp;
2163 }
2164
2165 rcp->comp = rcp->comp0 - sbp->alphabet_start;
2166
2167 return rcp;
2168 }
2169
2170
2171 BLAST_ResCompPtr LIBCALL
BlastResCompDestruct(BLAST_ResCompPtr rcp)2172 BlastResCompDestruct(BLAST_ResCompPtr rcp)
2173 {
2174 if (rcp == NULL)
2175 return NULL;
2176
2177 if (rcp->comp0 != NULL)
2178 rcp->comp0 = MemFree(rcp->comp0);
2179
2180 return MemFree(rcp);
2181 }
2182
2183 /*
2184 Store the composition of a (query) string.
2185 */
2186 Int2 LIBCALL
BlastResCompStr(BLAST_ScoreBlkPtr sbp,BLAST_ResCompPtr rcp,CharPtr str,Int4 length)2187 BlastResCompStr(BLAST_ScoreBlkPtr sbp, BLAST_ResCompPtr rcp, CharPtr str, Int4 length)
2188 {
2189 CharPtr lp, lpmax;
2190 Int2 index;
2191 Uint1 mask;
2192
2193 if (sbp == NULL || rcp == NULL || str == NULL)
2194 return 1;
2195
2196 if (rcp->alphabet_code != sbp->alphabet_code)
2197 return 1;
2198
2199 /* For megablast, check only the first 4 bits of the sequence values */
2200 mask = (sbp->protein_alphabet ? 0xff : 0x0f);
2201
2202 /* comp0 starts at zero and extends for "num", comp is the same array, but
2203 "start_at" offset. */
2204 for (index=0; index<(sbp->alphabet_size); index++)
2205 rcp->comp0[index] = 0;
2206
2207 for (lp = str, lpmax = lp+length; lp < lpmax; lp++)
2208 {
2209 ++rcp->comp[(int)(*lp & mask)];
2210 }
2211
2212 /* Don't count ambig. residues. */
2213 for (index=0; index<sbp->ambig_occupy; index++)
2214 {
2215 rcp->comp[sbp->ambiguous_res[index]] = 0;
2216 }
2217
2218 return 0;
2219 }
2220
2221 static Int2
BlastScoreChk(BLAST_Score lo,BLAST_Score hi)2222 BlastScoreChk(BLAST_Score lo, BLAST_Score hi)
2223 {
2224 if (lo >= 0 || hi <= 0 ||
2225 lo < BLAST_SCORE_1MIN || hi > BLAST_SCORE_1MAX)
2226 return 1;
2227
2228 if (hi - lo > BLAST_SCORE_RANGE_MAX)
2229 return 1;
2230
2231 return 0;
2232 }
2233
2234 BLAST_ScoreFreqPtr
BlastScoreFreqNew(BLAST_Score score_min,BLAST_Score score_max)2235 BlastScoreFreqNew(BLAST_Score score_min, BLAST_Score score_max)
2236 {
2237 BLAST_ScoreFreqPtr sfp;
2238 BLAST_Score range;
2239
2240 if (BlastScoreChk(score_min, score_max) != 0)
2241 return NULL;
2242
2243 sfp = (BLAST_ScoreFreqPtr) MemNew(sizeof(BLAST_ScoreFreq));
2244 if (sfp == NULL)
2245 return NULL;
2246
2247 range = score_max - score_min + 1;
2248 sfp->sprob = (Nlm_FloatHi PNTR) MemNew(sizeof(Nlm_FloatHi) * range);
2249 if (sfp->sprob == NULL)
2250 {
2251 BlastScoreFreqDestruct(sfp);
2252 return NULL;
2253 }
2254
2255 sfp->sprob0 = sfp->sprob;
2256 sfp->sprob -= score_min;
2257 sfp->score_min = score_min;
2258 sfp->score_max = score_max;
2259 sfp->obs_min = sfp->obs_max = 0;
2260 sfp->score_avg = 0.0;
2261 return sfp;
2262 }
2263
2264 BLAST_ScoreFreqPtr
BlastScoreFreqDestruct(BLAST_ScoreFreqPtr sfp)2265 BlastScoreFreqDestruct(BLAST_ScoreFreqPtr sfp)
2266 {
2267 if (sfp == NULL)
2268 return NULL;
2269
2270 if (sfp->sprob0 != NULL)
2271 sfp->sprob0 = MemFree(sfp->sprob0);
2272 sfp = MemFree(sfp);
2273 return sfp;
2274 }
2275
2276 static Int2
BlastScoreFreqCalc(BLAST_ScoreBlkPtr sbp,BLAST_ScoreFreqPtr sfp,BLAST_ResFreqPtr rfp1,BLAST_ResFreqPtr rfp2)2277 BlastScoreFreqCalc(BLAST_ScoreBlkPtr sbp, BLAST_ScoreFreqPtr sfp, BLAST_ResFreqPtr rfp1, BLAST_ResFreqPtr rfp2)
2278 {
2279 BLAST_ScorePtr PNTR matrix;
2280 BLAST_Score score, obs_min, obs_max;
2281 Nlm_FloatHi score_sum, score_avg;
2282 Int2 alphabet_start, alphabet_end, index1, index2;
2283
2284 if (sbp == NULL || sfp == NULL)
2285 return 1;
2286
2287 if (sbp->loscore < sfp->score_min || sbp->hiscore > sfp->score_max)
2288 return 1;
2289
2290 for (score = sfp->score_min; score <= sfp->score_max; score++)
2291 sfp->sprob[score] = 0.0;
2292
2293 matrix = sbp->matrix;
2294
2295 alphabet_start = sbp->alphabet_start;
2296 alphabet_end = alphabet_start + sbp->alphabet_size;
2297 for (index1=alphabet_start; index1<alphabet_end; index1++)
2298 {
2299 for (index2=alphabet_start; index2<alphabet_end; index2++)
2300 {
2301 score = matrix[index1][index2];
2302 if (score >= sbp->loscore)
2303 {
2304 sfp->sprob[score] += rfp1->prob[index1] * rfp2->prob[index2];
2305 }
2306 }
2307 }
2308
2309 score_sum = 0.;
2310 obs_min = obs_max = BLAST_SCORE_MIN;
2311 for (score = sfp->score_min; score <= sfp->score_max; score++)
2312 {
2313 if (sfp->sprob[score] > 0.)
2314 {
2315 score_sum += sfp->sprob[score];
2316 obs_max = score;
2317 if (obs_min == BLAST_SCORE_MIN)
2318 obs_min = score;
2319 }
2320 }
2321 sfp->obs_min = obs_min;
2322 sfp->obs_max = obs_max;
2323
2324 score_avg = 0.0;
2325 if (score_sum > 0.0001 || score_sum < -0.0001)
2326 {
2327 for (score = obs_min; score <= obs_max; score++)
2328 {
2329 sfp->sprob[score] /= score_sum;
2330 score_avg += score * sfp->sprob[score];
2331 }
2332 }
2333 sfp->score_avg = score_avg;
2334
2335 return 0;
2336 }
2337
2338 typedef struct {
2339 Char ch;
2340 Nlm_FloatHi p;
2341 } BLAST_LetterProb;
2342
2343 #define STD_AMINO_ACID_FREQS Robinson_prob
2344
2345 #if STD_AMINO_ACID_FREQS == Dayhoff_prob
2346 /* M. O. Dayhoff amino acid background frequencies */
2347 static BLAST_LetterProb Dayhoff_prob[] = {
2348 { 'A', 87.13 },
2349 { 'C', 33.47 },
2350 { 'D', 46.87 },
2351 { 'E', 49.53 },
2352 { 'F', 39.77 },
2353 { 'G', 88.61 },
2354 { 'H', 33.62 },
2355 { 'I', 36.89 },
2356 { 'K', 80.48 },
2357 { 'L', 85.36 },
2358 { 'M', 14.75 },
2359 { 'N', 40.43 },
2360 { 'P', 50.68 },
2361 { 'Q', 38.26 },
2362 { 'R', 40.90 },
2363 { 'S', 69.58 },
2364 { 'T', 58.54 },
2365 { 'V', 64.72 },
2366 { 'W', 10.49 },
2367 { 'Y', 29.92 }
2368 };
2369 #endif
2370
2371 #if STD_AMINO_ACID_FREQS == Altschul_prob
2372 /* Stephen Altschul amino acid background frequencies */
2373 static BLAST_LetterProb Altschul_prob[] = {
2374 { 'A', 81.00 },
2375 { 'C', 15.00 },
2376 { 'D', 54.00 },
2377 { 'E', 61.00 },
2378 { 'F', 40.00 },
2379 { 'G', 68.00 },
2380 { 'H', 22.00 },
2381 { 'I', 57.00 },
2382 { 'K', 56.00 },
2383 { 'L', 93.00 },
2384 { 'M', 25.00 },
2385 { 'N', 45.00 },
2386 { 'P', 49.00 },
2387 { 'Q', 39.00 },
2388 { 'R', 57.00 },
2389 { 'S', 68.00 },
2390 { 'T', 58.00 },
2391 { 'V', 67.00 },
2392 { 'W', 13.00 },
2393 { 'Y', 32.00 }
2394 };
2395 #endif
2396
2397 #if STD_AMINO_ACID_FREQS == Robinson_prob
2398 /* amino acid background frequencies from Robinson and Robinson */
2399 static BLAST_LetterProb Robinson_prob[] = {
2400 { 'A', 78.05 },
2401 { 'C', 19.25 },
2402 { 'D', 53.64 },
2403 { 'E', 62.95 },
2404 { 'F', 38.56 },
2405 { 'G', 73.77 },
2406 { 'H', 21.99 },
2407 { 'I', 51.42 },
2408 { 'K', 57.44 },
2409 { 'L', 90.19 },
2410 { 'M', 22.43 },
2411 { 'N', 44.87 },
2412 { 'P', 52.03 },
2413 { 'Q', 42.64 },
2414 { 'R', 51.29 },
2415 { 'S', 71.20 },
2416 { 'T', 58.41 },
2417 { 'V', 64.41 },
2418 { 'W', 13.30 },
2419 { 'Y', 32.16 }
2420 };
2421 #endif
2422
2423 static BLAST_LetterProb nt_prob[] = {
2424 { 'A', 25.00 },
2425 { 'C', 25.00 },
2426 { 'G', 25.00 },
2427 { 'T', 25.00 }
2428 };
2429
2430 /*
2431 Allocates the BLAST_ResFreqPtr and then fills in the frequencies
2432 in the probabilities.
2433 */
2434
2435 BLAST_ResFreqPtr LIBCALL
BlastResFreqNew(BLAST_ScoreBlkPtr sbp)2436 BlastResFreqNew(BLAST_ScoreBlkPtr sbp)
2437 {
2438 BLAST_ResFreqPtr rfp;
2439
2440 if (sbp == NULL)
2441 {
2442 return NULL;
2443 }
2444
2445 rfp = (BLAST_ResFreqPtr) MemNew(sizeof(BLAST_ResFreq));
2446 if (rfp == NULL)
2447 return NULL;
2448
2449 rfp->alphabet_code = sbp->alphabet_code;
2450
2451 rfp->prob0 = (Nlm_FloatHi PNTR) MemNew(sizeof(Nlm_FloatHi) * sbp->alphabet_size);
2452 if (rfp->prob0 == NULL)
2453 {
2454 rfp = BlastResFreqDestruct(rfp);
2455 return rfp;
2456 }
2457 rfp->prob = rfp->prob0 - sbp->alphabet_start;
2458
2459 return rfp;
2460 }
2461
BlastResFreqFree(BLAST_ResFreqPtr rfp)2462 void LIBCALL BlastResFreqFree(BLAST_ResFreqPtr rfp)
2463 {
2464 MemFree(rfp->prob0);
2465 MemFree(rfp);
2466
2467 return;
2468 }
2469
2470 /*
2471 Normalize the frequencies to "norm".
2472 */
2473 Int2 LIBCALL
BlastResFreqNormalize(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp,Nlm_FloatHi norm)2474 BlastResFreqNormalize(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp, Nlm_FloatHi norm)
2475 {
2476 Int2 alphabet_stop, index;
2477 Nlm_FloatHi sum = 0., p;
2478
2479 if (norm == 0.)
2480 return 1;
2481
2482 alphabet_stop = sbp->alphabet_start + sbp->alphabet_size;
2483 for (index=sbp->alphabet_start; index<alphabet_stop; index++)
2484 {
2485 p = rfp->prob[index];
2486 if (p < 0.)
2487 return 1;
2488 sum += p;
2489 }
2490 if (sum <= 0.)
2491 return 0;
2492
2493 for (index=sbp->alphabet_start; index<alphabet_stop; index++)
2494 {
2495 rfp->prob[index] /= sum;
2496 rfp->prob[index] *= norm;
2497 }
2498 return 0;
2499 }
2500
2501 /*
2502 Fills a buffer with the 'standard' alphabet (given by
2503 STD_AMINO_ACID_FREQS[index].ch).
2504
2505 Return value is the number of residues in alphabet.
2506 Negative returns upon error.
2507 */
2508
2509 Int2 LIBCALL
BlastGetStdAlphabet(Uint1 alphabet_code,Uint1Ptr residues,Int4 residues_size)2510 BlastGetStdAlphabet (Uint1 alphabet_code, Uint1Ptr residues, Int4 residues_size)
2511
2512 {
2513 Int2 index;
2514 SeqMapTablePtr smtp=NULL;
2515
2516 if (residues_size < DIM(STD_AMINO_ACID_FREQS))
2517 return -2;
2518
2519 if (alphabet_code != Seq_code_ncbieaa)
2520 {
2521 smtp = SeqMapTableFind(alphabet_code, Seq_code_ncbieaa);
2522 if (smtp == NULL)
2523 {
2524 return -1;
2525 }
2526 }
2527
2528 for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2529 {
2530 if (smtp)
2531 {
2532 residues[index] = SeqMapTableConvert(smtp, STD_AMINO_ACID_FREQS[index].ch);
2533 }
2534 else
2535 {
2536 residues[index] = STD_AMINO_ACID_FREQS[index].ch;
2537 }
2538 }
2539
2540 return index;
2541 }
2542
2543 Int2 LIBCALL
BlastResFreqStdComp(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp)2544 BlastResFreqStdComp(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp)
2545 {
2546 Int2 index, retval;
2547 Uint1Ptr residues;
2548
2549 if (sbp->protein_alphabet == TRUE)
2550 {
2551 residues = (Uint1Ptr) MemNew(DIM(STD_AMINO_ACID_FREQS)*sizeof(Uint1));
2552 retval = BlastGetStdAlphabet(sbp->alphabet_code, residues, DIM(STD_AMINO_ACID_FREQS));
2553 if (retval < 1)
2554 return retval;
2555
2556 for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2557 {
2558 rfp->prob[residues[index]] = STD_AMINO_ACID_FREQS[index].p;
2559 }
2560 residues = MemFree(residues);
2561 }
2562 else
2563 { /* beginning of blastna and ncbi2na are the same. */
2564 /* Only blastna used for nucleotides. */
2565 for (index=0; index<DIM(nt_prob); index++)
2566 {
2567 rfp->prob[index] = nt_prob[index].p;
2568 }
2569 }
2570
2571 BlastResFreqNormalize(sbp, rfp, 1.0);
2572
2573 return 0;
2574 }
2575
2576 CharPtr LIBCALL
BlastRepresentativeResidues(Int2 length)2577 BlastRepresentativeResidues(Int2 length)
2578
2579 {
2580 CharPtr buffer, ptr;
2581 Int2 index, total;
2582 Int4 number;
2583 total=0;
2584
2585 for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2586 {
2587 total += (Int2) STD_AMINO_ACID_FREQS[index].p;
2588 }
2589
2590 buffer = (CharPtr) MemNew((length+1)*sizeof(Char));
2591
2592 ptr = buffer;
2593 for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
2594 {
2595 number = Nint((STD_AMINO_ACID_FREQS[index].p)*((Nlm_FloatHi) length)/((Nlm_FloatHi) total));
2596 while (number > 0)
2597 {
2598 *ptr = STD_AMINO_ACID_FREQS[index].ch;
2599 ptr++;
2600 number--;
2601 }
2602 }
2603
2604 return buffer;
2605 }
2606
2607 Int2 LIBCALL
BlastResFreqString(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp,CharPtr string,Int4 length)2608 BlastResFreqString(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp, CharPtr string, Int4 length)
2609 {
2610 BLAST_ResCompPtr rcp;
2611
2612 rcp = BlastResCompNew(sbp);
2613
2614 BlastResCompStr(sbp, rcp, string, length);
2615
2616 BlastResFreqResComp(sbp, rfp, rcp);
2617
2618 rcp = BlastResCompDestruct(rcp);
2619
2620 return 0;
2621 }
2622
2623 BLAST_ResFreqPtr LIBCALL
BlastResFreqDestruct(BLAST_ResFreqPtr rfp)2624 BlastResFreqDestruct(BLAST_ResFreqPtr rfp)
2625 {
2626 if (rfp == NULL)
2627 return NULL;
2628
2629 if (rfp->prob0 != NULL)
2630 MemFree(rfp->prob0);
2631
2632 rfp = MemFree(rfp);
2633
2634 return rfp;
2635 }
2636
2637 /*
2638 Calculate the residue frequencies associated with the provided ResComp
2639 */
2640 Int2 LIBCALL
BlastResFreqResComp(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp,BLAST_ResCompPtr rcp)2641 BlastResFreqResComp(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp, BLAST_ResCompPtr rcp)
2642 {
2643 Int2 alphabet_max, index;
2644 Nlm_FloatHi sum = 0.;
2645
2646 if (rfp == NULL || rcp == NULL)
2647 return 1;
2648
2649 if (rfp->alphabet_code != rcp->alphabet_code)
2650 return 1;
2651
2652 alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
2653 for (index=sbp->alphabet_start; index<alphabet_max; index++)
2654 sum += rcp->comp[index];
2655
2656 if (sum == 0.) {
2657 BlastResFreqClr(sbp, rfp);
2658 return 0;
2659 }
2660
2661 for (index=sbp->alphabet_start; index<alphabet_max; index++)
2662 rfp->prob[index] = rcp->comp[index] / sum;
2663
2664 return 0;
2665 }
2666
2667 Int2 LIBCALL
BlastResFreqClr(BLAST_ScoreBlkPtr sbp,BLAST_ResFreqPtr rfp)2668 BlastResFreqClr(BLAST_ScoreBlkPtr sbp, BLAST_ResFreqPtr rfp)
2669 {
2670 Int2 alphabet_max, index;
2671
2672 alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
2673 for (index=sbp->alphabet_start; index<alphabet_max; index++)
2674 rfp->prob[index] = 0.0;
2675
2676 return 0;
2677 }
2678
2679 /** Supported substitution and gap costs with corresponding quality values
2680 * for nucleotide sequence comparisons.
2681 * NB: the values 0 and 0 for the gap costs are treated as the defaults used for
2682 * the greedy gapped extension, i.e.
2683 * gap opening = 0,
2684 * gap extension = 1/2 match - mismatch.
2685 *
2686 * The fields are:
2687 *
2688 * 1. Gap opening cost,
2689 * 2. Gap extension cost,
2690 * 3. Lambda,
2691 * 4. K,
2692 * 5. H,
2693 * 6. Alpha,
2694 * 7. Beta,
2695 * 8. Theta
2696 */
2697
2698 /** Karlin-Altschul parameter values for substitution scores 1 and -5. */
2699 static const array_of_8 blastn_values_1_5[] = {
2700 { 0, 0, 1.39, 0.747, 1.38, 1.00, 0, 100 },
2701 { 3, 3, 1.39, 0.747, 1.38, 1.00, 0, 100 }
2702 };
2703
2704 /** Karlin-Altschul parameter values for substitution scores 1 and -4. */
2705 static const array_of_8 blastn_values_1_4[] = {
2706 { 0, 0, 1.383, 0.738, 1.36, 1.02, 0, 100 },
2707 { 1, 2, 1.36, 0.67, 1.2, 1.1, 0, 98 },
2708 { 0, 2, 1.26, 0.43, 0.90, 1.4, -1, 91 },
2709 { 2, 1, 1.35, 0.61, 1.1, 1.2, -1, 98 },
2710 { 1, 1, 1.22, 0.35, 0.72, 1.7, -3, 88 }
2711 };
2712
2713 /** Karlin-Altschul parameter values for substitution scores 2 and -7.
2714 * These parameters can only be applied to even scores. Any odd score must be
2715 * rounded down to the nearest even number before calculating the e-value.
2716 */
2717 static const array_of_8 blastn_values_2_7[] = {
2718 { 0, 0, 0.69, 0.73, 1.34, 0.515, 0, 100 },
2719 { 2, 4, 0.68, 0.67, 1.2, 0.55, 0, 99 },
2720 { 0, 4, 0.63, 0.43, 0.90, 0.7, -1, 91 },
2721 { 4, 2, 0.675, 0.62, 1.1, 0.6, -1, 98 },
2722 { 2, 2, 0.61, 0.35, 0.72, 1.7, -3, 88 }
2723 };
2724
2725 /** Karlin-Altschul parameter values for substitution scores 1 and -3. */
2726 static const array_of_8 blastn_values_1_3[] = {
2727 { 0, 0, 1.374, 0.711, 1.31, 1.05, 0, 100 },
2728 { 2, 2, 1.37, 0.70, 1.2, 1.1, 0, 99 },
2729 { 1, 2, 1.35, 0.64, 1.1, 1.2, -1, 98 },
2730 { 0, 2, 1.25, 0.42, 0.83, 1.5, -2, 91 },
2731 { 2, 1, 1.34, 0.60, 1.1, 1.2, -1, 97 },
2732 { 1, 1, 1.21, 0.34, 0.71, 1.7, -2, 88 }
2733 };
2734
2735 /** Karlin-Altschul parameter values for substitution scores 2 and -5.
2736 * These parameters can only be applied to even scores. Any odd score must be
2737 * rounded down to the nearest even number before calculating the e-value.
2738 */
2739 static const array_of_8 blastn_values_2_5[] = {
2740 { 0, 0, 0.675, 0.65, 1.1, 0.6, -1, 99 },
2741 { 2, 4, 0.67, 0.59, 1.1, 0.6, -1, 98 },
2742 { 0, 4, 0.62, 0.39, 0.78, 0.8, -2, 91 },
2743 { 4, 2, 0.67, 0.61, 1.0, 0.65, -2, 98 },
2744 { 2, 2, 0.56, 0.32, 0.59, 0.95, -4, 82 }
2745 };
2746
2747 /** Karlin-Altschul parameter values for substitution scores 1 and -2. */
2748 static const array_of_8 blastn_values_1_2[] = {
2749 { 0, 0, 1.28, 0.46, 0.85, 1.5, -2, 96 },
2750 { 2, 2, 1.33, 0.62, 1.1, 1.2, 0, 99 },
2751 { 1, 2, 1.30, 0.52, 0.93, 1.4, -2, 97 },
2752 { 0, 2, 1.19, 0.34, 0.66, 1.8, -3, 89 },
2753 { 3, 1, 1.32, 0.57, 1.0, 1.3, -1, 99 },
2754 { 2, 1, 1.29, 0.49, 0.92, 1.4, -1, 96 },
2755 { 1, 1, 1.14, 0.26, 0.52, 2.2, -5, 85 }
2756 };
2757
2758 /** Karlin-Altschul parameter values for substitution scores 2 and -3.
2759 * These parameters can only be applied to even scores. Any odd score must be
2760 * rounded down to the nearest even number before calculating the e-value.
2761 */
2762 static const array_of_8 blastn_values_2_3[] = {
2763 { 0, 0, 0.55, 0.21, 0.46, 1.2, -5, 87 },
2764 { 4, 4, 0.63, 0.42, 0.84, 0.75, -2, 99 },
2765 { 2, 4, 0.615, 0.37, 0.72, 0.85, -3, 97 },
2766 { 0, 4, 0.55, 0.21, 0.46, 1.2, -5, 87 },
2767 { 3, 3, 0.615, 0.37, 0.68, 0.9, -3, 97 },
2768 { 6, 2, 0.63, 0.42, 0.84, 0.75, -2, 99 },
2769 { 5, 2, 0.625, 0.41, 0.78, 0.8, -2, 99 },
2770 { 4, 2, 0.61, 0.35, 0.68, 0.9, -3, 96 },
2771 { 2, 2, 0.515, 0.14, 0.33, 1.55, -9, 81 }
2772 };
2773
2774 /** Karlin-Altschul parameter values for substitution scores 3 and -4. */
2775 static const array_of_8 blastn_values_3_4[] = {
2776 { 6, 3, 0.389, 0.25, 0.56, 0.7, -5, 95},
2777 { 5, 3, 0.375, 0.21, 0.47, 0.8, -6, 92},
2778 { 4, 3, 0.351, 0.14, 0.35, 1.0, -9, 86},
2779 { 6, 2, 0.362, 0.16, 0.45, 0.8, -4, 88},
2780 { 5, 2, 0.330, 0.092, 0.28, 1.2, -13, 81},
2781 { 4, 2, 0.281, 0.046, 0.16, 1.8, -23, 69}
2782 };
2783
2784 /** Karlin-Altschul parameter values for substitution scores 4 and -5. */
2785 static const array_of_8 blastn_values_4_5[] = {
2786 { 0, 0, 0.22, 0.061, 0.22, 1.0, -15, 74 },
2787 { 6, 5, 0.28, 0.21, 0.47, 0.6 , -7, 93 },
2788 { 5, 5, 0.27, 0.17, 0.39, 0.7, -9, 90 },
2789 { 4, 5, 0.25, 0.10, 0.31, 0.8, -10, 83 },
2790 { 3, 5, 0.23, 0.065, 0.25, 0.9, -11, 76 }
2791 };
2792
2793 /** Karlin-Altschul parameter values for substitution scores 1 and -1. */
2794 static const array_of_8 blastn_values_1_1[] = {
2795 { 3, 2, 1.09, 0.31, 0.55, 2.0, -2, 99 },
2796 { 2, 2, 1.07, 0.27, 0.49, 2.2, -3, 97 },
2797 { 1, 2, 1.02, 0.21, 0.36, 2.8, -6, 92 },
2798 { 0, 2, 0.80, 0.064, 0.17, 4.8, -16, 72 },
2799 { 4, 1, 1.08, 0.28, 0.54, 2.0, -2, 98 },
2800 { 3, 1, 1.06, 0.25, 0.46, 2.3, -4, 96 },
2801 { 2, 1, 0.99, 0.17, 0.30, 3.3, -10, 90 }
2802 };
2803
2804 /** Karlin-Altschul parameter values for substitution scores 3 and -2. */
2805 static const array_of_8 blastn_values_3_2[] = {
2806 { 5, 5, 0.208, 0.030, 0.072, 2.9, -47, 77}
2807 };
2808
2809 /** Karlin-Altschul parameter values for substitution scores 5 and -4. */
2810 static const array_of_8 blastn_values_5_4[] = {
2811 { 10, 6, 0.163, 0.068, 0.16, 1.0, -19, 85 },
2812 { 8, 6, 0.146, 0.039, 0.11, 1.3, -29, 76 }
2813 };
2814
2815 static Int2
s_SplitArrayOf8(const array_of_8 * input,const array_of_8 ** normal,const array_of_8 ** non_affine,Boolean * split)2816 s_SplitArrayOf8(const array_of_8* input, const array_of_8** normal, const array_of_8** non_affine, Boolean *split)
2817 {
2818
2819 if (input == NULL || normal == NULL || non_affine == NULL)
2820 return -1;
2821
2822 *normal = NULL;
2823 *non_affine = NULL;
2824
2825 if (input[0][0] == 0 && input[0][1] == 0)
2826 {
2827 *normal = input+1;
2828 *non_affine = input;
2829 *split = TRUE;
2830 }
2831 else
2832 {
2833 *normal = input;
2834 *split = FALSE;
2835 }
2836 return 0;
2837
2838 }
2839
2840 /** Returns the array of values corresponding to the given match/mismatch
2841 * scores, the number of supported gap cost combinations and thresholds for
2842 * the gap costs, beyond which the ungapped statistics can be applied.
2843 * @param reward Match reward score [in]
2844 * @param penalty Mismatch penalty score [in]
2845 * @param array_size Number of supported combinations for this match/mismatch
2846 * pair [out]
2847 * @param normal the values for normal (e.g, "affine") gap costs [out]
2848 * @param non_affine specialized values used for megablast [out]
2849 * @param gap_open_max Gap opening cost threshold for infinite gap costs [out]
2850 * @param gap_extend_max Gap extension cost threshold for infinite gap costs [out]
2851 * @param round_down if set to TRUE only even scores should be used for calculation
2852 * of expect value or bit scores [out]
2853 * @param error_return Pointer to error message [out]
2854 * @return zero on success, other values if error
2855 */
2856 static Int2
s_GetNuclValuesArray(Int4 reward,Int4 penalty,Int4 * array_size,const array_of_8 ** normal,const array_of_8 ** non_affine,Int4 * gap_open_max,Int4 * gap_extend_max,Boolean * round_down,ValNodePtr * error_return)2857 s_GetNuclValuesArray(Int4 reward, Int4 penalty, Int4* array_size,
2858 const array_of_8** normal, const array_of_8** non_affine,
2859 Int4* gap_open_max, Int4* gap_extend_max, Boolean* round_down,
2860 ValNodePtr* error_return)
2861 {
2862 Int2 status = 0;
2863 const array_of_8 * kValues = NULL;
2864 const array_of_8 * kValues_non_affine = NULL;
2865 Boolean split = FALSE;
2866
2867 *round_down = FALSE;
2868
2869 *array_size = 0;
2870
2871 if (reward == 1 && penalty == -5) {
2872 if ((status=s_SplitArrayOf8(blastn_values_1_5, &kValues, &kValues_non_affine, &split)))
2873 return status;
2874
2875 *array_size = sizeof(blastn_values_1_5)/sizeof(array_of_8);
2876 *gap_open_max = 3;
2877 *gap_extend_max = 3;
2878 } else if (reward == 1 && penalty == -4) {
2879 if ((status=s_SplitArrayOf8(blastn_values_1_4, &kValues, &kValues_non_affine, &split)))
2880 return status;
2881
2882 *array_size = sizeof(blastn_values_1_4)/sizeof(array_of_8);
2883 *gap_open_max = 2;
2884 *gap_extend_max = 2;
2885 } else if (reward == 2 && penalty == -7) {
2886 if ((status=s_SplitArrayOf8(blastn_values_2_7, &kValues, &kValues_non_affine, &split)))
2887 return status;
2888
2889 *round_down = TRUE;
2890 *array_size = sizeof(blastn_values_2_7)/sizeof(array_of_8);
2891 *gap_open_max = 4;
2892 *gap_extend_max = 4;
2893 } else if (reward == 1 && penalty == -3) {
2894 if ((status=s_SplitArrayOf8(blastn_values_1_3, &kValues, &kValues_non_affine, &split)))
2895 return status;
2896
2897 *array_size = sizeof(blastn_values_1_3)/sizeof(array_of_8);
2898 *gap_open_max = 2;
2899 *gap_extend_max = 2;
2900 } else if (reward == 2 && penalty == -5) {
2901 if ((status=s_SplitArrayOf8(blastn_values_2_5, &kValues, &kValues_non_affine, &split)))
2902 return status;
2903
2904 *round_down = TRUE;
2905 *array_size = sizeof(blastn_values_2_5)/sizeof(array_of_8);
2906 *gap_open_max = 4;
2907 *gap_extend_max = 4;
2908 } else if (reward == 1 && penalty == -2) {
2909 if ((status=s_SplitArrayOf8(blastn_values_1_2, &kValues, &kValues_non_affine, &split)))
2910 return status;
2911
2912 *array_size = sizeof(blastn_values_1_2)/sizeof(array_of_8);
2913 *gap_open_max = 2;
2914 *gap_extend_max = 2;
2915 } else if (reward == 2 && penalty == -3) {
2916 if ((status=s_SplitArrayOf8(blastn_values_2_3, &kValues, &kValues_non_affine, &split)))
2917 return status;
2918
2919 *round_down = TRUE;
2920 *array_size = sizeof(blastn_values_2_3)/sizeof(array_of_8);
2921 *gap_open_max = 6;
2922 *gap_extend_max = 4;
2923 } else if (reward == 3 && penalty == -4) {
2924 if ((status=s_SplitArrayOf8(blastn_values_3_4, &kValues, &kValues_non_affine, &split)))
2925 return status;
2926
2927 *round_down = TRUE;
2928 *array_size = sizeof(blastn_values_3_4)/sizeof(array_of_8);
2929 *gap_open_max = 6;
2930 *gap_extend_max = 3;
2931 } else if (reward == 1 && penalty == -1) {
2932 if ((status=s_SplitArrayOf8(blastn_values_1_1, &kValues, &kValues_non_affine, &split)))
2933 return status;
2934
2935 *array_size = sizeof(blastn_values_1_1)/sizeof(array_of_8);
2936 *gap_open_max = 4;
2937 *gap_extend_max = 2;
2938 } else if (reward == 3 && penalty == -2) {
2939 if ((status=s_SplitArrayOf8(blastn_values_3_2, &kValues, &kValues_non_affine, &split)))
2940 return status;
2941
2942 *array_size = sizeof(blastn_values_3_2)/sizeof(array_of_8);
2943 *gap_open_max = 5;
2944 *gap_extend_max = 5;
2945 } else if (reward == 4 && penalty == -5) {
2946 if ((status=s_SplitArrayOf8(blastn_values_4_5, &kValues, &kValues_non_affine, &split)))
2947 return status;
2948
2949 *array_size = sizeof(blastn_values_4_5)/sizeof(array_of_8);
2950 *gap_open_max = 12;
2951 *gap_extend_max = 8;
2952 } else if (reward == 5 && penalty == -4) {
2953 if ((status=s_SplitArrayOf8(blastn_values_5_4, &kValues, &kValues_non_affine, &split)))
2954 return status;
2955
2956 *array_size = sizeof(blastn_values_5_4)/sizeof(array_of_8);
2957 *gap_open_max = 25;
2958 *gap_extend_max = 10;
2959 } else { /* Unsupported reward-penalty */
2960 status = -1;
2961 if (error_return) {
2962 char buffer[256];
2963 sprintf(buffer, "Substitution scores %d and %d are not supported",
2964 reward, penalty);
2965 BlastConstructErrorMessage("s_GetNuclValuesArray", buffer, 2, error_return);
2966 }
2967 }
2968 if (split)
2969 (*array_size)--;
2970
2971 *normal = kValues;
2972 *non_affine = kValues_non_affine;
2973
2974 return status;
2975 }
2976
2977
2978 /** Returns the beta statistical parameter value, given the nucleotide
2979 * substitution scores.
2980 * @param reward Match reward score [in]
2981 * @param penalty Mismatch penalty score [in]
2982 * @return The value of the beta parameter.
2983 */
s_GetUngappedBeta(Int4 reward,Int4 penalty)2984 static double s_GetUngappedBeta(Int4 reward, Int4 penalty)
2985 {
2986 double beta = 0;
2987 if ((reward == 1 && penalty == -1) ||
2988 (reward == 2 && penalty == -3))
2989 beta = -2;
2990
2991 return beta;
2992 }
2993
BlastKarlinGetNuclAlphaBeta(Int4 reward,Int4 penalty,Int4 gap_open,Int4 gap_extend,BLAST_KarlinBlkPtr kbp,Boolean gapped_calculation,double * alpha,double * beta)2994 Int2 BlastKarlinGetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open,
2995 Int4 gap_extend, BLAST_KarlinBlkPtr kbp,
2996 Boolean gapped_calculation,
2997 double *alpha, double *beta)
2998 {
2999 const int kGapOpenIndex = 0;
3000 const int kGapExtIndex = 1;
3001 const int kAlphaIndex = 5;
3002 const int kBetaIndex = 6;
3003 Int4 num_combinations = 0;
3004 Int4 gap_open_max = 0, gap_extend_max = 0;
3005 Int4 index = 0;
3006 const array_of_8* kNormal=NULL;
3007 const array_of_8* kNonAffine=NULL;
3008 Boolean round_down = FALSE;
3009 Int2 status = s_GetNuclValuesArray(reward,
3010 penalty,
3011 &num_combinations,
3012 &kNormal,
3013 &kNonAffine,
3014 &gap_open_max,
3015 &gap_extend_max,
3016 &round_down,
3017 NULL);
3018
3019 if (status)
3020 return status;
3021
3022 ASSERT(alpha && beta && kbp);
3023
3024 /* For ungapped search return ungapped values of alpha and beta. */
3025 if (gapped_calculation && kNormal) {
3026 if (gap_open == 0 && gap_extend == 0 && kNonAffine)
3027 {
3028 *alpha = kNonAffine[0][kAlphaIndex];
3029 *beta = kNonAffine[0][kBetaIndex];
3030 return 0;
3031 }
3032 for (index = 0; index < num_combinations; ++index) {
3033 if (kNormal[index][kGapOpenIndex] == gap_open &&
3034 kNormal[index][kGapExtIndex] == gap_extend) {
3035 *alpha = kNormal[index][kAlphaIndex];
3036 *beta = kNormal[index][kBetaIndex];
3037 return 0;
3038 }
3039 }
3040 }
3041
3042 /* If input values not found in tables, or if this is an ungapped search,
3043 return the ungapped values of alpha and beta. */
3044 *alpha = kbp->Lambda/kbp->H;
3045 *beta = s_GetUngappedBeta(reward, penalty);
3046
3047 return 0;
3048 }
3049
3050 /** Copies data from in to out. Both in and out should be
3051 * allocated.
3052 * @param in object to be copied [in]
3053 * @param out object to be copied to [in|out]
3054 */
3055 static void
s_KarlinBlkCopy(BLAST_KarlinBlk * in,BLAST_KarlinBlk * out)3056 s_KarlinBlkCopy(BLAST_KarlinBlk* in, BLAST_KarlinBlk* out)
3057 {
3058 ASSERT(in && out);
3059
3060 out->Lambda = in->Lambda;
3061 out->K = in->K;
3062 out->logK = in->logK;
3063 out->H = in->H;
3064 out->paramC = in->paramC;
3065
3066 return;
3067 }
3068
3069 Int2
BlastKarlinBlkNuclGappedCalc(BLAST_KarlinBlk * kbp,Int4 gap_open,Int4 gap_extend,Int4 reward,Int4 penalty,BLAST_KarlinBlk * kbp_ungap,Boolean * round_down,ValNodePtr * error_return)3070 BlastKarlinBlkNuclGappedCalc(BLAST_KarlinBlk* kbp, Int4 gap_open,
3071 Int4 gap_extend, Int4 reward, Int4 penalty,
3072 BLAST_KarlinBlk* kbp_ungap,
3073 Boolean* round_down,
3074 ValNodePtr* error_return)
3075 {
3076 const int kGapOpenIndex = 0;
3077 const int kGapExtIndex = 1;
3078 const int kLambdaIndex = 2;
3079 const int kKIndex = 3;
3080 const int kHIndex = 4;
3081 int num_combinations = 0;
3082 int gap_open_max, gap_extend_max;
3083 const array_of_8* kNormal=NULL;
3084 const array_of_8* kNonAffine=NULL;
3085 Int2 status = s_GetNuclValuesArray(reward,
3086 penalty,
3087 &num_combinations,
3088 &kNormal,
3089 &kNonAffine,
3090 &gap_open_max,
3091 &gap_extend_max,
3092 round_down,
3093 error_return);
3094
3095 if (status)
3096 return status;
3097
3098 ASSERT(kbp && kbp_ungap);
3099
3100
3101 /* Try to find the table entry corresponding to input gap costs. */
3102 if (gap_open == 0 && gap_extend == 0 && kNonAffine)
3103 {
3104 kbp->Lambda = kNonAffine[0][kLambdaIndex];
3105 kbp->K = kNonAffine[0][kKIndex];
3106 kbp->logK = log(kbp->K);
3107 kbp->H = kNonAffine[0][kHIndex];
3108 }
3109 else
3110 {
3111 int index=0;
3112 for (index = 0; index < num_combinations; ++index) {
3113 if (kNormal[index][kGapOpenIndex] == gap_open &&
3114 kNormal[index][kGapExtIndex] == gap_extend) {
3115 kbp->Lambda = kNormal[index][kLambdaIndex];
3116 kbp->K = kNormal[index][kKIndex];
3117 kbp->logK = log(kbp->K);
3118 kbp->H = kNormal[index][kHIndex];
3119 break;
3120 }
3121 }
3122
3123 /* If gap costs are not found in the table, check if they belong to the
3124 infinite domain, where ungapped values of the parameters can be used. */
3125 if (index == num_combinations) {
3126 /* If gap costs are larger than maximal provided in tables, copy
3127 the values from the ungapped Karlin block. */
3128 if (gap_open >= gap_open_max && gap_extend >= gap_extend_max) {
3129 s_KarlinBlkCopy(kbp_ungap, kbp);
3130 } else if (error_return) {
3131 char buffer[8192];
3132 int i=0;
3133 int len=0;
3134 /* Unsupported gap costs combination. */
3135 sprintf(buffer, "Gap existence and extension values %ld and %ld "
3136 "are not supported for substitution scores %ld and %ld\n",
3137 (long) gap_open, (long) gap_extend, (long) reward, (long) penalty);
3138 for (i = 0; i < num_combinations; ++i)
3139 {
3140 len = strlen(buffer);
3141 sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
3142 (long) kNormal[i][kGapOpenIndex], (long) kNormal[i][kGapExtIndex]);
3143 }
3144 len = strlen(buffer);
3145 sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
3146 (long) gap_open_max, (long) gap_extend_max);
3147 len = strlen(buffer);
3148 sprintf(buffer+len, "Any values more stringent than %ld and %ld are supported\n",
3149 (long) gap_open_max, (long) gap_extend_max);
3150 BlastConstructErrorMessage("BlastKarlinBlkNuclGappedCalc", buffer, 2, error_return);
3151 return 1;
3152 }
3153 }
3154 }
3155
3156 return 0;
3157 }
3158
3159
3160 /*
3161 Deallocates MatrixInfoPtr
3162 */
3163
3164 static MatrixInfoPtr
MatrixInfoDestruct(MatrixInfoPtr matrix_info)3165 MatrixInfoDestruct(MatrixInfoPtr matrix_info)
3166
3167 {
3168 if (matrix_info == NULL)
3169 return NULL;
3170
3171 MemFree(matrix_info->name);
3172 return MemFree(matrix_info);
3173 }
3174
3175 /*
3176 Makes New MatrixInfoPtr
3177 */
3178
3179 static MatrixInfoPtr
MatrixInfoNew(CharPtr name,array_of_8 * values,Int4Ptr prefs,Int4 max_number)3180 MatrixInfoNew(CharPtr name, array_of_8 *values, Int4Ptr prefs, Int4 max_number)
3181
3182 {
3183 MatrixInfoPtr matrix_info;
3184
3185 matrix_info = (MatrixInfoPtr) MemNew(sizeof(MatrixInfo));
3186 matrix_info->name = StringSave(name);
3187 matrix_info->values = values;
3188 matrix_info->prefs = prefs;
3189 matrix_info->max_number_values = max_number;
3190
3191 return matrix_info;
3192 }
3193
3194 static ValNodePtr
BlastMatrixValuesDestruct(ValNodePtr vnp)3195 BlastMatrixValuesDestruct(ValNodePtr vnp)
3196
3197 {
3198 ValNodePtr head;
3199
3200 head = vnp;
3201 while (vnp)
3202 {
3203 MatrixInfoDestruct((MatrixInfoPtr) vnp->data.ptrvalue);
3204 vnp = vnp->next;
3205 }
3206
3207 head = ValNodeFree(head);
3208
3209 return head;
3210 }
3211
3212 /*
3213 ValNodePtr BlastLoadMatrixValues (void)
3214
3215 Loads all the matrix values, returns a ValNodePtr chain that contains
3216 MatrixInfoPtr's.
3217
3218 */
3219 static ValNodePtr
BlastLoadMatrixValues(void)3220 BlastLoadMatrixValues (void)
3221
3222 {
3223 MatrixInfoPtr matrix_info;
3224 ValNodePtr retval=NULL;
3225
3226 matrix_info = MatrixInfoNew("BLOSUM80", blosum80_values, blosum80_prefs, BLOSUM80_VALUES_MAX);
3227 ValNodeAddPointer(&retval, 0, matrix_info);
3228
3229 matrix_info = MatrixInfoNew("BLOSUM62", blosum62_values, blosum62_prefs, BLOSUM62_VALUES_MAX);
3230 ValNodeAddPointer(&retval, 0, matrix_info);
3231
3232 matrix_info = MatrixInfoNew("BLOSUM50", blosum50_values, blosum50_prefs, BLOSUM50_VALUES_MAX);
3233 ValNodeAddPointer(&retval, 0, matrix_info);
3234
3235 matrix_info = MatrixInfoNew("BLOSUM45", blosum45_values, blosum45_prefs, BLOSUM45_VALUES_MAX);
3236 ValNodeAddPointer(&retval, 0, matrix_info);
3237
3238 matrix_info = MatrixInfoNew("PAM250", pam250_values, pam250_prefs, PAM250_VALUES_MAX);
3239 ValNodeAddPointer(&retval, 0, matrix_info);
3240
3241 /*
3242 matrix_info = MatrixInfoNew("BLOSUM62_20", blosum62_20_values, blosum62_20_prefs, BLOSUM62_20_VALUES_MAX);
3243 ValNodeAddPointer(&retval, 0, matrix_info);
3244 */
3245
3246 matrix_info = MatrixInfoNew("BLOSUM90", blosum90_values, blosum90_prefs, BLOSUM90_VALUES_MAX);
3247 ValNodeAddPointer(&retval, 0, matrix_info);
3248
3249 matrix_info = MatrixInfoNew("PAM30", pam30_values, pam30_prefs, PAM30_VALUES_MAX);
3250 ValNodeAddPointer(&retval, 0, matrix_info);
3251
3252 matrix_info = MatrixInfoNew("PAM70", pam70_values, pam70_prefs, PAM70_VALUES_MAX);
3253 ValNodeAddPointer(&retval, 0, matrix_info);
3254
3255 return retval;
3256 }
3257 /*
3258 Int2 LIBCALL
3259 BlastKarlinGetMatrixValues(CharPtr matrix, Int4Ptr open, Int4Ptr extension, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H)
3260
3261 Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
3262 the given matrix. Also obtains arrays of Lambda, K, and H. Any of these fields that
3263 are not required should be set to NULL. The Int2 return value is the length of the
3264 arrays.
3265 */
3266
3267 Int2 LIBCALL
BlastKarlinGetMatrixValues(CharPtr matrix,Int4Ptr PNTR open,Int4Ptr PNTR extension,FloatHiPtr PNTR lambda,FloatHiPtr PNTR K,FloatHiPtr PNTR H,Int4Ptr PNTR pref_flags)3268 BlastKarlinGetMatrixValues(CharPtr matrix, Int4Ptr PNTR open, Int4Ptr PNTR extension, FloatHiPtr PNTR lambda, FloatHiPtr PNTR K, FloatHiPtr PNTR H, Int4Ptr PNTR pref_flags)
3269
3270 {
3271 return BlastKarlinGetMatrixValuesEx2(matrix, open, extension, NULL, lambda, K, H, NULL, NULL, pref_flags);
3272
3273 }
3274
3275 /*
3276 Int2 LIBCALL
3277 BlastKarlinGetMatrixValuesEx(CharPtr matrix, Int4Ptr open, Int4Ptr extension, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H)
3278
3279 Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
3280 the given matrix. Also obtains arrays of Lambda, K, and H. Any of these fields that
3281 are not required should be set to NULL. The Int2 return value is the length of the
3282 arrays.
3283 */
3284
3285 Int2 LIBCALL
BlastKarlinGetMatrixValuesEx(CharPtr matrix,Int4Ptr PNTR open,Int4Ptr PNTR extension,Int4Ptr PNTR decline_align,FloatHiPtr PNTR lambda,FloatHiPtr PNTR K,FloatHiPtr PNTR H,Int4Ptr PNTR pref_flags)3286 BlastKarlinGetMatrixValuesEx(CharPtr matrix, Int4Ptr PNTR open, Int4Ptr PNTR extension, Int4Ptr PNTR decline_align, FloatHiPtr PNTR lambda, FloatHiPtr PNTR K, FloatHiPtr PNTR H, Int4Ptr PNTR pref_flags)
3287
3288 {
3289 return BlastKarlinGetMatrixValuesEx2(matrix, open, extension, decline_align, lambda, K, H, NULL, NULL, pref_flags);
3290
3291 }
3292
3293 /*
3294 Int2 LIBCALL
3295 BlastKarlinGetMatrixValuesEx2(CharPtr matrix, Int4Ptr open, Int4Ptr extension, Int4Ptr decline_align, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H)
3296
3297 Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
3298 the given matrix. Also obtains arrays of Lambda, K, and H. Any of these fields that
3299 are not required should be set to NULL. The Int2 return value is the length of the
3300 arrays.
3301 */
3302
3303 Int2 LIBCALL
BlastKarlinGetMatrixValuesEx2(CharPtr matrix,Int4Ptr PNTR open,Int4Ptr PNTR extension,Int4Ptr PNTR decline_align,FloatHiPtr PNTR lambda,FloatHiPtr PNTR K,FloatHiPtr PNTR H,FloatHiPtr PNTR alpha,FloatHiPtr PNTR beta,Int4Ptr PNTR pref_flags)3304 BlastKarlinGetMatrixValuesEx2(CharPtr matrix, Int4Ptr PNTR open, Int4Ptr PNTR extension, Int4Ptr PNTR decline_align, FloatHiPtr PNTR lambda, FloatHiPtr PNTR K, FloatHiPtr PNTR H, FloatHiPtr PNTR alpha, FloatHiPtr PNTR beta, Int4Ptr PNTR pref_flags)
3305
3306 {
3307 array_of_8 *values;
3308 Boolean found_matrix=FALSE;
3309 Int4 index, max_number_values=0;
3310 Int4Ptr open_array=NULL, extension_array=NULL, decline_align_array=NULL, pref_flags_array=NULL, prefs;
3311 Nlm_FloatHiPtr lambda_array=NULL, K_array=NULL, H_array=NULL, alpha_array=NULL, beta_array=NULL;
3312 MatrixInfoPtr matrix_info;
3313 ValNodePtr vnp, head;
3314
3315 if (matrix == NULL)
3316 return 0;
3317
3318 vnp = head = BlastLoadMatrixValues();
3319
3320 while (vnp)
3321 {
3322 matrix_info = vnp->data.ptrvalue;
3323 if (StringICmp(matrix_info->name, matrix) == 0)
3324 {
3325 values = matrix_info->values;
3326 max_number_values = matrix_info->max_number_values;
3327 prefs = matrix_info->prefs;
3328 found_matrix = TRUE;
3329 break;
3330 }
3331 vnp = vnp->next;
3332 }
3333
3334 if (found_matrix)
3335 {
3336 if (open)
3337 *open = open_array = MemNew(max_number_values*sizeof(Int4));
3338 if (extension)
3339 *extension = extension_array = MemNew(max_number_values*sizeof(Int4));
3340 if (decline_align)
3341 *decline_align = decline_align_array = MemNew(max_number_values*sizeof(Int4));
3342 if (lambda)
3343 *lambda = lambda_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3344 if (K)
3345 *K = K_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3346 if (H)
3347 *H = H_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3348 if (alpha)
3349 *alpha = alpha_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3350 if (beta)
3351 *beta = beta_array = (FloatHiPtr) MemNew(max_number_values*sizeof(FloatHi));
3352 if (pref_flags)
3353 *pref_flags = pref_flags_array = MemNew(max_number_values*sizeof(Int4));
3354
3355 for (index=0; index<max_number_values; index++)
3356 {
3357 if (open)
3358 open_array[index] = values[index][0];
3359 if (extension)
3360 extension_array[index] = values[index][1];
3361 if (decline_align)
3362 decline_align_array[index] = values[index][2];
3363 if (lambda)
3364 lambda_array[index] = values[index][3];
3365 if (K)
3366 K_array[index] = values[index][4];
3367 if (H)
3368 H_array[index] = values[index][5];
3369 if (alpha)
3370 alpha_array[index] = values[index][6];
3371 if (beta)
3372 beta_array[index] = values[index][7];
3373 if (pref_flags)
3374 pref_flags_array[index] = prefs[index];
3375 }
3376 }
3377
3378 BlastMatrixValuesDestruct(head);
3379
3380 return max_number_values;
3381 }
3382
3383 /*Extract the alpha and beta settings for this matrixName, and these
3384 gap open and gap extension costs*/
getAlphaBeta(CharPtr matrixName,Nlm_FloatHi * alpha,Nlm_FloatHi * beta,Boolean gapped,Int4 gap_open,Int4 gap_extend)3385 void LIBCALL getAlphaBeta(CharPtr matrixName, Nlm_FloatHi *alpha,
3386 Nlm_FloatHi *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend)
3387 {
3388 Int4Ptr gapOpen_arr, gapExtend_arr, pref_flags;
3389 FloatHiPtr alpha_arr, beta_arr;
3390 Int2 num_values;
3391 Int4 i; /*loop index*/
3392
3393 num_values = BlastKarlinGetMatrixValuesEx2(matrixName, &gapOpen_arr,
3394 &gapExtend_arr, NULL, NULL, NULL, NULL, &alpha_arr, &beta_arr,
3395 &pref_flags);
3396
3397 if (gapped) {
3398 if ((0 == gap_open) && (0 == gap_extend)) {
3399 for(i = 1; i < num_values; i++) {
3400 if(pref_flags[i]==BLAST_MATRIX_BEST) {
3401 (*alpha) = alpha_arr[i];
3402 (*beta) = beta_arr[i];
3403 break;
3404 }
3405 }
3406 }
3407 else {
3408 for(i = 1; i < num_values; i++) {
3409 if ((gapOpen_arr[i] == gap_open) &&
3410 (gapExtend_arr[i] == gap_extend)) {
3411 (*alpha) = alpha_arr[i];
3412 (*beta) = beta_arr[i];
3413 break;
3414 }
3415 }
3416 }
3417 }
3418 else if (num_values > 0) {
3419 (*alpha) = alpha_arr[0];
3420 (*beta) = beta_arr[0];
3421 }
3422
3423 MemFree(gapOpen_arr);
3424 MemFree(gapExtend_arr);
3425 MemFree(pref_flags);
3426 MemFree(alpha_arr);
3427 MemFree(beta_arr);
3428 }
3429
3430 /*
3431 Conveniently return default/best Karling-Altschul parameters for a given matrix.
3432
3433 */
3434
3435 Int2 LIBCALL
BlastKarlinGetDefaultMatrixValues(CharPtr matrix,Int4Ptr open,Int4Ptr extension,FloatHiPtr lambda,FloatHiPtr K,FloatHiPtr H)3436 BlastKarlinGetDefaultMatrixValues(CharPtr matrix, Int4Ptr open, Int4Ptr extension, FloatHiPtr lambda, FloatHiPtr K, FloatHiPtr H) {
3437 Int4Ptr gapOpen_arr, gapExtend_arr, pref_flags;
3438 FloatHiPtr Lambda_arr, Kappa_arr, H_arr;
3439 Int4 i,n;
3440 if(matrix==NULL)
3441 matrix = "BLOSUM62";
3442 n = BlastKarlinGetMatrixValues(matrix, &gapOpen_arr, &gapExtend_arr, &Lambda_arr, &Kappa_arr, &H_arr,&pref_flags);
3443 if(n) {
3444 *open = gapOpen_arr[0];
3445 *extension = gapExtend_arr[0];
3446 *K = Kappa_arr[0];
3447 *lambda = Lambda_arr[0];
3448 *H = H_arr[0];
3449 for(i=0;i<n;i++) {
3450 if(pref_flags[i]==BLAST_MATRIX_PREFERRED) {
3451 *open = gapOpen_arr[i];
3452 *extension = gapExtend_arr[i];
3453 *K = Kappa_arr[i];
3454 *lambda = Lambda_arr[i];
3455 *H = H_arr[i];
3456 } else if(pref_flags[i]==BLAST_MATRIX_BEST) {
3457 *open = gapOpen_arr[i];
3458 *extension = gapExtend_arr[i];
3459 *K = Kappa_arr[i];
3460 *lambda = Lambda_arr[i];
3461 *H = H_arr[i];
3462 i+=n;
3463 }
3464 }
3465 MemFree(gapOpen_arr);
3466 MemFree(gapExtend_arr);
3467 MemFree(Kappa_arr);
3468 MemFree(Lambda_arr);
3469 MemFree(H_arr);
3470 MemFree(pref_flags);
3471 return 1;
3472 } else
3473 return 0;
3474 }
3475 /*
3476 Supplies lambda, H, and K values, as calcualted by Stephen Altschul
3477 in Methods in Enzy. (vol 266, page 474).
3478
3479 if kbp is NULL, then a validation is perfomed.
3480 */
3481
3482 Int2 LIBCALL
BlastKarlinBlkGappedCalc(BLAST_KarlinBlkPtr kbp,Int4 gap_open,Int4 gap_extend,CharPtr matrix_name,ValNodePtr PNTR error_return)3483 BlastKarlinBlkGappedCalc(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, CharPtr matrix_name, ValNodePtr PNTR error_return)
3484
3485 {
3486
3487 return BlastKarlinBlkGappedCalcEx(kbp, gap_open, gap_extend, (FloatHi) INT2_MAX, matrix_name, error_return);
3488
3489 }
3490
3491 /*
3492 Supplies lambda, H, and K values, as calcualted by Stephen Altschul
3493 in Methods in Enzy. (vol 266, page 474).
3494
3495 if kbp is NULL, then a validation is perfomed.
3496 */
3497
3498 Int2 LIBCALL
BlastKarlinBlkGappedCalcEx(BLAST_KarlinBlkPtr kbp,Int4 gap_open,Int4 gap_extend,Int4 decline_align,CharPtr matrix_name,ValNodePtr PNTR error_return)3499 BlastKarlinBlkGappedCalcEx(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, CharPtr matrix_name, ValNodePtr PNTR error_return)
3500
3501 {
3502 Char buffer[256];
3503 Int2 status = BlastKarlinkGapBlkFill(kbp, gap_open, gap_extend, decline_align, matrix_name);
3504
3505 if (status && error_return)
3506 {
3507 if (status == 1)
3508 {
3509 MatrixInfoPtr matrix_info;
3510 ValNodePtr vnp, head;
3511
3512 vnp = head = BlastLoadMatrixValues();
3513
3514 sprintf(buffer, "%s is not a supported matrix", matrix_name);
3515 BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3516
3517 while (vnp)
3518 {
3519 matrix_info = vnp->data.ptrvalue;
3520 sprintf(buffer, "%s is a supported matrix", matrix_info->name);
3521 BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3522 vnp = vnp->next;
3523 }
3524
3525 BlastMatrixValuesDestruct(head);
3526 }
3527 else if (status == 2)
3528 {
3529 if (decline_align == INT2_MAX)
3530 sprintf(buffer, "Gap existence and extension values of %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, matrix_name);
3531 else
3532 sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
3533 BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3534 BlastKarlinReportAllowedValues(matrix_name, error_return);
3535 }
3536 }
3537
3538 return status;
3539 }
3540
3541 /*
3542 Attempts to fill KarlinBlk for given gap opening, extensions etc.
3543 Will return non-zero status if that fails.
3544
3545 return values: -1 if matrix_name is NULL;
3546 1 if matrix not found
3547 2 if matrix found, but open, extend etc. values not supported.
3548 */
3549 Int2 LIBCALL
BlastKarlinkGapBlkFill(BLAST_KarlinBlkPtr kbp,Int4 gap_open,Int4 gap_extend,Int4 decline_align,CharPtr matrix_name)3550 BlastKarlinkGapBlkFill(BLAST_KarlinBlkPtr kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, CharPtr matrix_name)
3551 {
3552 Boolean found_matrix=FALSE, found_values=FALSE;
3553 array_of_8 *values;
3554 Int2 status=0;
3555 Int4 index, max_number_values=0;
3556 MatrixInfoPtr matrix_info;
3557 ValNodePtr vnp, head;
3558
3559 if (matrix_name == NULL)
3560 return -1;
3561
3562 values = NULL;
3563
3564 vnp = head = BlastLoadMatrixValues();
3565 while (vnp)
3566 {
3567 matrix_info = vnp->data.ptrvalue;
3568 if (StringICmp(matrix_info->name, matrix_name) == 0)
3569 {
3570 values = matrix_info->values;
3571 max_number_values = matrix_info->max_number_values;
3572 found_matrix = TRUE;
3573 break;
3574 }
3575 vnp = vnp->next;
3576 }
3577
3578
3579 if (found_matrix)
3580 {
3581 for (index=0; index<max_number_values; index++)
3582 {
3583 if (Nint(values[index][0]) == gap_open &&
3584 Nint(values[index][1]) == gap_extend &&
3585 (Nint(values[index][2]) == INT2_MAX || Nint(values[index][2]) == decline_align))
3586 {
3587 if (kbp)
3588 {
3589 kbp->Lambda = values[index][3];
3590 kbp->K = values[index][4];
3591 kbp->logK = log(kbp->K);
3592 kbp->H = values[index][5];
3593 }
3594 found_values = TRUE;
3595 break;
3596 }
3597 }
3598
3599 if (found_values == TRUE)
3600 {
3601 status = 0;
3602 }
3603 else
3604 {
3605 status = 2;
3606 }
3607 }
3608 else
3609 {
3610 status = 1;
3611 }
3612
3613 BlastMatrixValuesDestruct(head);
3614
3615 return status;
3616 }
3617
3618 CharPtr
PrintMatrixMessage(const Char * matrix_name)3619 PrintMatrixMessage(const Char *matrix_name)
3620 {
3621 CharPtr buffer=MemNew(1024*sizeof(Char));
3622 CharPtr ptr;
3623 MatrixInfoPtr matrix_info;
3624 ValNodePtr vnp, head;
3625
3626 ptr = buffer;
3627 sprintf(ptr, "%s is not a supported matrix, supported matrices are:\n", matrix_name);
3628
3629 ptr += StringLen(ptr);
3630
3631 vnp = head = BlastLoadMatrixValues();
3632
3633 while (vnp)
3634 {
3635 matrix_info = vnp->data.ptrvalue;
3636 sprintf(ptr, "%s \n", matrix_info->name);
3637 ptr += StringLen(ptr);
3638 vnp = vnp->next;
3639 }
3640 BlastMatrixValuesDestruct(head);
3641
3642 return buffer;
3643 }
3644
3645 CharPtr
PrintAllowedValuesMessage(const Char * matrix_name,Int4 gap_open,Int4 gap_extend,Int4 decline_align)3646 PrintAllowedValuesMessage(const Char *matrix_name, Int4 gap_open, Int4 gap_extend, Int4 decline_align)
3647 {
3648 array_of_8 *values;
3649 Boolean found_matrix=FALSE;
3650 CharPtr buffer, ptr;
3651 Int4 index, max_number_values=0;
3652 MatrixInfoPtr matrix_info;
3653 ValNodePtr vnp, head;
3654
3655 ptr = buffer = MemNew(2048*sizeof(Char));
3656
3657 if (decline_align == INT2_MAX)
3658 sprintf(ptr, "Gap existence and extension values of %ld and %ld not supported for %s\nsupported values are:\n",
3659 (long) gap_open, (long) gap_extend, matrix_name);
3660 else
3661 sprintf(ptr, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s\n supported values are:\n",
3662 (long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
3663
3664 ptr += StringLen(ptr);
3665
3666 vnp = head = BlastLoadMatrixValues();
3667 while (vnp)
3668 {
3669 matrix_info = vnp->data.ptrvalue;
3670 if (StringICmp(matrix_info->name, matrix_name) == 0)
3671 {
3672 values = matrix_info->values;
3673 max_number_values = matrix_info->max_number_values;
3674 found_matrix = TRUE;
3675 break;
3676 }
3677 vnp = vnp->next;
3678 }
3679
3680 if (found_matrix)
3681 {
3682 for (index=0; index<max_number_values; index++)
3683 {
3684 if (Nint(values[index][2]) == INT2_MAX)
3685 sprintf(ptr, "%ld, %ld\n", (long) Nint(values[index][0]), (long) Nint(values[index][1]));
3686 else
3687 sprintf(ptr, "%ld, %ld, %ld\n", (long) Nint(values[index][0]), (long) Nint(values[index][1]), (long) Nint(values[index][2]));
3688 ptr += StringLen(ptr);
3689 }
3690 }
3691
3692 BlastMatrixValuesDestruct(head);
3693
3694 return buffer;
3695 }
3696
3697
3698 Int2 LIBCALL
BlastKarlinReportAllowedValues(const Char * matrix_name,ValNodePtr PNTR error_return)3699 BlastKarlinReportAllowedValues(const Char *matrix_name, ValNodePtr PNTR error_return)
3700 {
3701 array_of_8 *values;
3702 Boolean found_matrix=FALSE;
3703 Char buffer[256];
3704 Int4 index, max_number_values=0;
3705 MatrixInfoPtr matrix_info;
3706 ValNodePtr vnp, head;
3707
3708 vnp = head = BlastLoadMatrixValues();
3709 while (vnp)
3710 {
3711 matrix_info = vnp->data.ptrvalue;
3712 if (StringICmp(matrix_info->name, matrix_name) == 0)
3713 {
3714 values = matrix_info->values;
3715 max_number_values = matrix_info->max_number_values;
3716 found_matrix = TRUE;
3717 break;
3718 }
3719 vnp = vnp->next;
3720 }
3721
3722 if (found_matrix)
3723 {
3724 for (index=0; index<max_number_values; index++)
3725 {
3726 if (Nint(values[index][2]) == INT2_MAX)
3727 sprintf(buffer, "Gap existence and extension values of %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]));
3728 else
3729 sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld are supported", (long) Nint(values[index][0]), (long) Nint(values[index][1]), (long) Nint(values[index][2]));
3730 BlastConstructErrorMessage("BlastKarlinBlkGappedCalc", buffer, 2, error_return);
3731 }
3732 }
3733
3734 BlastMatrixValuesDestruct(head);
3735
3736 return 0;
3737 }
3738
3739 /*
3740 BlastKarlinLtoH
3741
3742 Calculate H, the relative entropy of the p's and q's
3743 */
3744 static Nlm_FloatHi LIBCALL
BlastKarlinLtoH(BLAST_ScoreFreqPtr sfp,Nlm_FloatHi lambda)3745 BlastKarlinLtoH(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi lambda)
3746 {
3747 BLAST_Score score;
3748 Nlm_FloatHi H, etonlam, sum, scale;
3749
3750 Nlm_FloatHi PNTR probs = sfp->sprob;
3751 BLAST_Score low = sfp->obs_min, high = sfp->obs_max;
3752
3753 if (lambda < 0.) {
3754 return -1.;
3755 }
3756 if (BlastScoreChk(low, high) != 0) return -1.;
3757
3758 etonlam = exp( - lambda );
3759 sum = low * probs[low];
3760 for( score = low + 1; score <= high; score++ ) {
3761 sum = score * probs[score] + etonlam * sum;
3762 }
3763
3764 scale = Nlm_Powi( etonlam, high );
3765 if( scale > 0.0 ) {
3766 H = lambda * sum/scale;
3767 } else { /* Underflow of exp( -lambda * high ) */
3768 H = lambda * exp( lambda * high + log(sum) );
3769 }
3770 return H;
3771 }
3772
3773 /*
3774 Everything below here was (more or less) copied from the old
3775 karlin.c and could work separately from the stuff above.
3776 */
3777
3778 /**************** Statistical Significance Parameter Subroutine ****************
3779
3780 Version 1.0 February 2, 1990
3781 Version 1.2 July 6, 1990
3782
3783 Program by: Stephen Altschul
3784
3785 Address: National Center for Biotechnology Information
3786 National Library of Medicine
3787 National Institutes of Health
3788 Bethesda, MD 20894
3789
3790 Internet: altschul@ncbi.nlm.nih.gov
3791
3792 See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
3793 Significance of Molecular Sequence Features by Using General Scoring
3794 Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
3795
3796 Computes the parameters lambda and K for use in calculating the
3797 statistical significance of high-scoring segments or subalignments.
3798
3799 The scoring scheme must be integer valued. A positive score must be
3800 possible, but the expected (mean) score must be negative.
3801
3802 A program that calls this routine must provide the value of the lowest
3803 possible score, the value of the greatest possible score, and a pointer
3804 to an array of probabilities for the occurrence of all scores between
3805 these two extreme scores. For example, if score -2 occurs with
3806 probability 0.7, score 0 occurs with probability 0.1, and score 3
3807 occurs with probability 0.2, then the subroutine must be called with
3808 low = -2, high = 3, and pr pointing to the array of values
3809 { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
3810 pointers to lambda and K; the subroutine will then calculate the values
3811 of these two parameters. In this example, lambda=0.330 and K=0.154.
3812
3813 The parameters lambda and K can be used as follows. Suppose we are
3814 given a length N random sequence of independent letters. Associated
3815 with each letter is a score, and the probabilities of the letters
3816 determine the probability for each score. Let S be the aggregate score
3817 of the highest scoring contiguous segment of this sequence. Then if N
3818 is sufficiently large (greater than 100), the following bound on the
3819 probability that S is greater than or equal to x applies:
3820
3821 P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
3822
3823 In other words, the p-value for this segment can be written as
3824 1-exp[-KN*exp(-lambda*S)].
3825
3826 This formula can be applied to pairwise sequence comparison by assigning
3827 scores to pairs of letters (e.g. amino acids), and by replacing N in the
3828 formula with N*M, where N and M are the lengths of the two sequences
3829 being compared.
3830
3831 In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
3832 distinct segments all with score >= S is given by:
3833
3834 2 m-1 -y
3835 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
3836
3837 Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
3838 as the previous formula.
3839
3840 *******************************************************************************/
3841
3842 Int2 LIBCALL
BlastKarlinBlkCalc(BLAST_KarlinBlkPtr kbp,BLAST_ScoreFreqPtr sfp)3843 BlastKarlinBlkCalc(BLAST_KarlinBlkPtr kbp, BLAST_ScoreFreqPtr sfp)
3844 {
3845
3846
3847 if (kbp == NULL || sfp == NULL)
3848 return 1;
3849
3850 /* Calculate the parameter Lambda */
3851
3852 kbp->Lambda = BlastKarlinLambdaNR(sfp);
3853 if (kbp->Lambda < 0.)
3854 goto ErrExit;
3855
3856
3857 /* Calculate H */
3858
3859 kbp->H = BlastKarlinLtoH(sfp, kbp->Lambda);
3860 if (kbp->H < 0.)
3861 goto ErrExit;
3862
3863
3864 /* Calculate K and log(K) */
3865
3866 kbp->K = BlastKarlinLHtoK(sfp, kbp->Lambda, kbp->H);
3867 if (kbp->K < 0.)
3868 goto ErrExit;
3869 kbp->logK = log(kbp->K);
3870
3871 /* Normal return */
3872 return 0;
3873
3874 ErrExit:
3875 kbp->Lambda = kbp->H = kbp->K = -1.;
3876 #ifdef BLASTKAR_HUGE_VAL
3877 kbp->logK = BLASTKAR_HUGE_VAL;
3878 #else
3879 kbp->logK = 1.e30;
3880 #endif
3881 return 1;
3882 }
3883 #define DIMOFP0 (iterlimit*range + 1)
3884 #define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)
3885
3886
3887 #define smallLambdaThreshold 20 /*defines special case in K computation*/
3888 /*threshold is on exp(-Lambda)*/
3889
3890 /*The following procedure computes K. The input includes Lambda, H,
3891 * and an array of probabilities for each score.
3892 * There are distinct closed form for three cases:
3893 * 1. high score is 1 low score is -1
3894 * 2. high score is 1 low score is not -1
3895 * 3. low score is -1, high score is not 1
3896 *
3897 * Otherwise, in most cases the value is computed as:
3898 * -exp(-2.0*outerSum) / ((H/lambda)*(exp(-lambda) - 1)
3899 * The last term (exp(-lambda) - 1) can be computed in two different
3900 * ways depending on whether lambda is small or not.
3901 * outerSum is a sum of the terms
3902 * innerSum/j, where j is denoted by iterCounter in the code.
3903 * The sum is truncated when the new term innersum/j i sufficiently small.
3904 * innerSum is a weighted sum of the probabilities of
3905 * of achieving a total score i in a gapless alignment,
3906 * which we denote by P(i,j).
3907 * of exactly j characters. innerSum(j) has two parts
3908 * Sum over i < 0 P(i,j)exp(-i * lambda) +
3909 * Sum over i >=0 P(i,j)
3910 * The terms P(i,j) are computed by dynamic programming.
3911 * An earlier version was flawed in that ignored the special case 1
3912 * and tried to replace the tail of the computation of outerSum
3913 * by a geometric series, but the base of the geometric series
3914 * was not accurately estimated in some cases.
3915 */
3916
3917 Nlm_FloatHi
BlastKarlinLHtoK(BLAST_ScoreFreqPtr sfp,Nlm_FloatHi lambda,Nlm_FloatHi H)3918 BlastKarlinLHtoK(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi lambda, Nlm_FloatHi H)
3919 {
3920 /*The next array stores the probabilities of getting each possible
3921 score in an alignment of fixed length; the array is shifted
3922 during part of the computation, so that
3923 entry 0 is for score 0. */
3924 Nlm_FloatHi PNTR alignmentScoreProbabilities = NULL;
3925 BLAST_Score low; /* Lowest score (must be negative) */
3926 BLAST_Score high; /* Highest score (must be positive) */
3927 BLAST_Score range; /* range of scores, computed as high - low*/
3928 Nlm_FloatHi K; /* local copy of K to return*/
3929 Int4 i; /*loop index*/
3930 Int4 iterCounter; /*counter on iterations*/
3931 BLAST_Score divisor; /*candidate divisor of all scores with
3932 non-zero probabilities*/
3933 /*highest and lowest possible alignment scores for current length*/
3934 BLAST_Score lowAlignmentScore, highAlignmentScore;
3935 BLAST_Score first, last; /*loop indices for dynamic program*/
3936 register Nlm_FloatHi innerSum;
3937 Nlm_FloatHi oldsum, oldsum2; /* values of innerSum on previous
3938 iterations*/
3939 Nlm_FloatHi outerSum; /* holds sum over j of (innerSum
3940 for iteration j/j)*/
3941
3942 Nlm_FloatHi score_avg; /*average score*/
3943 /*first term to use in the closed form for the case where
3944 high == 1 or low == -1, but not both*/
3945 Nlm_FloatHi firstTermClosedForm; /*usually store H/lambda*/
3946 Int4 iterlimit; /*upper limit on iterations*/
3947 Nlm_FloatHi sumlimit; /*lower limit on contributions
3948 to sum over scores*/
3949
3950 /*array of score probabilities reindexed so that low is at index 0*/
3951 Nlm_FloatHi PNTR probArrayStartLow;
3952
3953 /*pointers used in dynamic program*/
3954 Nlm_FloatHi PNTR ptrP, PNTR ptr1, PNTR ptr2, PNTR ptr1e;
3955 Nlm_FloatHi expMinusLambda; /*e^^(-Lambda) */
3956
3957 if (lambda <= 0. || H <= 0.) {
3958 /* Theory dictates that H and lambda must be positive, so
3959 * return -1 to indicate an error */
3960 return -1.;
3961 }
3962
3963 /*Karlin-Altschul theory works only if the expected score
3964 is negative*/
3965 if (sfp->score_avg >= 0.0) {
3966 return -1.;
3967 }
3968
3969 low = sfp->obs_min;
3970 high = sfp->obs_max;
3971 range = high - low;
3972
3973 probArrayStartLow = &sfp->sprob[low];
3974 /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
3975 Karlin&Altschul (1990) */
3976 for (i = 1, divisor = -low; i <= range && divisor > 1; ++i) {
3977 if (probArrayStartLow[i] != 0.0)
3978 divisor = Nlm_Gcd(divisor, i);
3979 }
3980
3981 high /= divisor;
3982 low /= divisor;
3983 lambda *= divisor;
3984
3985 range = high - low;
3986
3987 firstTermClosedForm = H/lambda;
3988 expMinusLambda = exp((Nlm_FloatHi) -lambda);
3989
3990 if (low == -1 && high == 1) {
3991 K = (sfp->sprob[low*divisor] - sfp->sprob[high*divisor]) *
3992 (sfp->sprob[low*divisor] - sfp->sprob[high*divisor]) /
3993 sfp->sprob[low*divisor];
3994 return(K);
3995 }
3996
3997 if (low == -1 || high == 1) {
3998 if (high != 1) {
3999 score_avg = sfp->score_avg / divisor;
4000 firstTermClosedForm
4001 = (score_avg * score_avg) / firstTermClosedForm;
4002 }
4003 return firstTermClosedForm * (1.0 - expMinusLambda);
4004 }
4005
4006 sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
4007 iterlimit = BLAST_KARLIN_K_ITER_MAX;
4008
4009 if (DIMOFP0 > DIMOFP0_MAX) {
4010 return -1.;
4011 }
4012 alignmentScoreProbabilities =
4013 (Nlm_FloatHi PNTR)MemNew(DIMOFP0 *
4014 sizeof(*alignmentScoreProbabilities));
4015 if (alignmentScoreProbabilities == NULL)
4016 return -1.;
4017
4018 outerSum = 0.;
4019 lowAlignmentScore = highAlignmentScore = 0;
4020 alignmentScoreProbabilities[0] = innerSum = oldsum = oldsum2 = 1.;
4021
4022 for (iterCounter = 0;
4023 ((iterCounter < iterlimit) && (innerSum > sumlimit));
4024 outerSum += innerSum /= ++iterCounter) {
4025 first = last = range;
4026 lowAlignmentScore += low;
4027 highAlignmentScore += high;
4028 /*dynamic program to compute P(i,j)*/
4029 for (ptrP = alignmentScoreProbabilities +
4030 (highAlignmentScore-lowAlignmentScore);
4031 ptrP >= alignmentScoreProbabilities;
4032 *ptrP-- =innerSum) {
4033 ptr1 = ptrP - first;
4034 ptr1e = ptrP - last;
4035 ptr2 = probArrayStartLow + first;
4036 for (innerSum = 0.; ptr1 >= ptr1e; )
4037 innerSum += *ptr1-- * *ptr2++;
4038 if (first)
4039 --first;
4040 if (ptrP - alignmentScoreProbabilities <= range)
4041 --last;
4042 }
4043 /* Horner's rule */
4044 innerSum = *++ptrP;
4045 for( i = lowAlignmentScore + 1; i < 0; i++ ) {
4046 innerSum = *++ptrP + innerSum * expMinusLambda;
4047 }
4048 innerSum *= expMinusLambda;
4049
4050 for (; i <= highAlignmentScore; ++i)
4051 innerSum += *++ptrP;
4052 oldsum2 = oldsum;
4053 oldsum = innerSum;
4054 }
4055
4056 #ifdef ADD_GEOMETRIC_TERMS_TO_K
4057 /*old code assumed that the later terms in sum were
4058 asymptotically comparable to those of a geometric
4059 progression, and tried to speed up convergence by
4060 guessing the estimated ratio between sucessive terms
4061 and using the explicit terms of a geometric progression
4062 to speed up convergence. However, the assumption does not
4063 always hold, and convergenece of the above code is fast
4064 enough in practice*/
4065 /* Terms of geometric progression added for correction */
4066 {
4067 Nlm_FloatHi ratio; /* fraction used to generate the
4068 geometric progression */
4069
4070 ratio = oldsum / oldsum2;
4071 if (ratio >= (1.0 - sumlimit*0.001)) {
4072 K = -1.;
4073 if (alignmentScoreProbabilities != NULL)
4074 MemFree(alignmentScoreProbabilities);
4075 return K;
4076 }
4077 sumlimit *= 0.01;
4078 while (innerSum > sumlimit) {
4079 oldsum *= ratio;
4080 outerSum += innerSum = oldsum / ++iterCounter;
4081 }
4082 }
4083 #endif
4084
4085 if (expMinusLambda < smallLambdaThreshold ) {
4086 K = -exp((double)-2.0*outerSum) /
4087 (firstTermClosedForm*(expMinusLambda - 1.0));
4088 } else {
4089 K = -exp((double)-2.0*outerSum) /
4090 (firstTermClosedForm*Expm1(-(double)lambda));
4091 }
4092
4093 if (alignmentScoreProbabilities != NULL)
4094 MemFree(alignmentScoreProbabilities);
4095
4096 return K;
4097 }
4098
4099
4100 /**
4101 * Find positive solution to
4102 *
4103 * sum_{i=low}^{high} exp(i lambda) * probs[i] = 1.
4104 *
4105 * Note that this solution does not exist unless the average score is
4106 * negative and the largest score that occurs with nonzero probability
4107 * is positive.
4108 *
4109 * @param probs probabilities of a score occurring
4110 * @param d the gcd of the possible scores. This equals 1 if
4111 * the scores are not a lattice
4112 * @param low the lowest possible score that occurs with
4113 * nonzero probability
4114 * @param high the highest possible score that occurs with
4115 * nonzero probability.
4116 * @param lambda0 an initial guess for lambda
4117 * @param tolx the tolerance to which lambda must be computed
4118 * @param itmax the maximum number of times the function may be
4119 * evaluated
4120 * @param maxNewton the maximum permissible number of Newton
4121 * iterations; after that the computation will proceed
4122 * by bisection.
4123 * @param *itn the number of iterations needed to compute Lambda,
4124 * or itmax if Lambda could not be computed.
4125 *
4126 * Let phi(lambda) = sum_{i=low}^{high} exp(i lambda) - 1. Then
4127 * phi(lambda) may be written
4128 *
4129 * phi(lamdba) = exp(u lambda) f( exp(-lambda) )
4130 *
4131 * where f(x) is a polynomial that has exactly two zeros, one at x = 1
4132 * and one at x = exp(-lamdba). It is simpler to solve this problem
4133 * in x = exp(-lambda) than it is to solve it in lambda, because we
4134 * know that for x, a solution lies in [0,1], and because Newton's
4135 * method is generally more stable and efficient for polynomials than
4136 * it is for exponentials.
4137 *
4138 * For the most part, this function is a standard safeguarded Newton
4139 * iteration: define an interval of uncertainty [a,b] with f(a) > 0
4140 * and f(b) < 0 (except for the initial value b = 1, where f(b) = 0);
4141 * evaluate the function and use the sign of that value to shrink the
4142 * interval of uncertainty; compute a Newton step; and if the Newton
4143 * step suggests a point outside the interval of uncertainty or fails
4144 * to decrease the function sufficiently, then bisect. There are
4145 * three further details needed to understand the algorithm:
4146 *
4147 * 1) If y the unique solution in [0,1], then f is positive to the left of
4148 * y, and negative to the right. Therefore, we may determine whether
4149 * the Newton step -f(x)/f'(x) is moving toward, or away from, y by
4150 * examining the sign of f'(x). If f'(x) >= 0, we bisect instead
4151 * of taking the Newton step.
4152 * 2) There is a neighborhood around x = 1 for which f'(x) >= 0, so
4153 * (1) prevents convergence to x = 1 (and for a similar reason
4154 * prevents convergence to x = 0, if the function is incorrectly
4155 * called with probs[high] == 0).
4156 * 3) Conditions like fabs(p) < lambda_tolerance * x * (1-x) are used in
4157 * convergence criteria because these values translate to a bound
4158 * on the relative error in lambda. This is proved in the
4159 * "Blast Scoring Parameters" document that accompanies the BLAST
4160 * code.
4161 *
4162 * The iteration on f(x) is robust and doesn't overflow; defining a
4163 * robust safeguarded Newton iteration on phi(lambda) that cannot
4164 * converge to lambda = 0 and that is protected against overflow is
4165 * more difficult. So (despite the length of this comment) the Newton
4166 * iteration on f(x) is the simpler solution.
4167 */
4168
4169 static Nlm_FloatHi
NlmKarlinLambdaNR(Nlm_FloatHi PNTR probs,BLAST_Score d,BLAST_Score low,BLAST_Score high,Nlm_FloatHi lambda0,Nlm_FloatHi tolx,int itmax,int maxNewton,int * itn)4170 NlmKarlinLambdaNR(Nlm_FloatHi PNTR probs, BLAST_Score d, BLAST_Score low,
4171 BLAST_Score high, Nlm_FloatHi lambda0, Nlm_FloatHi tolx,
4172 int itmax, int maxNewton, int * itn)
4173 {
4174 int k;
4175 Nlm_FloatHi x0, x, a = 0, b = 1;
4176 Nlm_FloatHi f = 4; /* Larger than any possible value of the poly in [0,1] */
4177 int isNewton = 0; /* we haven't yet taken a Newton step. */
4178
4179 assert( d > 0 );
4180
4181 x0 = exp( -lambda0 );
4182 x = ( 0 < x0 && x0 < 1 ) ? x0 : .5;
4183
4184 for( k = 0; k < itmax; k++ ) { /* all iteration indices k */
4185 int i;
4186 Nlm_FloatHi g, fold = f;
4187 int wasNewton = isNewton; /* If true, then the previous step was a */
4188 /* Newton step */
4189 isNewton = 0; /* Assume that this step is not */
4190
4191 /* Horner's rule for evaluating a polynomial and its derivative */
4192 g = 0;
4193 f = probs[low];
4194 for( i = low + d; i < 0; i += d ) {
4195 g = x * g + f;
4196 f = f * x + probs[i];
4197 }
4198 g = x * g + f;
4199 f = f * x + probs[0] - 1;
4200 for( i = d; i <= high; i += d ) {
4201 g = x * g + f;
4202 f = f * x + probs[i];
4203 }
4204 /* End Horner's rule */
4205
4206 if( f > 0 ) {
4207 a = x; /* move the left endpoint */
4208 } else if( f < 0 ) {
4209 b = x; /* move the right endpoint */
4210 } else { /* f == 0 */
4211 break; /* x is an exact solution */
4212 }
4213 if( b - a < 2 * a * ( 1 - b ) * tolx ) {
4214 /* The midpoint of the interval converged */
4215 x = (a + b) / 2; break;
4216 }
4217
4218 if( k >= maxNewton ||
4219 /* If convergence of Newton's method appears to be failing; or */
4220 ( wasNewton && fabs( f ) > .9 * fabs(fold) ) ||
4221 /* if the previous iteration was a Newton step but didn't decrease
4222 * f sufficiently; or */
4223 g >= 0
4224 /* if a Newton step will move us away from the desired solution */
4225 ) { /* then */
4226 /* bisect */
4227 x = (a + b)/2;
4228 } else {
4229 /* try a Newton step */
4230 double p = - f/g;
4231 double y = x + p;
4232 if( y <= a || y >= b ) { /* The proposed iterate is not in (a,b) */
4233 x = (a + b)/2;
4234 } else { /* The proposed iterate is in (a,b). Accept it. */
4235 isNewton = 1;
4236 x = y;
4237 if( fabs( p ) < tolx * x * (1-x) ) break; /* Converged */
4238 } /* else the proposed iterate is in (a,b) */
4239 } /* else try a Newton step. */
4240 } /* end for all iteration indices k */
4241 *itn = k;
4242 return -log(x)/d;
4243 }
4244
4245
4246 Nlm_FloatHi
BlastKarlinLambdaNR(BLAST_ScoreFreqPtr sfp)4247 BlastKarlinLambdaNR(BLAST_ScoreFreqPtr sfp)
4248 {
4249 BLAST_Score low; /* Lowest score (must be negative) */
4250 BLAST_Score high; /* Highest score (must be positive) */
4251 int itn;
4252 BLAST_Score i, d;
4253 Nlm_FloatHi PNTR sprob;
4254 Nlm_FloatHi returnValue;
4255
4256 low = sfp->obs_min;
4257 high = sfp->obs_max;
4258 if (sfp->score_avg >= 0.) { /* Expected score must be negative */
4259 return -1.0;
4260 }
4261 if (BlastScoreChk(low, high) != 0) return -1.;
4262
4263 sprob = sfp->sprob;
4264 /* Find greatest common divisor of all scores */
4265 for (i = 1, d = -low; i <= high-low && d > 1; ++i) {
4266 if (sprob[i+low] != 0.0) {
4267 d = Nlm_Gcd(d, i);
4268 }
4269 }
4270 returnValue =
4271 NlmKarlinLambdaNR( sprob, d, low, high,
4272 BLAST_KARLIN_LAMBDA0_DEFAULT,
4273 BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT,
4274 20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
4275
4276
4277 return returnValue;
4278 }
4279
4280 Nlm_FloatHi LIBCALL
impalaKarlinLambdaNR(BLAST_ScoreFreqPtr sfp,Nlm_FloatHi initialLambda)4281 impalaKarlinLambdaNR(BLAST_ScoreFreqPtr sfp, Nlm_FloatHi initialLambda)
4282 {
4283 Nlm_FloatHi returnValue;
4284 int itn;
4285 Nlm_FloatHi PNTR sprob = sfp->sprob;
4286
4287 if (sfp->score_avg >= 0.) { /* Expected score must be negative */
4288 return -1.0;
4289 }
4290 {
4291 Boolean foundPositive = FALSE;
4292 BLAST_Score j;
4293 for(j = 1; j <=sfp->obs_max; j++) {
4294 if (sprob[j] > 0.0) {
4295 foundPositive = TRUE;
4296 break;
4297 }
4298 }
4299 if (!foundPositive) return(-1);
4300 }
4301
4302 returnValue =
4303 NlmKarlinLambdaNR( sprob, 1, sfp->obs_min, sfp->obs_max,
4304 initialLambda, BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT,
4305 20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
4306
4307 return returnValue;
4308 }
4309
4310
4311
4312 /* Compute a divisor used to weight the evalue of a collection of
4313 * "nsegs" distinct alignments. These divisors are used to compensate
4314 * for the effect of choosing the best among multiple collections of
4315 * alignments. See
4316 *
4317 * Stephen F. Altschul. Evaluating the statitical significance of
4318 * multiple distinct local alignments. In Suhai, editior, Theoretical
4319 * and Computational Methods in Genome Research, pages 1-14. Plenum
4320 * Press, New York, 1997.
4321 *
4322 * The "decayrate" parameter of this routine is a value in the
4323 * interval (0,1). Typical values of decayrate are .1 and .5. */
4324
4325 Nlm_FloatHi LIBCALL
BlastGapDecayDivisor(Nlm_FloatHi decayrate,unsigned nsegs)4326 BlastGapDecayDivisor(Nlm_FloatHi decayrate, unsigned nsegs )
4327 {
4328 return (1. - decayrate) * Nlm_Powi(decayrate, nsegs - 1);
4329 }
4330
4331 /*
4332 BlastCutoffs
4333
4334 Calculate the cutoff score, S, and the highest expected score.
4335
4336 WRG (later modified by TLM).
4337 */
4338 Int2 LIBCALL
BlastCutoffs(BLAST_ScorePtr S,Nlm_FloatHi PNTR E,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp,Nlm_Boolean dodecay,Nlm_FloatHi gap_decay_rate)4339 BlastCutoffs(BLAST_ScorePtr S, /* cutoff score */
4340 Nlm_FloatHi PNTR E, /* expected no. of HSPs scoring at or above S */
4341 BLAST_KarlinBlkPtr kbp,
4342 Nlm_FloatHi searchsp, /* size of search space. */
4343 Nlm_Boolean dodecay, /* TRUE ==> use gapdecay feature */
4344 Nlm_FloatHi gap_decay_rate )
4345 {
4346 BLAST_Score s = *S, es;
4347 Nlm_FloatHi e = *E, esave;
4348 Boolean s_changed = FALSE;
4349
4350 if (kbp->Lambda == -1. || kbp->K == -1. || kbp->H == -1.)
4351 return 1;
4352
4353 /*
4354 Calculate a cutoff score, S, from the Expected
4355 (or desired) number of reported HSPs, E.
4356 */
4357 es = 1;
4358 esave = e;
4359 if (e > 0.)
4360 {
4361 if (dodecay) {
4362 /* Invert the adjustment to the e-value that will be applied
4363 * to compensate for the effect of choosing the best among
4364 * multiple alignments */
4365 if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
4366 e *= BlastGapDecayDivisor(gap_decay_rate, 1);
4367 }
4368 }
4369 es = BlastKarlinEtoS_simple(e, kbp, searchsp);
4370 }
4371 /*
4372 Pick the larger cutoff score between the user's choice
4373 and that calculated from the value of E.
4374 */
4375 if (es > s) {
4376 s_changed = TRUE;
4377 *S = s = es;
4378 }
4379
4380 /*
4381 Re-calculate E from the cutoff score, if E going in was too high
4382 */
4383 if (esave <= 0. || !s_changed)
4384 {
4385 e = BlastKarlinStoE_simple(s, kbp, searchsp);
4386 if (dodecay) {
4387 /* Weight the e-value to compensate for the effect of
4388 * choosing the best of more than one collection of
4389 * distinct alignments */
4390 if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
4391 e /= BlastGapDecayDivisor(gap_decay_rate, 1);
4392 }
4393 }
4394 *E = e;
4395 }
4396
4397 return 0;
4398 }
4399
4400 /*
4401 BlastKarlinEtoS() -- given an Expect value, return the associated cutoff score
4402
4403 Error return value is BLAST_SCORE_MIN
4404 */
4405 BLAST_Score LIBCALL
BlastKarlinEtoS(Nlm_FloatHi E,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi qlen,Nlm_FloatHi dblen)4406 BlastKarlinEtoS(Nlm_FloatHi E, /* Expect value */
4407 BLAST_KarlinBlkPtr kbp,
4408 Nlm_FloatHi qlen, /* length of query sequence */
4409 Nlm_FloatHi dblen) /* length of database */
4410 {
4411 return BlastKarlinEtoS_simple(E, kbp, qlen*dblen);
4412 }
4413
4414
4415 /* Smallest float that might not cause a floating point exception in
4416 S = (BLAST_Score) (ceil( log((Nlm_FloatHi)(K * searchsp / E)) / Lambda ));
4417 below.
4418 */
4419 #define BLASTKAR_SMALL_FLOAT 1.0e-297
4420 BLAST_Score LIBCALL
BlastKarlinEtoS_simple(Nlm_FloatHi E,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp)4421 BlastKarlinEtoS_simple(Nlm_FloatHi E, /* Expect value */
4422 BLAST_KarlinBlkPtr kbp,
4423 Nlm_FloatHi searchsp) /* size of search space */
4424 {
4425
4426 Nlm_FloatHi Lambda, K, H; /* parameters for Karlin statistics */
4427 BLAST_Score S;
4428
4429 Lambda = kbp->Lambda;
4430 K = kbp->K;
4431 H = kbp->H;
4432 if (Lambda < 0. || K < 0. || H < 0.)
4433 {
4434 return BLAST_SCORE_MIN;
4435 }
4436
4437 E = MAX(E, BLASTKAR_SMALL_FLOAT);
4438
4439 S = (BLAST_Score) (ceil( log((Nlm_FloatHi)(K * searchsp / E)) / Lambda ));
4440 return S;
4441 }
4442
4443 /*
4444 BlastKarlinStoP
4445
4446 Calculate the probability (as opposed to expectation)
4447 of achieving a particular score.
4448
4449 On error, return value is -1. (same as BlastKarlinStoE()).
4450 */
4451 Nlm_FloatHi LIBCALL
BlastKarlinStoP(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi qlen,Nlm_FloatHi dblen)4452 BlastKarlinStoP(BLAST_Score S,
4453 BLAST_KarlinBlkPtr kbp,
4454 Nlm_FloatHi qlen, /* length of query sequence */
4455 Nlm_FloatHi dblen) /* length of database */
4456 {
4457 return BlastKarlinStoP_simple(S, kbp, qlen*dblen);
4458 }
4459
4460 Nlm_FloatHi LIBCALL
BlastKarlinStoP_simple(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp)4461 BlastKarlinStoP_simple(BLAST_Score S,
4462 BLAST_KarlinBlkPtr kbp,
4463 Nlm_FloatHi searchsp) /* size of search space. */
4464 {
4465 Nlm_FloatHi x, p;
4466
4467 x = BlastKarlinStoE_simple(S, kbp, searchsp);
4468 if (x == -1.)
4469 return x;
4470 p = BlastKarlinEtoP(x);
4471
4472 return p;
4473 }
4474
4475 /*
4476 BlastKarlinStoE() -- given a score, return the associated Expect value
4477
4478 Error return value is -1.
4479 */
4480 Nlm_FloatHi LIBCALL
BlastKarlinStoE(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi qlen,Nlm_FloatHi dblen)4481 BlastKarlinStoE(BLAST_Score S,
4482 BLAST_KarlinBlkPtr kbp,
4483 Nlm_FloatHi qlen, /* length of query sequence */
4484 Nlm_FloatHi dblen) /* length of database */
4485 {
4486 return BlastKarlinStoE_simple(S, kbp, qlen*dblen);
4487 }
4488
4489 Nlm_FloatHi LIBCALL
BlastKarlinStoE_simple(BLAST_Score S,BLAST_KarlinBlkPtr kbp,Nlm_FloatHi searchsp)4490 BlastKarlinStoE_simple(BLAST_Score S,
4491 BLAST_KarlinBlkPtr kbp,
4492 Nlm_FloatHi searchsp) /* size of search space. */
4493 {
4494 Nlm_FloatHi Lambda, K, H; /* parameters for Karlin statistics */
4495
4496 Lambda = kbp->Lambda;
4497 K = kbp->K;
4498 H = kbp->H;
4499 if (Lambda < 0. || K < 0. || H < 0.) {
4500 return -1.;
4501 }
4502
4503 return searchsp * exp((Nlm_FloatHi)(-Lambda * S) + kbp->logK);
4504 }
4505
4506 /*
4507 BlastKarlinPtoE -- convert a P-value to an Expect value
4508 When using BlastKarlinPtoE in the context of a database search,
4509 the returned E-value should be multiplied by the effective
4510 length of the database and divided by the effective lnegth of
4511 the subject.
4512 */
4513 Nlm_FloatHi LIBCALL
BlastKarlinPtoE(Nlm_FloatHi p)4514 BlastKarlinPtoE(Nlm_FloatHi p)
4515 {
4516 if (p < 0. || p > 1.0)
4517 {
4518 return INT4_MIN;
4519 }
4520
4521 if (p == 1)
4522 return INT4_MAX;
4523
4524 return -Nlm_Log1p(-p);
4525 }
4526
4527 /*
4528 BlastKarlinEtoP -- convert an Expect value to a P-value
4529 When using BlastKarlinEtoP in the context of a database search,
4530 the input paramter E-value should be divided by the effective
4531 length of the database and multiplied by the effective lnegth of
4532 the subject, before BlastKarlinEtoP is called.
4533 */
4534 Nlm_FloatHi LIBCALL
BlastKarlinEtoP(Nlm_FloatHi x)4535 BlastKarlinEtoP(Nlm_FloatHi x)
4536 {
4537 return -Nlm_Expm1((Nlm_FloatHi)-x);
4538 }
4539
4540 /*
4541 BlastKarlinStoLen()
4542
4543 Given a score, return the length expected for an HSP of that score
4544 */
4545 Nlm_FloatHi LIBCALL
BlastKarlinStoLen(BLAST_KarlinBlkPtr kbp,BLAST_Score S)4546 BlastKarlinStoLen(BLAST_KarlinBlkPtr kbp, BLAST_Score S)
4547 {
4548 return kbp->Lambda * S / kbp->H;
4549 }
4550
4551 static Nlm_FloatHi tab2[] = { /* table for r == 2 */
4552 0.01669, 0.0249, 0.03683, 0.05390, 0.07794, 0.1111, 0.1559, 0.2146,
4553 0.2890, 0.3794, 0.4836, 0.5965, 0.7092, 0.8114, 0.8931, 0.9490,
4554 0.9806, 0.9944, 0.9989
4555 };
4556
4557 static Nlm_FloatHi tab3[] = { /* table for r == 3 */
4558 0.9806, 0.9944, 0.9989, 0.0001682,0.0002542,0.0003829,0.0005745,0.0008587,
4559 0.001278, 0.001893, 0.002789, 0.004088, 0.005958, 0.008627, 0.01240, 0.01770,
4560 0.02505, 0.03514, 0.04880, 0.06704, 0.09103, 0.1220, 0.1612, 0.2097,
4561 0.2682, 0.3368, 0.4145, 0.4994, 0.5881, 0.6765, 0.7596, 0.8326,
4562 0.8922, 0.9367, 0.9667, 0.9846, 0.9939, 0.9980
4563 };
4564
4565 static Nlm_FloatHi tab4[] = { /* table for r == 4 */
4566 2.658e-07,4.064e-07,6.203e-07,9.450e-07,1.437e-06,2.181e-06,3.302e-06,4.990e-06,
4567 7.524e-06,1.132e-05,1.698e-05,2.541e-05,3.791e-05,5.641e-05,8.368e-05,0.0001237,
4568 0.0001823,0.0002677,0.0003915,0.0005704,0.0008275,0.001195, 0.001718, 0.002457,
4569 0.003494, 0.004942, 0.006948, 0.009702, 0.01346, 0.01853, 0.02532, 0.03431,
4570 0.04607, 0.06128, 0.08068, 0.1051, 0.1352, 0.1719, 0.2157, 0.2669,
4571 0.3254, 0.3906, 0.4612, 0.5355, 0.6110, 0.6849, 0.7544, 0.8168,
4572 0.8699, 0.9127, 0.9451, 0.9679, 0.9827, 0.9915, 0.9963
4573 };
4574
4575 static Nlm_FloatHi PNTR table[] = { tab2, tab3, tab4 };
4576 static short tabsize[] = { DIM(tab2)-1, DIM(tab3)-1, DIM(tab4)-1 };
4577
4578 static Nlm_FloatHi LIBCALL f PROTO((Nlm_FloatHi,Nlm_VoidPtr));
4579 static Nlm_FloatHi LIBCALL g PROTO((Nlm_FloatHi,Nlm_VoidPtr));
4580
4581
4582 /*
4583 Estimate the Sum P-value by calculation or interpolation, as appropriate.
4584 Approx. 2-1/2 digits accuracy minimum throughout the range of r, s.
4585 r = number of segments
4586 s = total score (in nats), adjusted by -r*log(KN)
4587 */
4588 Nlm_FloatHi LIBCALL
BlastSumP(Int4 r,Nlm_FloatHi s)4589 BlastSumP(Int4 r, Nlm_FloatHi s)
4590 {
4591 Int4 i, r1, r2;
4592 Nlm_FloatHi a;
4593
4594 if (r == 1)
4595 return -Nlm_Expm1(-exp(-s));
4596
4597 if (r <= 4) {
4598 if (r < 1)
4599 return 0.;
4600 r1 = r - 1;
4601 if (s >= r*r+r1) {
4602 a = Nlm_LnGammaInt(r+1);
4603 return r * exp(r1*log(s)-s-a-a);
4604 }
4605 if (s > -2*r) {
4606 /* interpolate */
4607 i = (Int4) (a = s+s+(4*r));
4608 a -= i;
4609 i = tabsize[r2 = r - 2] - i;
4610 return a*table[r2][i-1] + (1.-a)*table[r2][i];
4611 }
4612 return 1.;
4613 }
4614
4615 return BlastSumPCalc(r, s);
4616 }
4617
4618 /*
4619 BlastSumPCalc
4620
4621 Evaluate the following Nlm_FloatHi integral, where r = number of segments
4622 and s = the adjusted score in nats:
4623
4624 (r-2) oo oo
4625 Prob(r,s) = r - - (r-2)
4626 ------------- | exp(-y) | x exp(-exp(x - y/r)) dx dy
4627 (r-1)! (r-2)! U U
4628 s 0
4629 */
4630 static Nlm_FloatHi
BlastSumPCalc(int r,Nlm_FloatHi s)4631 BlastSumPCalc(int r, Nlm_FloatHi s)
4632 {
4633 int r1, itmin;
4634 Nlm_FloatHi t, d, epsilon;
4635 Nlm_FloatHi est_mean, mean, stddev, stddev4;
4636 Nlm_FloatHi xr, xr1, xr2, logr;
4637 Nlm_FloatHi args[6];
4638
4639 epsilon = BLAST_SUMP_EPSILON_DEFAULT; /* accuracy for SumP calcs. */
4640
4641 if (r == 1) {
4642 if (s > 8.)
4643 return exp(-s);
4644 return -Nlm_Expm1(-exp(-s));
4645 }
4646 if (r < 1)
4647 return 0.;
4648
4649 xr = r;
4650
4651 if (r < 8) {
4652 if (s <= -2.3*xr)
4653 return 1.;
4654 }
4655 else if (r < 15) {
4656 if (s <= -2.5*xr)
4657 return 1.;
4658 }
4659 else if (r < 27) {
4660 if (s <= -3.0*xr)
4661 return 1.;
4662 }
4663 else if (r < 51) {
4664 if (s <= -3.4*xr)
4665 return 1.;
4666 }
4667 else if (r < 101) {
4668 if (s <= -4.0*xr)
4669 return 1.;
4670 }
4671
4672 /* stddev in the limit of infinite r, but quite good for even small r */
4673 stddev = sqrt(xr);
4674 stddev4 = 4.*stddev;
4675 xr1 = r1 = r - 1;
4676
4677 if (r > 100) {
4678 /* Calculate lower bound on the mean using inequality log(r) <= r */
4679 est_mean = -xr * xr1;
4680 if (s <= est_mean - stddev4)
4681 return 1.;
4682 }
4683
4684 /* mean is rather close to the mode, and the mean is readily calculated */
4685 /* mean in the limit of infinite r, but quite good for even small r */
4686 logr = log(xr);
4687 mean = xr * (1. - logr) - 0.5;
4688 if (s <= mean - stddev4)
4689 return 1.;
4690
4691 if (s >= mean) {
4692 t = s + 6.*stddev;
4693 itmin = 1;
4694 }
4695 else {
4696 t = mean + 6.*stddev;
4697 itmin = 2;
4698 }
4699
4700 #define ARG_R args[0]
4701 #define ARG_R2 args[1]
4702 #define ARG_ADJ1 args[2]
4703 #define ARG_ADJ2 args[3]
4704 #define ARG_SDIVR args[4]
4705 #define ARG_EPS args[5]
4706
4707 ARG_R = xr;
4708 ARG_R2 = xr2 = r - 2;
4709 ARG_ADJ1 = xr2*logr - Nlm_LnGammaInt(r1) - Nlm_LnGammaInt(r);
4710 ARG_EPS = epsilon;
4711
4712 do {
4713 d = Nlm_RombergIntegrate(g, args, s, t, epsilon, 0, itmin);
4714 #ifdef BLASTKAR_HUGE_VAL
4715 if (d == BLASTKAR_HUGE_VAL)
4716 return d;
4717 #endif
4718 } while (s < mean && d < 0.4 && itmin++ < 4);
4719
4720 return (d < 1. ? d : 1.);
4721 }
4722
4723 static Nlm_FloatHi LIBCALL
g(Nlm_FloatHi s,Nlm_VoidPtr vp)4724 g(Nlm_FloatHi s, Nlm_VoidPtr vp)
4725 {
4726 register Nlm_FloatHi PNTR args = vp;
4727 Nlm_FloatHi mx;
4728
4729 ARG_ADJ2 = ARG_ADJ1 - s;
4730 ARG_SDIVR = s / ARG_R; /* = s / r */
4731 mx = (s > 0. ? ARG_SDIVR + 3. : 3.);
4732 return Nlm_RombergIntegrate(f, vp, 0., mx, ARG_EPS, 0, 1);
4733 }
4734
4735 static Nlm_FloatHi LIBCALL
f(Nlm_FloatHi x,Nlm_VoidPtr vp)4736 f(Nlm_FloatHi x, Nlm_VoidPtr vp)
4737 {
4738 register Nlm_FloatHi PNTR args = vp;
4739 register Nlm_FloatHi y;
4740
4741 y = exp(x - ARG_SDIVR);
4742 #ifdef BLASTKAR_HUGE_VAL
4743 if (y == BLASTKAR_HUGE_VAL)
4744 return 0.;
4745 #endif
4746 if (ARG_R2 == 0.)
4747 return exp(ARG_ADJ2 - y);
4748 if (x == 0.)
4749 return 0.;
4750 return exp(ARG_R2*log(x) + ARG_ADJ2 - y);
4751 }
4752
4753 /*
4754 Calculates the e-value for alignments with "small" gaps (typically
4755 under fifty residues/basepairs) following ideas of Stephen Altschul's.
4756 */
4757
4758 Nlm_FloatHi LIBCALL
BlastSmallGapSumE(Int4 starting_points,Int2 num,Nlm_FloatHi xsum,Int4 query_length,Int4 subject_length,Int8 dblen_eff,Nlm_FloatHi weight_divisor)4759 BlastSmallGapSumE(
4760 Int4 starting_points, /* the number of starting points
4761 * permitted between adjacent
4762 * alignments;
4763 * max_overlap + max_gap + 1 */
4764 Int2 num, /* the number of distinct alignments in this
4765 * collection */
4766 Nlm_FloatHi xsum, /* the sum of the scores of these alignments
4767 * each weighted by an appropriate value of
4768 * Lambda and logK */
4769 Int4 query_length, /* the effective len of the query seq */
4770 Int4 subject_length, /* the effective len of the database seq */
4771 Int8 dblen_eff, /* the effective len of the database */
4772 Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4773 * when multiple collections of alignments
4774 * are being considered by the calling
4775 * routine */
4776 {
4777 Nlm_FloatHi search_space; /* The effective size of the search space */
4778 Nlm_FloatHi sum_e; /* The e-value of this set of alignments */
4779
4780 if(num == 1) {
4781 search_space = (Nlm_FloatHi) query_length * (Nlm_FloatHi)dblen_eff;
4782
4783 sum_e = search_space * exp(-xsum);
4784 } else {
4785 Nlm_FloatHi sum_p; /* The p-value of this set of alignments */
4786
4787 search_space = (Nlm_FloatHi)subject_length * (Nlm_FloatHi)query_length;
4788
4789 xsum -=
4790 log(search_space) + 2 * (num-1)*log((Nlm_FloatHi)starting_points);
4791 xsum -= LnFactorial((Nlm_FloatHi) num);
4792
4793 sum_p = BlastSumP(num, xsum);
4794 sum_e = BlastKarlinPtoE(sum_p) *
4795 ((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4796 }
4797 if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4798 sum_e = INT4_MAX;
4799 }
4800
4801 return sum_e;
4802 }
4803
4804
4805 /*
4806 Calculates the e-value of a collection multiple distinct
4807 alignments with asymmetric gaps between the alignments. The gaps
4808 in one (protein) sequence are typically small (like in
4809 BlastSmallGapSumE) gap an the gaps in the other (translated DNA)
4810 sequence are possibly large (up to 4000 bp.) This routine is used
4811 for linking HSPs representing exons in the DNA sequence that are
4812 separated by introns.
4813 */
4814
4815 Nlm_FloatHi LIBCALL
BlastUnevenGapSumE(Int4 query_start_points,Int4 subject_start_points,Int2 num,Nlm_FloatHi xsum,Int4 query_length,Int4 subject_length,Int8 dblen_eff,Nlm_FloatHi weight_divisor)4816 BlastUnevenGapSumE(
4817 Int4 query_start_points, /* the number of starting points in
4818 * the query sequence permitted
4819 * between adjacent alignments;
4820 * max_overlap + max_gap + 1 */
4821 Int4 subject_start_points, /* the number of starting points in
4822 * the subject sequence permitted
4823 * between adjacent alignments */
4824 Int2 num, /* the number of distinct alignments in this
4825 * collection */
4826 Nlm_FloatHi xsum, /* the sum of the scores of these alignments
4827 * each weighted by an appropriate value of
4828 * Lambda and logK */
4829 Int4 query_length, /* the effective len of the query seq */
4830 Int4 subject_length, /* the effective len of the database seq */
4831 Int8 dblen_eff, /* the effective len of the database */
4832 Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4833 * when multiple collections of alignments
4834 * are being considered by the calling
4835 * routine */
4836 {
4837 Nlm_FloatHi search_space; /* The effective size of the search space */
4838 Nlm_FloatHi sum_e; /* The e-value of this set of alignments */
4839
4840 if( num == 1 ) {
4841 search_space = (Nlm_FloatHi)query_length * (Nlm_FloatHi)dblen_eff;
4842
4843 sum_e = search_space * exp(-xsum);
4844 } else {
4845 Nlm_FloatHi sum_p; /* The p-value of this set of alignments */
4846
4847 search_space = (Nlm_FloatHi)subject_length * (Nlm_FloatHi)query_length;
4848
4849 xsum -= log(search_space) +
4850 (num-1)*(log((Nlm_FloatHi) query_start_points) +
4851 log((Nlm_FloatHi) subject_start_points));
4852 xsum -= LnFactorial((Nlm_FloatHi) num);
4853
4854 sum_p = BlastSumP(num, xsum);
4855
4856 sum_e = BlastKarlinPtoE(sum_p) *
4857 ((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4858 }
4859 if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4860 sum_e = INT4_MAX;
4861 }
4862
4863 return sum_e;
4864 }
4865
4866 /*
4867 Calculates the e-value if a collection of distinct alignments with
4868 arbitrarily large gaps between the alignments
4869 */
4870
4871 Nlm_FloatHi LIBCALL
BlastLargeGapSumE(Int2 num,Nlm_FloatHi xsum,Int4 query_length,Int4 subject_length,Int8 dblen_eff,Nlm_FloatHi weight_divisor)4872 BlastLargeGapSumE(
4873 Int2 num, /* the number of distinct alignments in this
4874 * collection */
4875 Nlm_FloatHi xsum, /* the sum of the scores of these alignments
4876 * each weighted by an appropriate value of
4877 * Lambda and logK */
4878 Int4 query_length, /* the effective len of the query seq */
4879 Int4 subject_length, /* the effective len of the database seq */
4880 Int8 dblen_eff, /* the effective len of the database */
4881 Nlm_FloatHi weight_divisor) /* a divisor used to weight the e-value
4882 * when multiple collections of alignments
4883 * are being considered by the calling
4884 * routine */
4885 {
4886 Nlm_FloatHi sum_p; /* The p-value of this set of alignments */
4887 Nlm_FloatHi sum_e; /* The e-value of this set of alignments */
4888
4889 /* The next two variables are for compatability with Warren's code. */
4890 Nlm_FloatHi lcl_subject_length; /* query_length as a float */
4891 Nlm_FloatHi lcl_query_length; /* subject_length as a float */
4892
4893 lcl_query_length = (Nlm_FloatHi) query_length;
4894 lcl_subject_length = (Nlm_FloatHi) subject_length;
4895
4896 if( num == 1 ) {
4897 sum_e = lcl_query_length * (Nlm_FloatHi) dblen_eff * exp(-xsum);
4898 } else {
4899 xsum -= num*log(lcl_subject_length*lcl_query_length)
4900 - LnFactorial((Nlm_FloatHi) num);
4901
4902 sum_p = BlastSumP(num, xsum);
4903
4904 sum_e = BlastKarlinPtoE(sum_p) *
4905 ((Nlm_FloatHi) dblen_eff / (Nlm_FloatHi) subject_length);
4906 }
4907 if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4908 sum_e = INT4_MAX;
4909 }
4910
4911 return sum_e;
4912 }
4913
4914
4915 /********************************************************************
4916 *
4917 * The following function, from Stephen Altschul, calculates a
4918 * pseudoscore from a p-vlaue and n, the product of the database
4919 * and query lengths.
4920 * double p; p-value
4921 * double n; search space
4922 *********************************************************************/
4923 /* Move the following constant into blast.h??, or only the last one. */
4924 #define PSCALE 20.0
4925 #define PSEUDO_SCORE_MAX 32767
4926 #define SYBASE_MIN 1.0e-300
4927
4928 Int2
ConvertPtoPseudoS(Nlm_FloatHi p,Nlm_FloatHi n)4929 ConvertPtoPseudoS(Nlm_FloatHi p, Nlm_FloatHi n)
4930 {
4931 Int2 s;
4932 Nlm_FloatHi E;
4933
4934 /* If p is 1.0, then E is very large and E/n is about one. */
4935 if (p > 0.99)
4936 return 0.5;
4937 /* If p is very small, the highest score should be returned. */
4938 else if (p < SYBASE_MIN)
4939 return PSEUDO_SCORE_MAX;
4940
4941 /* E = -ln(1-p); the following calculates it. */
4942 E = -Nlm_Log1p(-p);
4943
4944 s= ConvertEtoPseudoS (E, n);
4945
4946 return s;
4947 }
4948
4949 /*******************************************************************
4950 *
4951 * This function calculates a pseudo-score from an E value.
4952 * The searchsp is the product of the query and database
4953 * lengths. As the E value is directly related to the search
4954 * space, this effectively scales the database size out of
4955 * the calculation of the pseudo-score.
4956 *******************************************************************/
4957 Int2
ConvertEtoPseudoS(Nlm_FloatHi E,Nlm_FloatHi searchsp)4958 ConvertEtoPseudoS(Nlm_FloatHi E, Nlm_FloatHi searchsp)
4959 {
4960 Int2 s;
4961
4962 /* If E is very small, the highest score should be returned. */
4963 if (E < SYBASE_MIN)
4964 return PSEUDO_SCORE_MAX;
4965
4966 if (E > searchsp)
4967 return 0;
4968
4969 /* 0.5 is added to make sure this is rounded up, not down. */
4970 s= 0.5-PSCALE*log(E/searchsp);
4971
4972 return s;
4973 }
4974
4975
4976
4977 /*
4978 Given a pseudoscore, a subroutine for calculating an E-value for a comparison
4979 of size n (the product of the sequence length) is:
4980 */
4981
4982 Nlm_FloatHi
ConvertPseudoStoE(Int2 s,Nlm_FloatHi n)4983 ConvertPseudoStoE(Int2 s, Nlm_FloatHi n)
4984 {
4985 return n*exp(-s/PSCALE);
4986 }
4987
4988 /****************************************************************************
4989 *
4990 * This function attaches a new ScorePtr to the "*old" one passed
4991 * in. If *old is NULL, then *old is set equal to the new ptr.
4992 * The new pointer (NOT the head of the chain) is returned.
4993 *
4994 * If the value is an int, then set prob to zero and pass the value in
4995 * as score; if the value is a Nlm_FloatHi, pass it in as prob.
4996 *
4997 * The type of score is stored in an Object-id.str
4998 ****************************************************************************/
4999
5000 ScorePtr
MakeBlastScore(ScorePtr PNTR old,CharPtr scoretype,Nlm_FloatHi prob,Int4 score)5001 MakeBlastScore (ScorePtr PNTR old, CharPtr scoretype, Nlm_FloatHi prob, Int4 score)
5002
5003 {
5004 ScorePtr scrp, scrp1;
5005
5006 scrp = ScoreNew();
5007
5008 if (score)
5009 {
5010 scrp->choice = 1;
5011 scrp->value.intvalue = score;
5012 }
5013 else
5014 {
5015 scrp->choice = 2;
5016 scrp->value.realvalue = (Nlm_FloatHi) prob;
5017 }
5018
5019 scrp->id = ObjectIdNew();
5020 scrp->id->str = StringSave(scoretype);
5021
5022
5023 if (*old == NULL)
5024 *old = scrp;
5025 else
5026 {
5027 for (scrp1=*old; scrp1->next; scrp1=scrp1->next)
5028 ;
5029 scrp1->next = scrp;
5030 }
5031
5032
5033 return scrp;
5034 }
5035
5036 #ifndef TX_MATRIX_SIZE
5037 #define TX_MATRIX_SIZE 128
5038 #endif
5039
BlastMatrixToTxMatrix(BLAST_MatrixPtr blast_matrix)5040 Int4Ptr PNTR LIBCALL BlastMatrixToTxMatrix(BLAST_MatrixPtr blast_matrix)
5041 {
5042 Uint1 i, j, index1, index2;
5043 Int4Ptr PNTR matrix = blast_matrix->original_matrix;
5044 Int4Ptr PNTR txmatrix;
5045 SeqMapTablePtr smtp;
5046 SeqCodeTablePtr sctp;
5047
5048 if (!blast_matrix->is_prot || matrix == NULL)
5049 return NULL;
5050
5051 sctp = SeqCodeTableFindObj(Seq_code_ncbistdaa);
5052 smtp = SeqMapTableFind(Seq_code_ncbieaa, Seq_code_ncbistdaa);
5053
5054 txmatrix = Malloc(TX_MATRIX_SIZE*sizeof(Int4Ptr));
5055
5056 for (i=0; i<TX_MATRIX_SIZE; i++) {
5057 txmatrix[i] = Malloc(TX_MATRIX_SIZE*sizeof(Int4));
5058 for (j=0; j<TX_MATRIX_SIZE; j++)
5059 txmatrix[i][j] = BLAST_SCORE_MIN;
5060 }
5061
5062 for (i=sctp->start_at; i < sctp->start_at + sctp->num; i++) {
5063 for (j=sctp->start_at; j < sctp->start_at + sctp->num; j++) {
5064 index1 = SeqMapTableConvert(smtp, i);
5065 index2 = SeqMapTableConvert(smtp, j);
5066 txmatrix[index1][index2] = matrix[i][j];
5067 }
5068 }
5069
5070 return txmatrix;
5071 }
5072
TxMatrixDestruct(Int4Ptr PNTR txmatrix)5073 Int4Ptr PNTR LIBCALL TxMatrixDestruct(Int4Ptr PNTR txmatrix)
5074 {
5075 Int2 i;
5076
5077 if (txmatrix == NULL)
5078 return NULL;
5079
5080 for (i=0; i<TX_MATRIX_SIZE; i++)
5081 MemFree(txmatrix[i]);
5082
5083 MemFree(txmatrix);
5084 return NULL;
5085 }
5086
5087
5088 /**
5089 * Computes the adjustment to the lengths of the query and database sequences
5090 * that is used to compensate for edge effects when computing evalues.
5091 *
5092 * The length adjustment is an integer-valued approximation to the fixed
5093 * point of the function
5094 *
5095 * f(ell) = beta +
5096 * (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
5097 *
5098 * where m is the query length n is the length of the database and N is the
5099 * number of sequences in the database. The values beta, alpha, lambda and
5100 * K are statistical, Karlin-Altschul parameters.
5101 *
5102 * The value of the length adjustment computed by this routine, A,
5103 * will always be an integer smaller than the fixed point of
5104 * f(ell). Usually, it will be the largest such integer. However, the
5105 * computed length adjustment, A, will also be so small that
5106 *
5107 * K * (m - A) * (n - N * A) > min(m,n).
5108 *
5109 * Moreover, an iterative method is used to compute A, and under
5110 * unusual circumstances the iterative method may not converge.
5111 *
5112 * @param K the statistical parameter K
5113 * @param logK the natural logarithm of K
5114 * @param alpha_d_lambda the ratio of the statistical parameters
5115 * alpha and lambda (for ungapped alignments, the
5116 * value 1/H should be used)
5117 * @param beta the statistical parameter beta (for ungapped
5118 * alignments, beta == 0)
5119 * @param query_length the length of the query sequence
5120 * @param db_length the length of the database
5121 * @param db_num_seq the number of sequences in the database
5122 * @param length_adjustment the computed value of the length adjustment [out]
5123 *
5124 * @return 0 if length_adjustment is known to be the largest integer less
5125 * than the fixed point of f(ell); 1 otherwise.
5126 */
5127 Int4
BlastComputeLengthAdjustment(Nlm_FloatHi K,Nlm_FloatHi logK,Nlm_FloatHi alpha_d_lambda,Nlm_FloatHi beta,Int4 query_length,Int8 db_length,Int4 db_num_seqs,Int4 * length_adjustment)5128 BlastComputeLengthAdjustment(Nlm_FloatHi K,
5129 Nlm_FloatHi logK,
5130 Nlm_FloatHi alpha_d_lambda,
5131 Nlm_FloatHi beta,
5132 Int4 query_length,
5133 Int8 db_length,
5134 Int4 db_num_seqs,
5135 Int4 * length_adjustment)
5136 {
5137 Int4 i; /* iteration index */
5138 const Int4 maxits = 20; /* maximum allowed iterations */
5139 Nlm_FloatHi m = query_length, n = db_length, N = db_num_seqs;
5140
5141 Nlm_FloatHi ell; /* A float value of the length adjustment */
5142 Nlm_FloatHi ss; /* effective size of the search space */
5143 Nlm_FloatHi ell_min = 0, ell_max; /* At each iteration i,
5144 * ell_min <= ell <= ell_max. */
5145 Boolean converged = FALSE; /* True if the iteration converged */
5146 Nlm_FloatHi ell_next = 0; /* Value the variable ell takes at iteration
5147 * i + 1 */
5148 /* Choose ell_max to be the largest nonnegative value that satisfies
5149 *
5150 * K * (m - ell) * (n - N * ell) > max(m,n)
5151 *
5152 * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
5153 { /* scope of a, mb, and c, the coefficients in the quadratic formula
5154 * (the variable mb is -b) */
5155 Nlm_FloatHi a = N;
5156 Nlm_FloatHi mb = m * N + n;
5157 Nlm_FloatHi c = n * m - MAX(m, n) / K;
5158
5159 if(c < 0) {
5160 *length_adjustment = 0;
5161 return 1;
5162 } else {
5163 ell_max = 2 * c / (mb + sqrt(mb * mb - 4 * a * c));
5164 }
5165 } /* end scope of a, mb and c */
5166
5167 for(i = 1; i <= maxits; i++) { /* for all iteration indices */
5168 Nlm_FloatHi ell_bar; /* proposed next value of ell */
5169 ell = ell_next;
5170 ss = (m - ell) * (n - N * ell);
5171 ell_bar = alpha_d_lambda * (logK + log(ss)) + beta;
5172 if(ell_bar >= ell) { /* ell is no bigger than the true fixed point */
5173 ell_min = ell;
5174 if(ell_bar - ell_min <= 1.0) {
5175 converged = TRUE;
5176 break;
5177 }
5178 if(ell_min == ell_max) { /* There are no more points to check */
5179 break;
5180 }
5181 } else { /* else ell is greater than the true fixed point */
5182 ell_max = ell;
5183 }
5184 if(ell_min <= ell_bar && ell_bar <= ell_max) {
5185 /* ell_bar is in range. Accept it */
5186 ell_next = ell_bar;
5187 } else { /* else ell_bar is not in range. Reject it */
5188 ell_next = (i == 1) ? ell_max : (ell_min + ell_max) / 2;
5189 }
5190 } /* end for all iteration indices */
5191 if(converged) { /* the iteration converged */
5192 /* If ell_fixed is the (unknown) true fixed point, then we
5193 * wish to set (*length_adjustment) to floor(ell_fixed). We
5194 * assume that floor(ell_min) = floor(ell_fixed) */
5195 *length_adjustment = (Int4) ell_min;
5196 /* But verify that ceil(ell_min) != floor(ell_fixed) */
5197 ell = ceil(ell_min);
5198 if( ell <= ell_max ) {
5199 ss = (m - ell) * (n - N * ell);
5200 if(alpha_d_lambda * (logK + log(ss)) + beta >= ell) {
5201 /* ceil(ell_min) == floor(ell_fixed) */
5202 *length_adjustment = (Int4) ell;
5203 }
5204 }
5205 } else { /* else the iteration did not converge. */
5206 /* Use the best value seen so far */
5207 *length_adjustment = (Int4) ell_min;
5208 }
5209
5210 return converged ? 0 : 1;
5211 }
5212