1 static char const rcsid[] = "$Id: kappa.c,v 6.94 2013/11/04 15:42:10 boratyng Exp $";
2
3 /* $Id: kappa.c,v 6.94 2013/11/04 15:42:10 boratyng Exp $
4 * ==========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================*/
27
28 /*****************************************************************************
29
30 File name: kappa.c
31
32 Authors: Alejandro Schaffer, Mike Gertz
33
34 Contents: Utilities for doing Smith-Waterman alignments and adjusting
35 the scoring system for each match in blastpgp
36
37 Functions in this file call functions in the composition adjustment
38 module to perform adjustment and recalculation of alignments. This
39 file is intended to be (thin) glue code that:
40
41 - adapts C toolkit data structures by the composition adjustment module;
42 - calls functions in the composition adjustment module;
43 - converts the output of the composition adjustment module into the
44 format needed by the C toolkit.
45
46 This file is very similar to the blast_kappa.c file, which is glue
47 code the performs the same function for the C++ toolkit. The sole
48 external symbol from this file should be RedoAlignmentCore. Static
49 functions in this file have a 'Kappa_' prefix. Please adhere to this
50 convention to avoid a name clash with functions in blast_kappa.c (the
51 name clash can matter in debuggers and search engines.)
52
53 $Revision: 6.94 $
54
55 Revision 6.75 2005/11/07 15:28:56 coulouri
56 From Mike Gertz:
57
58 Record the composition mode used for each aligment; formerly the
59 composition mode was recorded for each subject sequence. For blastp,
60 the composition is only done once per subject sequence so the former
61 behavior was correct, but it was incorrect for tblastn.
62
63 Fixed a bug when the query frame is checked in an intermediate
64 containment test. For tblastn, this effectively disabled the test.
65 This bug can effect tblastn alignments when SmithWaterman is not used
66 in the rare occasion that omitting the intermediate containment test
67 causes a different alignment to be found.
68
69 Added a sort key to location_compare windows to sort first by query
70 index. For tblastn alignments when query concatination is used, not
71 having this sort key may on rare occassion cause a different alignment
72 to be reported.
73
74 Cast a few Nlm_FloatHi values to Int4 to suppress compiler warnings.
75
76 Revision 6.74 2005/09/29 17:40:18 coulouri
77 from mike gertz:
78 - Many changes to support query concatenation; including changing
79 the API of RedoAlignementCore.
80 - In RedoAlignmentCore, only call BlastLinkHsps if
81 search->pbp->do_sum_stats is true.
82 - In RedoAlignmentCore, in the non-Smith-Waterman case, compute a
83 separate composition for each HSP, even if multiple HSPs are in
84 the same window.
85
86 Revision 6.73 2005/09/13 16:23:01 coulouri
87 correct include paths for windows
88
89 Revision 6.72 2005/09/13 14:55:06 coulouri
90 From Mike Gertz: use libblastcompadj
91
92 Revision 6.71 2005/09/08 19:48:36 coulouri
93 From Mike Gertz:
94 Changed the behavior of a SWheap so that space for only a small number
95 (100) of hits is allocated initially, but the SWheap will grow to hold
96 as many hits as are needed.
97
98 Revision 6.70 2005/09/08 14:01:49 coulouri
99 From Mike Gertz:
100 - Fixed a bug in which the extent of the subject match was being
101 calculated incorrectly setting forbidden ranges for a
102 Smith-Waterman alignment.
103
104 Revision 6.69 2005/08/31 20:33:25 coulouri
105 New code to use the value of options->kappa_expect_value as the cutoff
106 e-value within RedoAlignmentCore. The code saves and restores the
107 value of search->pbp->cutoff_e.
108
109 Revision 6.68 2005/08/30 18:20:46 coulouri
110 From Mike Gertz:
111 Changes to routines used in mode > 1 of compositional adjustment of
112 scoring matrices:
113 - Removed REscaleInitialScores as it is now redundant; rescaling
114 is now done as part of the call s_ScatterScores.
115 - Refactored Kappa_AdjustBXZ into the routines
116 Kappa_SetNonstandardAaScores and Kappa_AverageScores.
117 - Kappa_SetNonstandardAaScores sets scores for U, * and -,
118 unlike its predecessor Kappa_AdjustBXZ.
119 - Fixed a bug in the formula used to compute the score of (B,B)
120 matches.
121 - In s_RoundScoreMatrix, check whether the unrounded value is <
122 BLAST_SCORE_MIN and if so set the rounded value to
123 BLAST_SCORE_MIN; do not read the incoming value of the "matrix"
124 parameter.
125
126 Changes to mode 1 (traditional) composition-based statistics:
127 - scaleMatrix renamed s_ScaleMatrx with slightly different
128 parameter list.
129 - s_ScaleMatrix does not scale substitution scores for the
130 nonstandard X, U, - and * characters (but does scale B and Z).
131 The previous version scaled scores for X and *.
132
133 Made minor changes to Kappa_CompositionBasedStats and
134 Kappa_AdjustSearch to call the new routines described above.
135
136 Revision 6.67 2005/08/05 12:04:53 coulouri
137 From Mike Gertz:
138 - Move setting gap_align->translate2 to Kappa_RecordInitialSearch;
139 for tblastn and some option settings it would not otherwise get set.
140 - Remove a now redundant setting of gap_align->translate2 in the
141 HitToGapAlign routine.
142 - Fixes to comments.
143
144 Revision 6.66 2005/08/02 14:40:29 coulouri
145 From Mike Gertz:
146 - Fixes to comments
147 - Added enum constant eGapChar; renamed eStarChar to eStopChar.
148 - Change the integer type of some variables to suppress warnings about
149 assigning a wider type to a narrower type.
150 - Made the routines BLbasicSmithWatermanScoreOnly and BLSmithWatermanFindStart
151 static.
152 - Renamed s_ScatterFreqRatios -> s_ScatterScores; renamed parameters.
153 - In NewAlignmentUsingXdrop, use the translate2 field from gap_align
154 to set the same field in the edit block.
155 - Changed the Kappa_WindowInfo datatype to hold a list of HSPs in the
156 window; added logic in several places, notably WindowsFromHSPs, to
157 generate and maintain these lists.
158 - Refactored Kappa_AdjustSearch. Use a more sophisticated rule,
159 implemented in the new Kappa_GetSubjectComposition routine, to
160 determine the subject sequence composition for tblastn.
161 - Removed unused parameters from several routines.
162 - In RedoAlignmentCore, delete NRrecord to avoid a memory leak.
163
164 Revision 6.65 2005/07/28 14:34:06 coulouri
165 From Mike Gertz:
166 - Made minor fixes to whitespace and formatting.
167 - Fixed the comment to SWheap to accurately reflect recent changes in
168 the way SWheapRecordCompare is used.
169 - Changed some #define'd constants to enums or "static const"
170 variables. Renamed these constants appropriately.
171 - Made the array alphaConvert and the routine scalePosMatrix static.
172 - Made sure that gap_align->translate2 is set for tblastn.
173 - Fixed a bug Kappa_SequenceGetTranslatedWindow: the wrong formula had
174 been used to compute num_nucleotides.
175
176 Revision 6.64 2005/07/26 13:07:20 coulouri
177 - Changed #include "NRdefs.h" to #include <NRdefs.h> to be consistent
178 with toolbox conventions.
179
180 - Removed the unused kScoreMatrixRange constant
181
182 - Extended comments for some routines. Fixed spelling and other
183 errors in comments.
184
185 - Fixed white space and formatting in several locations.
186
187 - Renamed functions:
188 o permuteLetterProbs -> s_GatherLetterProbs
189 o permuteFreqRatios -> s_ScatterFreqRatios
190 o adjustBXZ -> Kappa_AdjustBXZ
191 o roundScoreMatrix -> s_RoundScoreMatrix
192
193 - Eliminated the use of a temporary variable in REscaleInitialScores.
194
195 - In sScatterFreqRatios, replaced the NRrecord parameter by a pointer
196 to the frequency ratio matrix contained in the record.
197
198 Revision 6.63 2005/07/14 20:19:45 coulouri
199 - In Kappa_SequenceGetWindow, change all selenocysteine (U)
200 residues in the subject sequence to X.
201 - In scaleMatrix, do not compute the log of frequency ratios that are
202 zero, set the matrix element to BLAST_SCORE_MIN instead.
203 - Removed the startMatrix parameter to scaleMatrix, as it is no
204 longer used.
205
206 Revision 6.61 2005/06/08 19:31:31 papadopo
207 From Michael Gertz:
208 1. The use of the score for comparing collections of alignments
209 was removed in March 2004; revert to the previous rule
210 2. Added a new field "bestScore" to the SWheapRecord structure
211 3. Various additional changes to support the use of score as a
212 key in a SWheapRecord
213 4. The comparison function is now used to determine whether new HSPs
214 may be added to the heap. Previously, the evalue only was used
215 5. Removed a complex test that sometimes terminated computation of
216 Smith-Waterman alignments for a given subject sequence. It is more
217 consistent with all other modes of operation to omit this test
218 6. Removed the "capacity" parameter to SWheapInitialize and used the
219 heapThreshold parameter to set the capacity for the heap
220
221 Revision 6.60 2005/05/18 21:27:33 papadopo
222 make fillResidueProbability unconditional in Kappa_AdjustSearch
223
224 Revision 6.59 2005/05/16 18:10:13 kans
225 include Mode_condition.h to get prototype for chooseMode
226
227 Revision 6.58 2005/05/16 17:46:48 papadopo
228 From Mike Gertz/Alejandro Schaffer: Added support for compositional
229 adjustment of score matrices, both conditionally and unconditionally.
230 The adjustParameters argument to RedoAlignmentCore is no longer Boolean
231 because there are now 4 allowed options:
232 0 no adjustment
233 1 composition-based statistics
234 2 conditional compositional adjustment
235 3 unconditional compositional adjustment
236
237 Revision 6.57 2005/04/13 17:18:02 coulouri
238 changes to WindowsFromHSPs for tblastn composition-based statistics
239
240 Revision 6.56 2005/01/24 15:52:54 camacho
241 doxygen fixes from Mike Gertz
242
243 Revision 6.55 2005/01/20 16:28:59 camacho
244 doxygen fixes
245
246 Revision 6.54 2005/01/18 14:54:13 camacho
247 Change in tie-breakers for score comparison, suggestion by Mike Gertz
248
249 Revision 6.53 2004/11/24 15:42:33 coulouri
250 do not dereference null pointer
251
252 Revision 6.52 2004/11/23 21:29:02 camacho
253 Changes from Mike Gertz:
254 - If no alignments are found, I no longer create an empty HSP list and
255 process it; I just move on to the next match.
256 - For tblastn, culling by containment now occurs before link_hsps as
257 it should. This can not have any effect on the current output, as
258 tblastn + RedoAlignmentCore is not enabled.
259 - I changed some Nlm_Mallocs, etc. to MemNew.
260 - Other cosmetic changes, doxygen friendly comments
261
262 Revision 6.51 2004/11/01 20:43:48 camacho
263 Added error handling for missing PSSM
264
265 Revision 6.50 2004/10/27 21:00:02 camacho
266 Change the order of elements in Score* returned by GetScoreSetFromBlastHsp to
267 be consistent with score ordering in other contexts of BLAST.
268
269 Revision 6.49 2004/09/09 13:47:47 papadopo
270 from Michael Gertz: remove unnecessary check for the presence of Selenocysteine in translated sequences
271
272 Revision 6.48 2004/08/23 19:37:38 papadopo
273 From Michael Gertz: fix memory leak in getStartFreqRatios
274
275 Revision 6.47 2004/08/23 17:12:21 papadopo
276 From Michael Gertz: Backported some changes to
277 getStartFreqRatios and computeScaledStandardMatrix
278 from algo/blast/core/blast_kappa.c code. Also a few
279 straightforward changes to getStartFreqRatios,
280 because the code has diverged slightly
281
282 Revision 6.46 2004/07/28 17:19:54 kans
283 HeapSort callbacks are int LIBCALLBACK for compilation on the PC
284
285 Revision 6.45 2004/07/27 19:59:00 papadopo
286 Changes by Michael Gertz to allow RedoAlignmentCore to be used
287 for tblastn searches. The current version of the code uses two heuristics:
288 1) a heuristic that skips doing re-alignment for an HSP if the old
289 alignment of the HSP is contained in a higher-scoring HSP that
290 has already been re-aligned.
291 2) a heuristic that attempts to ensure that all alignments in the list
292 of redone alignments has distinct ends.
293 The original code also used these heuristics, but the code that
294 implements the heuristics has been rewritten. As a result,
295 RedoAlignmentCore will occasionally report better-scoring alignments.
296 This is still a heuristic; which HSPs are reported still depends on
297 the order in which they are re-aligned.
298
299 Revision 6.44 2004/06/23 14:53:58 camacho
300 Use renamed FreqRatios structure from posit.h
301
302 Revision 6.43 2004/06/22 14:16:56 camacho
303 Use SFreqRatios structure from algo/blast/core to obtain underlying matrices' frequency ratios.
304
305 Revision 6.42 2004/06/18 15:50:25 madden
306 For secondary alignments with Smith-Waterman on, use the E-value from the X-drop alignment computed by ALIGN that has more flexibility in which positions are allowed in the dynamic program.
307
308 Add a sort of alignments for the same query-subject pair because the use of X-drop alignments occasionally reorders such alignments.
309
310 Changes from Mike Gertz, submitted by Alejandro Schaffer.
311
312 Revision 6.41 2004/06/14 21:11:05 papadopo
313 From Michael Gertz:
314 - Added several casts where casts occur in blast_kappa.c. These casts
315 should have no real effect; the log of blast_kappa.c indicates that
316 they suppress compiler warnings.
317 - Changed the type of one variable that holds a score from
318 Nlm_FloatHi to Int4.
319 - moved the definition Kappa_ForbiddenRanges and relevant
320 routines earlier in the file.
321 - fixed some comments.
322 - made a few (~5) changes in whitespace.
323
324 Revision 6.40 2004/06/03 16:10:50 dondosha
325 Fix in Kappa_SearchParametersNew: allocate correct number of rows for matrices
326
327 Revision 6.39 2004/03/31 18:12:13 papadopo
328 Mike Gertz' refactoring of RedoAlignmentCore
329
330 Revision 6.38 2004/02/06 19:25:16 camacho
331 Alejandro Schaffer's corrections to RedoAlignmentCore pointed out by
332 Mike Gertz:
333 1. Corrected the rule for assigning newSW->isfirstAlignment
334 2. Changed upper bound on assignment to forbiddenRanges
335 3. Assigned a value to newScore earlier
336 4. Eliminated use of skipThis
337 5. Restored value of search->gapAlign at the end
338
339 Revision 6.37 2004/01/27 20:31:52 madden
340 remove extra setting of kbp_gap
341
342 Revision 6.36 2004/01/06 17:48:44 dondosha
343 Do not free Karlin block in RedoAlignmentCore, because its pointer is passed outside
344
345 Revision 6.35 2003/12/01 19:15:27 madden
346 Add one byte to filteredMatchingSequence to prevent ABR/ABW
347
348 Revision 6.34 2003/10/22 20:37:19 madden
349 Set kbp to rescaled values, use upper-case for SCALING_FACTOR define
350
351 Revision 6.33 2003/10/02 19:59:34 kans
352 BlastGetGapAlgnTbck needed FALSE instead of NULL in two parameters - Mac compiler complaint
353
354 Revision 6.32 2003/10/02 19:31:24 madden
355 In RedoAlignmentCore, call procedure BlastGetGapAlgnTbck instead of ALIGN
356 to redo alignments; this allows the endpoints of the alignment to change.
357 Because BlastGetGapAlgnTbck returns a list of SeqAlign's while ALIGN
358 passes back a single-alignment, the post-processing of the redone
359 alignments is changed including the addition of the procedure
360 concatenateListOfSeqaligns.
361
362 Revision 6.30 2003/05/30 17:25:36 coulouri
363 add rcsid
364
365 Revision 6.29 2003/05/13 16:02:53 coulouri
366 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
367
368 Revision 6.28 2002/12/19 14:40:35 kans
369 changed C++-style comment to C-style
370
371 Revision 6.27 2002/12/10 22:58:42 bealer
372 Keep mappings to sequences from readdb so that "num_ident" code does not
373 segfault with multiple databases.
374
375 Revision 6.26 2002/11/06 20:31:14 dondosha
376 Added recalculation of the number of identities when computing seqalign from SWResults
377
378 Revision 6.25 2002/10/16 13:33:58 madden
379 Corrected initialization of initialUngappedLambda in RedoAlignmentCore
380
381 Revision 6.24 2002/09/03 13:55:14 kans
382 changed NULL to 0 for Mac compiler
383
384 Revision 6.23 2002/08/29 15:47:49 camacho
385 Changed RedoAlignmentCore to work without readdb facility
386
387 Revision 6.22 2002/08/20 15:43:08 camacho
388 Fixed memory leak in getStartFreqRatios
389
390 Revision 6.21 2002/05/23 20:14:21 madden
391 Correction for last checkin to cover SmithWaterman type alignment
392
393 Revision 6.20 2001/12/28 18:02:33 dondosha
394 Keep score and scoreThisAlign for each local alignment, so as to allow tie-breaking by score
395
396 Revision 6.19 2001/07/26 12:52:25 madden
397 Fix memory leaks
398
399 Revision 6.18 2001/07/09 15:12:47 shavirin
400 Functions BLbasicSmithWatermanScoreOnly() and BLSmithWatermanFindStart()
401 used to calculate Smith-waterman alignments on low level become external.
402
403 Revision 6.17 2001/05/25 19:40:46 vakatov
404 Nested comment typo fixed
405
406 Revision 6.16 2001/04/13 20:47:36 madden
407 Eliminated use of PRO_K_MULTIPLIER in adjusting E-values Added allocateStartFreqs and freeStartFreqs and getStartFreqRatios to enable the score matrix scaling to work entirely with frequency ratios and round to integers only at the very end of the scaling calculation.
408
409 Revision 6.15 2001/03/20 15:07:34 madden
410 Fix from AS for (near) exact matches
411
412 Revision 6.14 2001/01/04 13:48:44 madden
413 Correction for 3-parameter gapping
414
415 Revision 6.13 2000/12/05 19:31:33 madden
416 Avoid duplicate insertion into heap when ((doThis == FALSE) && (currentState = SWPurging))
417
418 Revision 6.12 2000/10/16 21:08:05 madden
419 segResult takes BioseqPtr as argument, produced from readdb_get_bioseq
420
421 Revision 6.11 2000/10/13 19:32:58 madden
422 Fix for bug that caused crash
423
424 Revision 6.10 2000/10/10 21:46:03 shavirin
425 Added support for BLOSUM50, BLOSUM90, PAM250 with -t T
426
427 Revision 6.9 2000/10/10 19:45:51 shavirin
428 Checked for NULL hsp_array in the function RedoAlignmentCore().
429
430 Revision 6.8 2000/08/18 21:28:24 madden
431 support for BLOSUM62_20A and BLOSUM62_20B, prevent LambdaRatio from getting above 1.0
432
433 Revision 6.7 2000/08/09 21:10:15 shavirin
434 Added paramter discontinuous to the function newConvertSWalignsToSeqAligns()
435
436 Revision 6.6 2000/08/08 21:45:04 shavirin
437 Removed initialization of decline_aline parameter to INT2_MAX.
438
439 Revision 6.5 2000/08/03 22:20:10 shavirin
440 Added creation of the default posFreqs array if it is empty in
441 RedoAlignmentCore(). Fixed some memory leaks.
442
443 Revision 6.4 2000/08/02 18:00:34 shavirin
444 Fixed memory leak in RedoAlignmentCore. Fixed a problem specific
445 to BLOSUM45 and BLOSUM80 in the procedure scalePosMatrix().
446
447 Revision 6.3 2000/07/26 19:34:13 kans
448 removed unix-only headers
449
450 Revision 6.2 2000/07/26 17:06:31 lewisg
451 add LIBCALLs
452
453 Revision 6.1 2000/07/25 17:40:05 shavirin
454 Initial revision.
455
456
457 ******************************************************************************/
458
459 #include<assert.h>
460 #include<string.h>
461
462 #include <ncbi.h>
463 #include <blast.h>
464 #include <objseq.h>
465 #include <objsset.h>
466 #include <sequtil.h>
467 #include <seqport.h>
468 #include <tofasta.h>
469 #include <blastpri.h>
470 #include <txalign.h>
471 #include <simutil.h>
472 #include <posit.h>
473 #include <gapxdrop.h>
474 #include <fcntl.h>
475 #include <profiles.h>
476
477 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
478 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
479 #include <algo/blast/composition_adjustment/composition_adjustment.h>
480 #include <algo/blast/composition_adjustment/compo_heap.h>
481 #include <algo/blast/composition_adjustment/smith_waterman.h>
482 #include <algo/blast/composition_adjustment/redo_alignment.h>
483 #include <algo/blast/composition_adjustment/unified_pvalues.h>
484
485
486 /** Define KAPPA_PRINT_DIAGNOSTICS to be true to turn on printing of
487 * diagnostic information from some routines.
488 *
489 * This macro is usually used as part of a C-conditional
490 * @code
491 * if (KAPPA_PRINT_DIAGNOSTICS) {
492 * perform expensive tests
493 * }
494 * @endcode
495 *
496 * The C compiler will then validate the code that prints the
497 * diagnostics, but will almost always strip the code if
498 * KAPPA_PRINT_DIAGNOSTICS is false.
499 */
500 #ifndef KAPPA_PRINT_DIAGNOSTICS
501 #define KAPPA_PRINT_DIAGNOSTICS 0
502 #endif
503
504 /**
505 * Create a score set from the data in an HSP.
506 *
507 * @param hsp the HSP of interest [in]
508 * @param logK Karlin-Altschul statistical parameter [in]
509 * @param lambda Karlin-Altschul statistical parameter [in]
510 * @param scoreDivisor the value by which reported scores are to be
511 * divided [in]
512 */
513 static ScorePtr
Kappa_GetScoreSetFromBlastHsp(const BLAST_HSPPtr hsp,Nlm_FloatHi lambda,Nlm_FloatHi logK,Nlm_FloatHi scoreDivisor)514 Kappa_GetScoreSetFromBlastHsp(
515 const BLAST_HSPPtr hsp,
516 Nlm_FloatHi lambda,
517 Nlm_FloatHi logK,
518 Nlm_FloatHi scoreDivisor)
519 {
520 ScorePtr score_set = NULL; /* the new score set */
521 int score; /* the score, scaled using scoreDivisor */
522 Nlm_FloatHi bit_score; /* the integer-valued score, in bits */
523 Nlm_FloatHi evalue; /* the e-value, with numbers too close to zero
524 set to zero */
525
526 score = Nlm_Nint(((Nlm_FloatHi) hsp->score) / scoreDivisor);
527 if (score > 0) {
528 MakeBlastScore(&score_set, "score", 0.0, score);
529 }
530
531 evalue = (hsp->evalue >= 1.0e-180) ? hsp->evalue : 0;
532 if (hsp->num > 1) {
533 MakeBlastScore(&score_set, "sum_n", 0.0, hsp->num);
534 MakeBlastScore(&score_set, "sum_e", evalue, 0);
535 if( hsp->ordering_method == BLAST_SMALL_GAPS) {
536 MakeBlastScore(&score_set, "small_gap", 0.0, 1);
537 }
538 } else {
539 MakeBlastScore(&score_set, "e_value", evalue, 0);
540 }
541
542 /* Compute the bit score using the newly computed scaled score. */
543 bit_score = (score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
544 if (bit_score >= 0.) {
545 MakeBlastScore(&score_set, "bit_score", bit_score, 0);
546 }
547
548 if (hsp->ordering_method > 3) {
549 /* In new tblastn this means splice junction was found */
550 MakeBlastScore(&score_set, "splice_junction", 0.0, 1);
551 }
552
553 if (hsp->num_ident > 0) {
554 MakeBlastScore(&score_set, "num_ident", 0.0, hsp->num_ident);
555 }
556
557 MakeBlastScore(&score_set, "comp_adjustment_method",0.0,
558 hsp->comp_adjustment_method);
559 return score_set;
560 }
561
562
563 /**
564 * Converts a collection of alignments represented by a BLAST_HitList
565 * into a collection of instances of SeqAlign. Returns the new
566 * collection.
567 *
568 * @param hitlist the hitlist to convert to SeqAligns [in]
569 * @param subject_id the identifier of the subject sequence [in]
570 * @param query_id the identifier of the query sequence [in]
571 * @param queryOrigin origin of this specific query in the
572 * concatenated query [in]
573 * @param queryLength length of this specific query [in]
574 * @param lambda Karlin-Altschul statistical parameter [in]
575 * @param logK Karlin-Altschul statistical parameter [in]
576 * @param scoreDivisor value by which reported scores are to be
577 * divided [in]
578 * @param bestEvalue) the best (smallest) e-value of all the alignments
579 * in the hitlist [out]
580 */
581 static SeqAlignPtr
Kappa_SeqAlignsFromHitlist(BLAST_HitListPtr hitlist,SeqIdPtr subject_id,SeqIdPtr query_id,int queryOrigin,int queryLength,Nlm_FloatHi lambda,Nlm_FloatHi logK,Nlm_FloatHi scoreDivisor)582 Kappa_SeqAlignsFromHitlist(
583 BLAST_HitListPtr hitlist,
584 SeqIdPtr subject_id,
585 SeqIdPtr query_id,
586 int queryOrigin,
587 int queryLength,
588 Nlm_FloatHi lambda,
589 Nlm_FloatHi logK,
590 Nlm_FloatHi scoreDivisor)
591 {
592 SeqAlignPtr aligns = NULL; /* list of SeqAligns to be returned */
593 int hsp_index;
594
595 for( hsp_index = hitlist->hspcnt - 1; hsp_index >= 0; hsp_index-- ) {
596 /* iterate in reverse order over all HSPs in the hitlist */
597 BLAST_HSPPtr hsp; /* HSP for this iteration */
598 SeqAlignPtr seqAlign; /* the new SeqAlign */
599
600 hsp = hitlist->hsp_array[hsp_index];
601
602 /* Shift the edit block to query local coordinates */
603 hsp->gap_info->start1 -= queryOrigin;
604 hsp->gap_info->length1 -= queryOrigin;
605 hsp->gap_info->original_length1 = queryLength;
606
607 seqAlign = GapXEditBlockToSeqAlign(hsp->gap_info, subject_id, query_id);
608 seqAlign->score =
609 Kappa_GetScoreSetFromBlastHsp(hsp, lambda, logK, scoreDivisor);
610
611 seqAlign->next = aligns;
612 aligns = seqAlign;
613 }
614 return aligns;
615 }
616
617
618 /**
619 * Adjusts the E-values in a BLAST_HitList to be composites of
620 * a composition-based P-value and a score/alignment-based P-value
621 *
622 * @param hitlist the hitlist whose E-values need to be adjusted
623 * @param comp_p_value P-value from sequence composition
624 * @param search is the structure with all the information about
625 * the search
626 * @param LambdaRatio the ratio between the observed value of Lambda
627 * and the predicted value of lambda (used to print
628 * diagnostics)
629 * @param subject_id the subject id of this sequence (used to print
630 * diagnostics)
631 **/
632 static void
Kappa_AdjustEvaluesForComposition(BLAST_HitListPtr hitlist,Nlm_FloatHi comp_p_value,BlastSearchBlkPtr search,Nlm_FloatHi LambdaRatio,int subject_id)633 Kappa_AdjustEvaluesForComposition(
634 BLAST_HitListPtr hitlist,
635 Nlm_FloatHi comp_p_value,
636 BlastSearchBlkPtr search,
637 Nlm_FloatHi LambdaRatio,
638 int subject_id)
639 {
640 int hsp_index;
641
642 for (hsp_index = 0; hsp_index < hitlist->hspcnt; hsp_index++) {
643 /* for all HSPs */
644 BLAST_HSPPtr hsp; /* HSP for this iteration */
645 double align_p_value; /* P-value for the alignment score */
646 double combined_p_value; /* combination of two P-values */
647 double old_e_value; /* alignment E-value */
648 double db_to_sequence_scale; /* scale factor for converting a database
649 e-value to a sequence e-value */
650
651 hsp = hitlist->hsp_array[hsp_index];
652 old_e_value = hsp->evalue;
653 /* Convert the database E-value to the sequence E-value */
654 db_to_sequence_scale =
655 (MAX((search->subject->length - search->length_adjustment), 1.0))/
656 search->dblen_eff;
657 hsp->evalue *= db_to_sequence_scale;
658
659 align_p_value = BlastKarlinEtoP(hsp->evalue);
660 combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
661 hsp->evalue = BlastKarlinPtoE(combined_p_value);
662 hsp->evalue /= db_to_sequence_scale;
663
664 if (KAPPA_PRINT_DIAGNOSTICS) {
665 int sequence_gi; /*GI of a sequence*/
666 int context_index; /* query context index of this HSP */
667 context_index = hsp->context;
668
669 if (search->rdfp) {
670 SeqIdPtr sip; /*used to extract sequence from database*/
671 char *sname = NULL; /*string for defline*/
672 char *buff = NULL; /*string for defline*/
673 char *string_descriptor = NULL; /*string for defline*/
674
675 sname = (char *) malloc(SeqIdBufferSize * sizeof(Char));
676 buff = (char *) malloc(SeqIdBufferSize * sizeof(Char));
677 string_descriptor =
678 (char *) malloc(SeqIdBufferSize * sizeof(Char));
679 sip = NULL;
680 readdb_get_defline(search->rdfp,subject_id,&sname);
681 readdb_get_descriptor(search->rdfp, subject_id, &sip,
682 &string_descriptor);
683 if (sip)
684 SeqIdWrite(sip, buff, PRINTID_FASTA_LONG, sizeof(buff));
685 (void) GetAccessionFromSeqId(SeqIdFindBest(sip,SEQID_GI),
686 &sequence_gi, &sname);
687 sip = SeqIdSetFree(sip);
688 free(string_descriptor);
689 free(buff);
690 free(sname);
691 } else {
692 sequence_gi = (-1);
693 }
694 printf("GI %d Lambda ratio %e comp. p-value %e; "
695 "adjust E-value of query length %d match length "
696 "%d from %e to %e\n",
697 sequence_gi, LambdaRatio, comp_p_value,
698 search->context[context_index].query->length,
699 search->subject->length, old_e_value, hsp->evalue);
700 } /* end if (KAPPA_PRINT_DIAGNOSTICS) */
701 } /* end for all HSPs */
702 }
703
704 /**
705 * Converts a list of objects of type BlastCompo_Alignment to an
706 * new object of type BLAST_HitList and returns the result. Conversion
707 * in this direction is lossless. The list passed to this routine is
708 * freed to ensure that there is no aliasing of fields between the
709 * list of BlastCompo_Alignments and the new hitlist.
710 *
711 * @param search general search information
712 * @param alignments a list of distinct alignments
713 */
714 static BLAST_HitListPtr
Kappa_SortedHitlistFromAligns(BlastSearchBlkPtr search,BlastCompo_Alignment ** alignments)715 Kappa_SortedHitlistFromAligns(
716 BlastSearchBlkPtr search,
717 BlastCompo_Alignment ** alignments)
718 {
719 /* The context of the query is always zero in current
720 * implementations of RedoAlignmentCore. */
721 const Int2 context = 0;
722
723 BLAST_HitListPtr hitlist; /* the new hitlist */
724 BlastCompo_Alignment * align; /* represents the current
725 * alignment in loops */
726 if(search->current_hitlist != NULL) {
727 search->current_hitlist->hspcnt_max = search->current_hitlist->hspcnt;
728 BlastHitListPurge(search->current_hitlist);
729 } else {
730 search->current_hitlist = BlastHitListNew(search);
731 }
732 hitlist = search->current_hitlist;
733 hitlist->do_not_reallocate = FALSE;
734
735 for(align = *alignments; align != NULL; align = align->next) {
736 BLAST_HSPPtr hsp; /* the new HSP for this alignment */
737 /* queryExtent and matchExtent represent the extent of the
738 alignment in the query and subject sequences respectively. */
739 int queryExtent = align->queryEnd - align->queryStart;
740 int matchExtent = align->matchEnd - align->matchStart;
741
742 BlastSaveCurrentHsp(search, align->score, align->queryStart,
743 align->matchStart, matchExtent, context);
744
745 hsp = hitlist->hsp_array[hitlist->hspcnt - 1];
746
747 hsp->query.length = queryExtent;
748 hsp->query.end = align->queryEnd;
749
750 hsp->subject.length = matchExtent;
751 hsp->subject.end = align->matchEnd;
752
753 hsp->subject.frame = (Int2) align->frame;
754 hsp->evalue = 0.0; /* E-values are computed after the full
755 hitlist has been created. */
756 hsp->gap_info = (GapXEditBlockPtr) align->context;
757 switch (align->matrix_adjust_rule) {
758 case eDontAdjustMatrix:
759 hsp->comp_adjustment_method = eNoCompositionBasedStats;
760 break;
761 case eCompoScaleOldMatrix:
762 hsp->comp_adjustment_method = eCompositionBasedStats;
763 break;
764 default:
765 hsp->comp_adjustment_method = eCompositionMatrixAdjust;
766 break;
767 }
768 /* Break the aliasing between align->context and hsp->gap_info. */
769 align->context = NULL;
770 }
771 BlastCompo_AlignmentsFree(alignments, NULL);
772 if (hitlist->hspcnt > 0) {
773 HeapSort(hitlist->hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr),
774 score_compare_hsps);
775 }
776 return hitlist;
777 }
778
779
780 /**
781 * Calculate expect values for a list of alignments and remove those
782 * that don't meet an e-value cutoff from the list.
783 *
784 * @param *pbestScore the largest score observed in the list
785 * @param *pbestEvalue the best (smallest) e-value observed in the
786 * list
787 * @param full_subject_length the untranslated length of the subject
788 * sequence
789 * @param search contains search parameters used to compute
790 * e-values and the e-value cutoff
791 * @param do_link_hsps use hsp linking when computing e-values
792 * @param pvalueForThisPair composition p-value
793 * @param LambdaRatio lambda ratio, if available
794 * @param subject_id index of subject
795 */
796 static void
Kappa_HitlistEvaluateAndPurge(int * pbestScore,double * pbestEvalue,int full_subject_length,BlastSearchBlkPtr search,int do_link_hsps,double pvalueForThisPair,double LambdaRatio,int subject_id)797 Kappa_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
798 int full_subject_length,
799 BlastSearchBlkPtr search,
800 int do_link_hsps,
801 double pvalueForThisPair,
802 double LambdaRatio,
803 int subject_id)
804 {
805 BLAST_HitListPtr hitlist; /* a hitlist containing the newly-computed
806 * alignments */
807 Nlm_FloatHi bestEvalue; /* best e-value among alignments in the
808 hitlist */
809 int bestScore; /* best score among alignments in the
810 hitlist */
811 int hsp_index; /* index of the current HSP */
812
813 hitlist = search->current_hitlist;
814 search->subject->length = full_subject_length;
815 if (do_link_hsps) {
816 BlastLinkHsps(search);
817 } else {
818 BlastGetNonSumStatsEvalue(search);
819 }
820 if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1))
821 Kappa_AdjustEvaluesForComposition(hitlist, pvalueForThisPair,
822 search, LambdaRatio, subject_id);
823 BlastReapHitlistByEvalue(search);
824 /* Find the e-value of the best alignment in the list -- the list
825 * is typically sorted by score and not by e-value, so a search is
826 * necessary. */
827 bestEvalue = DBL_MAX;
828 bestScore = 0;
829 for (hsp_index = 0; hsp_index < hitlist->hspcnt; hsp_index++) {
830 if( hitlist->hsp_array[hsp_index]->evalue < bestEvalue ) {
831 bestEvalue = hitlist->hsp_array[hsp_index]->evalue;
832 }
833 if( hitlist->hsp_array[hsp_index]->score > bestScore ) {
834 bestScore = hitlist->hsp_array[hsp_index]->score;
835 }
836 }
837 *pbestScore = bestScore;
838 *pbestEvalue = bestEvalue;
839 }
840
841
842 /**
843 * A callback routine: compute lambda for the given score frequencies.
844 * (@sa calc_lambda_type).
845 */
846 static double
Kappa_CalcLambda(double probs[],int min_score,int max_score,double lambda0)847 Kappa_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
848 {
849 int i, n;
850 double avg;
851 BLAST_ScoreFreq freq;
852
853 n = max_score - min_score + 1;
854 avg = 0.0;
855 for (i = 0; i < n; i++) {
856 avg += (min_score + i) * probs[i];
857 }
858 freq.obs_min = min_score;
859 freq.obs_max = max_score;
860 freq.sprob = &probs[-min_score];
861 freq.score_avg = avg;
862
863 return impalaKarlinLambdaNR(&freq, lambda0);
864 }
865
866
867 /**
868 * Get a matrix of the frequency ratios that underlie the score
869 * matrix being used on this pass. The returned matrix is
870 * position-specific, so if we are in the first pass, use query to
871 * convert the 20x20 standard matrix into a position-specific
872 * variant. matrixName is the name of the underlying 20x20 score
873 * matrix used. numPositions is the length of the query;
874 * startNumerator is the matrix of frequency ratios as stored in
875 * posit.h. It needs to be divided by the frequency of the second
876 * character to get the intended ratio
877 *
878 * @param returnRatios an allocated matrix to hold the frequency
879 * ratios [out]
880 * @param search general search information [in]
881 * @param query the query sequence [in]
882 * @param matrixName name of the underlying matrix [in]
883 * @param startNumerator matrix of frequency ratios [in]
884 * @param numPositions length of the query [in]
885 * @param positionSpecific is this a position-specific search? [in]
886 */
887 static void
Kappa_GetStartFreqRatios(Nlm_FloatHi ** returnRatios,BlastSearchBlkPtr search,Uint1Ptr query,const char * matrixName,Nlm_FloatHi ** startNumerator,int numPositions,Boolean positionSpecific)888 Kappa_GetStartFreqRatios(Nlm_FloatHi ** returnRatios,
889 BlastSearchBlkPtr search,
890 Uint1Ptr query,
891 const char *matrixName,
892 Nlm_FloatHi **startNumerator,
893 int numPositions,
894 Boolean positionSpecific)
895 {
896 int i,j;
897 FreqRatios * stdFreqRatios = NULL;
898 /* a small cutoff used to determine whether it is necessary
899 * to reverse the multiplication done in posit.c */
900 const double posEpsilon = 0.0001;
901
902 stdFreqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
903 if (positionSpecific) {
904 for(i = 0; i < numPositions; i++) {
905 for(j = 0; j < PROTEIN_ALPHABET; j++) {
906 returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
907 }
908 }
909 } else {
910 for (i = 0; i < PROTEIN_ALPHABET; i++) {
911 for (j = 0; j < PROTEIN_ALPHABET; j++) {
912 returnRatios[i][j] = stdFreqRatios->data[i][j];
913 }
914 }
915 }
916 stdFreqRatios = PSIMatrixFrequencyRatiosFree(stdFreqRatios);
917
918 if (positionSpecific) {
919 Nlm_FloatHi *standardProb; /*probabilities of each letter*/
920 BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
921
922 stdrfp = BlastResFreqNew(search->sbp);
923 BlastResFreqStdComp(search->sbp,stdrfp);
924 standardProb = &stdrfp->prob[0];
925
926 /*reverse multiplication done in posit.c*/
927 for(i = 0; i < numPositions; i++) {
928 for(j = 0; j < PROTEIN_ALPHABET; j++) {
929 if ((standardProb[query[i]] > posEpsilon) &&
930 (standardProb[j] > posEpsilon) &&
931 (j != eStopChar) && (j != Xchar) &&
932 (startNumerator[i][j] > posEpsilon))
933 {
934 returnRatios[i][j] = startNumerator[i][j]/standardProb[j];
935 }
936 }
937 }
938 stdrfp = BlastResFreqDestruct(stdrfp);
939 }
940 }
941
942
943 /** SCALING_FACTOR is a multiplicative factor used to get more bits of
944 * precision in the integer matrix scores. It cannot be arbitrarily
945 * large because we do not want total alignment scores to exceed
946 * -(INT4_MIN) */
947 #define SCALING_FACTOR 32
948
949
950 /**
951 * produce a scaled-up version of the position-specific matrix
952 * starting from posFreqs
953 *
954 * @param fillPosMatrix is the matrix to be filled
955 * @param nonposMatrix is the underlying position-independent matrix,
956 * used to fill positions where frequencies are
957 * irrelevant
958 * @param matrixName name of the position-independent matrix
959 * @param posFreq frequency ratios for the position-dependent
960 * matrix
961 * @param query query sequence data
962 * @param queryLength length of the query
963 * @param sbp stores various parameters of the search
964 */
965 static void
Kappa_ScalePosMatrix(BLAST_Score ** fillPosMatrix,BLAST_Score ** nonposMatrix,const Char * matrixName,Nlm_FloatHi ** posFreqs,Uint1 * query,int queryLength,BLAST_ScoreBlkPtr sbp,Nlm_FloatHi localScalingFactor)966 Kappa_ScalePosMatrix(BLAST_Score **fillPosMatrix, BLAST_Score **nonposMatrix,
967 const Char *matrixName, Nlm_FloatHi **posFreqs,
968 Uint1 *query, int queryLength, BLAST_ScoreBlkPtr sbp,
969 Nlm_FloatHi localScalingFactor)
970 {
971
972 posSearchItems *posSearch; /*used to pass data into scaling routines*/
973 compactSearchItems *compactSearch; /*used to pass data into scaling routines*/
974 int i; /*loop index*/
975 BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
976 int a; /*index over characters*/
977
978
979 posSearch = (posSearchItems *) MemNew (1 * sizeof(posSearchItems));
980 compactSearch = (compactSearchItems *) MemNew (1 * sizeof(compactSearchItems));
981 posSearch->posMatrix = (BLAST_Score **) MemNew((queryLength + 1) * sizeof(BLAST_Score *));
982 posSearch->posPrivateMatrix = fillPosMatrix;
983 posSearch->posFreqs = posFreqs;
984 posSearch->stdFreqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
985 for(i = 0; i <= queryLength; i++)
986 posSearch->posMatrix[i] = (BLAST_Score *) MemNew(PROTEIN_ALPHABET * sizeof(BLAST_Score));
987
988 compactSearch->query = (Uint1Ptr) query;
989 compactSearch->qlength = queryLength;
990 compactSearch->alphabetSize = PROTEIN_ALPHABET;
991 compactSearch->gapped_calculation = TRUE;
992 compactSearch->matrix = nonposMatrix;
993 compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
994 compactSearch->kbp_std = sbp->kbp_std;
995 compactSearch->kbp_psi = sbp->kbp_psi;
996 compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
997 compactSearch->kbp_gap_std = sbp->kbp_gap_std;
998 compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
999 compactSearch->K_ideal = sbp->kbp_ideal->K;
1000
1001 stdrfp = BlastResFreqNew(sbp);
1002 BlastResFreqStdComp(sbp,stdrfp);
1003 compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
1004 for(a = 0; a < compactSearch->alphabetSize; a++)
1005 compactSearch->standardProb[a] = stdrfp->prob[a];
1006 stdrfp = BlastResFreqDestruct(stdrfp);
1007
1008 posFreqsToMatrix(posSearch,compactSearch);
1009 impalaScaling(posSearch, compactSearch, localScalingFactor, FALSE);
1010
1011 for(i = 0; i <= queryLength; i++)
1012 MemFree(posSearch->posMatrix[i]);
1013
1014 MemFree(compactSearch->standardProb);
1015 MemFree(posSearch->posMatrix);
1016 PSIMatrixFrequencyRatiosFree(posSearch->stdFreqRatios);
1017 MemFree(posSearch);
1018 MemFree(compactSearch);
1019 }
1020
1021
1022 /** Convert an array of HSPs into a list of BlastCompo_Alignment
1023 * objects.
1024 *
1025 * @param search search parameters
1026 * @param hsp_array existing array of HSPs
1027 * @param hspcnt length of hsp_array
1028 * @param localScalingFactor factor by which scores are scaled in the
1029 * new list of alignments
1030 * @return the new list of alignments.
1031 */
1032 static BlastCompo_Alignment *
Kappa_ResultHspToDistinctAlign(BlastSearchBlkPtr search,BLASTResultHsp hsp_array[],int hspcnt,double localScalingFactor)1033 Kappa_ResultHspToDistinctAlign(BlastSearchBlkPtr search,
1034 BLASTResultHsp hsp_array[], int hspcnt,
1035 double localScalingFactor)
1036 {
1037 BlastCompo_Alignment *aligns; /* list of alignments to be returned */
1038 BlastCompo_Alignment **tail; /* next location to store a pointer to
1039 a new alignment */
1040 int hsp_index; /* loop index */
1041
1042 aligns = NULL;
1043 tail = &aligns;
1044 for (hsp_index = 0; hsp_index < hspcnt; hsp_index++) {
1045 BlastCompo_Alignment *new_align; /* a new alignment */
1046 int queryIndex, queryEnd, matchEnd;
1047 BLASTResultHsp * hsp = &hsp_array[hsp_index];
1048 queryEnd = hsp->query_offset + hsp->query_length;
1049 matchEnd = hsp->subject_offset + hsp->subject_length;
1050 if(search->mult_queries != NULL) {
1051 queryIndex = GetQueryNum(search->mult_queries,
1052 hsp->query_offset, queryEnd - 1, 0);
1053 } else {
1054 queryIndex = 0;
1055 }
1056 new_align =
1057 BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
1058 eDontAdjustMatrix,
1059 hsp->query_offset, queryEnd,
1060 queryIndex, hsp->subject_offset,
1061 matchEnd, hsp->subject_frame, hsp);
1062 if (new_align == NULL) {
1063 ErrPostEx(SEV_FATAL, E_NoMemory, 0,
1064 "Failed to allocate a new alignment");
1065 }
1066 *tail = new_align;
1067 tail = &new_align->next;
1068 }
1069 return aligns;
1070 }
1071
1072
1073 /**
1074 * Redo a S-W alignment using an x-drop alignment. The result will
1075 * usually be the same as the S-W alignment. The call to ALIGN
1076 * attempts to force the endpoints of the alignment to match the
1077 * optimal endpoints determined by the Smith-Waterman algorithm.
1078 * ALIGN is used, so that if the data structures for storing BLAST
1079 * alignments are changed, the code will not break
1080 *
1081 * @param query the query data
1082 * @param queryStart start of the alignment in the query sequence
1083 * @param queryEnd end of the alignment in the query sequence,
1084 * as computed by the Smith-Waterman algorithm
1085 * @param subject the subject (database) sequence
1086 * @param matchStart start of the alignment in the subject sequence
1087 * @param matchEnd end of the alignment in the query sequence,
1088 * as computed by the Smith-Waterman algorithm
1089 * @param frame translation frame of the database sequence (0
1090 * if not translated)
1091 * @param gap_align parameters for a gapped alignment
1092 * @param score score computed by the Smith-Waterman algorithm
1093 */
1094 static void
Kappa_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,int queryStart,int queryEnd,BlastCompo_SequenceData * subject,int matchStart,int matchEnd,int frame,GapAlignBlkPtr gap_align,int score)1095 Kappa_SWFindFinalEndsUsingXdrop(
1096 BlastCompo_SequenceData * query,
1097 int queryStart,
1098 int queryEnd,
1099 BlastCompo_SequenceData * subject,
1100 int matchStart,
1101 int matchEnd,
1102 int frame,
1103 GapAlignBlkPtr gap_align,
1104 int score)
1105 {
1106 int XdropAlignScore; /* alignment score obtained using X-dropoff
1107 * method rather than Smith-Waterman */
1108 int doublingCount = 0; /* number of times X-dropoff had to be
1109 * doubled */
1110 int *alignScript, *dummy; /* the alignment script that will be
1111 generated below by the ALIGN
1112 routine. */
1113 GapXEditBlockPtr editBlock; /* traceback info for this alignment */
1114 /* Extent of the alignment as computed by an x-drop alignment
1115 * (usually the same as (queryEnd - queryStart) and (matchEnd -
1116 * matchStart)) */
1117 int queryExtent, matchExtent;
1118
1119 gap_align->query_start = queryStart;
1120 gap_align->subject_start = matchStart;
1121 do {
1122 alignScript =
1123 (int *) MemNew((subject->length + query->length + 3) * sizeof(int));
1124
1125 XdropAlignScore =
1126 ALIGN(&query->data[queryStart - 1], &subject->data[matchStart - 1],
1127 queryEnd - queryStart + 1, matchEnd - matchStart + 1,
1128 alignScript, &queryExtent, &matchExtent, &dummy,
1129 gap_align, queryStart - 1, FALSE);
1130 gap_align->score = XdropAlignScore;
1131 gap_align->query_stop = gap_align->query_start + queryExtent - 1;
1132 gap_align->subject_stop = gap_align->subject_start + matchExtent - 1;
1133
1134 gap_align->x_parameter *= 2;
1135 doublingCount++;
1136 if((XdropAlignScore < score) && (doublingCount < 3)) {
1137 MemFree(alignScript);
1138 }
1139 } while((XdropAlignScore < score) && (doublingCount < 3));
1140 editBlock =
1141 TracebackToGapXEditBlock(NULL, NULL, queryExtent, matchExtent,
1142 alignScript, queryStart, matchStart);
1143 alignScript = MemFree(alignScript);
1144 editBlock->length1 = query->length;
1145 editBlock->length2 = subject->length;
1146 editBlock->discontinuous = gap_align->discontinuous;
1147 editBlock->translate2 = gap_align->translate2;
1148 editBlock->frame2 = (Int2) frame;
1149
1150 gap_align->edit_block = editBlock;
1151 }
1152
1153
1154 typedef struct Kappa_SequenceLocalData {
1155 Uint1 prog_number; /**< identifies the type of blast search being
1156 performed. The type of search determines
1157 how sequence data should be obtained. */
1158 CharPtr genetic_code; /**< genetic code for translated searches */
1159 BioseqPtr bsp_db; /**< An object that represents this sequence
1160 in low level routines. */
1161 Boolean bioseq_needs_unlock; /**< true if the bsp_db must be
1162 disposed of by a call to
1163 BioseqUnlock, rather than
1164 BioseqFree */
1165 ReadDBFILEPtr rdfp; /**< A pointer to a database from which
1166 sequences may be obtained */
1167 } Kappa_SequenceLocalData;
1168
1169
1170 /**
1171 * Initialize a new matching sequence, obtaining information about the
1172 * sequence from the search.
1173 *
1174 * @param self object to be initialized
1175 * @param search search information
1176 * @param subject_index index of the matching sequence in the database
1177 */
1178 static void
Kappa_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,BlastSearchBlkPtr search,int subject_index)1179 Kappa_MatchingSequenceInitialize(
1180 BlastCompo_MatchingSequence * self,
1181 BlastSearchBlkPtr search,
1182 int subject_index)
1183 {
1184 Kappa_SequenceLocalData * local_data;
1185
1186 local_data = self->local_data = MemNew(sizeof(Kappa_SequenceLocalData));
1187 self->index = subject_index;
1188
1189 local_data->prog_number = search->prog_number;
1190 local_data->rdfp = search->rdfp;
1191 if(local_data->prog_number == blast_type_tblastn) {
1192 local_data ->genetic_code = search->db_genetic_code;
1193 } else {
1194 /* the sequence will not be translated */
1195 local_data ->genetic_code = NULL;
1196 }
1197 if(local_data->rdfp) {
1198 local_data->rdfp->parameters |= READDB_KEEP_HDR_AND_SEQ;
1199 local_data->bsp_db = readdb_get_bioseq(local_data->rdfp, self->index );
1200 local_data->bioseq_needs_unlock = FALSE;
1201 } else {
1202 local_data->bsp_db = BioseqLockById(search->subject_info->sip);
1203 local_data->bioseq_needs_unlock = TRUE;
1204 }
1205 self->length = local_data->bsp_db->length;
1206 }
1207
1208
1209 /** Release the resources associated with a matching sequence. */
1210 static void
Kappa_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)1211 Kappa_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
1212 {
1213 Kappa_SequenceLocalData * local_data = self->local_data;
1214 if(local_data->bsp_db) {
1215 if(local_data->bioseq_needs_unlock) {
1216 BioseqUnlock(local_data->bsp_db);
1217 local_data->bsp_db = NULL;
1218 } else {
1219 local_data->bsp_db = BioseqFree(local_data->bsp_db);
1220 }
1221 }
1222 MemFree(local_data);
1223 self->local_data = NULL;
1224 }
1225
1226 #define MINUMUM_FRACTION_NEAR_IDENTICAL 0.98
1227
1228 /**
1229 * Test whether the aligned parts of two sequences that
1230 * have a high-scoring gapless alignment are nearly identical
1231 *
1232 * @param seqData subject sequence
1233 * @param queryData query sequence
1234 * @param queryOffset offset for align if there are multiple queries
1235 * @param align information about the alignment
1236 * @param rangeOffset offset for subject sequence (used for tblastn)
1237 *
1238 * @return TRUE if the aligned portions are nearly identical
1239 */
testNearIdentical(const BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * queryData,const int queryOffset,const BlastCompo_Alignment * align,const int rangeOffset)1240 static Boolean testNearIdentical(const BlastCompo_SequenceData *seqData,
1241 const BlastCompo_SequenceData *queryData,
1242 const int queryOffset,
1243 const BlastCompo_Alignment *align,
1244 const int rangeOffset)
1245 {
1246 int numIdentical = 0;
1247 double fractionIdentical;
1248 int qPos, sPos; /*positions in query and subject;*/
1249 int qEnd; /*end of query*/
1250
1251 qPos = align->queryStart - queryOffset;
1252 qEnd = align->queryEnd - queryOffset;
1253 sPos = align->matchStart - rangeOffset;
1254 while (qPos < qEnd) {
1255 if (queryData->data[qPos] == seqData->data[sPos])
1256 numIdentical++;
1257 sPos++;
1258 qPos++;
1259 }
1260 fractionIdentical = ((double) numIdentical/
1261 (double) (align->queryEnd - align->queryStart));
1262 if (fractionIdentical >= MINUMUM_FRACTION_NEAR_IDENTICAL)
1263 return(TRUE);
1264 else
1265 return(FALSE);
1266 }
1267
1268
1269 /** Character to use for masked residues */
1270 #define BLASTP_MASK_RESIDUE eXchar
1271 /** Default instructions and mask residue for SEG filtering */
1272 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
1273
1274
1275 /**
1276 * Obtain a string of translated data
1277 *
1278 * @param self the sequence from which to obtain the data [in]
1279 * @param range the range, in amino acid coordinates, of data
1280 * to get; includes the translation frame [in]
1281 * @param seqData the resulting data [out]
1282 * @param q_range the range, in amino acid coordinates, of data
1283 * to get; includes the translation frame [in]
1284 * @param queryData the query sequence [in]
1285 * @param align information about the alignment between query and subject
1286 * @param shouldTestIdentical did alignment pass a preliminary test in
1287 * redo_alignment.c that indicates the sequence
1288 * pieces may be near identical
1289 */
1290 static void
Kappa_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,const BlastCompo_SequenceData * queryData,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical)1291 Kappa_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
1292 const BlastCompo_SequenceRange * range,
1293 BlastCompo_SequenceData * seqData,
1294 const BlastCompo_SequenceRange * q_range,
1295 const BlastCompo_SequenceData * queryData,
1296 const BlastCompo_Alignment *align,
1297 const Boolean shouldTestIdentical)
1298 {
1299 int i;
1300 int nucleotide_start; /* position of the first nucleotide to be
1301 * translated */
1302 int num_nucleotides; /* number of nucleotides to translate */
1303 Int2 translation_frame = (Int2) range->context;
1304 Kappa_SequenceLocalData * local_data = self->local_data;
1305
1306 nucleotide_start = 3 * range->begin;
1307 num_nucleotides =
1308 3 * (range->end - range->begin) + ABS(translation_frame) - 1;
1309 { /* scope of nucleotide_data */
1310 Uint1Ptr nucleotide_data = MemNew((num_nucleotides + 1) * sizeof(Uint1));
1311 { /* scope of spp */
1312 SeqPortPtr spp; /* a SeqPort used to read the sequence data */
1313 Uint1 strand; /* a flag indicating which strand to read */
1314 Uint1 nucleotide; /* an individual nucleotide */
1315
1316 if (translation_frame >= 0) {
1317 strand = Seq_strand_plus;
1318 } else {
1319 strand = Seq_strand_minus;
1320 }
1321 spp = SeqPortNew(local_data->bsp_db, FIRST_RESIDUE, LAST_RESIDUE,
1322 strand, Seq_code_ncbi4na);
1323 SeqPortSeek(spp, nucleotide_start, SEEK_SET);
1324
1325 for(i = 0;
1326 i < num_nucleotides &&
1327 (nucleotide = SeqPortGetResidue(spp)) != SEQPORT_EOF;
1328 i++ ) {
1329 /* for all nucleotides in the translation range */
1330 nucleotide_data[i] = nucleotide;
1331 }
1332
1333 spp = SeqPortFree(spp);
1334 } /* end scope of spp */
1335 seqData->buffer =
1336 GetTranslation(nucleotide_data, num_nucleotides, translation_frame,
1337 &seqData->length, local_data->genetic_code);
1338 seqData->data = seqData->buffer + 1; /* This is a protein sequence,
1339 so the first byte is nil. */
1340 MemFree(nucleotide_data);
1341 } /* end scope of nucleotide_data */
1342
1343 #ifndef KAPPA_NO_SEG_SEQUENCE_TBLASTN
1344 { /* scope of variables used for SEG filtering */
1345 BioseqPtr bsp_temp; /* a Bioseq for the translated sequence */
1346 ObjectIdPtr oip; /* a unique ObjectId for the translated sequence */
1347 SeqLocPtr seg_slp; /* a SeqLoc for SEG filtering */
1348 bsp_temp = BlastMakeTempProteinBioseq(seqData->data, seqData->length,
1349 Seq_code_ncbistdaa);
1350 bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1351
1352 oip = (ObjectIdPtr) UniqueLocalId();
1353 ValNodeAddPointer(&(bsp_temp->id), SEQID_LOCAL, oip);
1354 SeqMgrAddToBioseqIndex(bsp_temp);
1355
1356 seg_slp = BlastBioseqFilter(bsp_temp, BLASTP_MASK_INSTRUCTIONS);
1357 if (seg_slp) {
1358 HackSeqLocId(seg_slp, local_data->bsp_db->id);
1359 if ((!shouldTestIdentical) ||
1360 (shouldTestIdentical &&
1361 (!testNearIdentical(seqData, queryData, q_range->begin, align, range->begin))))
1362 BlastMaskTheResidues(seqData->data, seqData->length,
1363 BLASTP_MASK_RESIDUE, seg_slp, FALSE, 0);
1364 seg_slp = SeqLocSetFree(seg_slp);
1365 }
1366
1367 SeqMgrDeleteFromBioseqIndex(bsp_temp);
1368 bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1369 bsp_temp = BioseqFree(bsp_temp);
1370 } /* end scope of variables used for SEG filtering */
1371 #endif
1372 }
1373
1374
1375 /**
1376 * Obtain a string of protein data
1377 *
1378 * @param self the sequence from which to obtain the data [in]
1379 * @param range the range, in amino acid coordinates, of data
1380 * to get; includes the translation frame [in]
1381 * @param seqData the resulting data [out]
1382 * @param range the range, in amino acid coordinates, of data
1383 * to get; includes the translation frame [in]
1384 * @param queryData the query sequence [in]
1385 * @param align information about the alignment between query and subject
1386 * @param shouldTestIdentical did alignment pass a preliminary test in
1387 * redo_alignment.c that indicates the sequence
1388 * pieces may be near identical
1389 */
1390 static void
Kappa_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,const BlastCompo_SequenceData * queryData,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical)1391 Kappa_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,
1392 const BlastCompo_SequenceRange * range,
1393 BlastCompo_SequenceData * seqData,
1394 const BlastCompo_SequenceRange * q_range,
1395 const BlastCompo_SequenceData * queryData,
1396 const BlastCompo_Alignment *align,
1397 const Boolean shouldTestIdentical)
1398 {
1399 int i;
1400 Kappa_SequenceLocalData * local_data = self->local_data;
1401
1402 /* The sequence does not need to be translated. */
1403 /* Obtain the entire sequence (necessary for SEG filtering.) */
1404 if(local_data->rdfp != NULL) {
1405 Uint1Ptr origData; /* data obtained from readdb_get_sequence;
1406 * this data cannot be modified, so we copy
1407 * it. */
1408 int idx;
1409 seqData->length = readdb_get_sequence(local_data->rdfp, self->index,
1410 (Uint1Ptr PNTR) & origData );
1411 seqData->buffer = MemNew((seqData->length + 2) * sizeof(Uint1));
1412 seqData->buffer[0] = '\0';
1413 seqData->data = &seqData->buffer[1];
1414 for( idx = 0; idx < seqData->length; idx++ ) {
1415 /* Copy the sequence data, replacing occurrences of amino acid
1416 * number 24 (Selenocysteine) with number 21 (Undetermined or
1417 * atypical). */
1418 if(origData[idx] != eSelenocysteine) {
1419 seqData->data[idx] = origData[idx];
1420 } else {
1421 seqData->data[idx] = eXchar;
1422 fprintf(stderr, "Selenocysteine (U) at position %ld"
1423 " replaced by X\n",
1424 (long) idx + 1);
1425 }
1426 }
1427 seqData->data[seqData->length] = '\0';
1428 } else { /* self->rdfp is NULL */
1429 SeqPortPtr spp = NULL; /* a SeqPort used to read the
1430 sequence data */
1431 Uint1 residue; /* an individual residue */
1432 int idx;
1433
1434 seqData->length = local_data->bsp_db->length;
1435 seqData->buffer = MemNew((seqData->length + 2) * sizeof(Uint1));
1436 seqData->buffer[0] = '\0';
1437 seqData->data = seqData->buffer + 1;
1438
1439 spp =
1440 SeqPortNew(local_data->bsp_db, FIRST_RESIDUE, LAST_RESIDUE,
1441 Seq_strand_unknown, Seq_code_ncbistdaa);
1442
1443 idx = 0;
1444 while((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
1445 if(IS_residue(residue)) {
1446 /* Replace occurrences of amino acid number 24
1447 (Selenocysteine) with number 21 (Undetermined or
1448 atypical). */
1449 if(residue == eSelenocysteine) {
1450 residue = eXchar;
1451 fprintf(stderr, "Selenocysteine (U) at position %ld"
1452 " replaced by X\n",
1453 (long) idx + 1);
1454 }
1455 seqData->data[idx++] = residue;
1456 }
1457 }
1458 seqData->data[idx] = 0; /* terminate the string */
1459 spp = SeqPortFree(spp);
1460 } /* end else self->rdfp is NULL */
1461
1462 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
1463 {
1464 SeqLocPtr seg_slp; /*pointer to structure for SEG filtering*/
1465
1466 seg_slp =
1467 BlastBioseqFilter(local_data->bsp_db, BLASTP_MASK_INSTRUCTIONS);
1468 if (seg_slp) {
1469 if ((!shouldTestIdentical) ||
1470 (shouldTestIdentical &&
1471 (!testNearIdentical(seqData, queryData, q_range->begin, align, 0))))
1472 BlastMaskTheResidues(seqData->data, seqData->length,
1473 BLASTP_MASK_RESIDUE, seg_slp, FALSE, 0);
1474 seg_slp = SeqLocSetFree(seg_slp);
1475 }
1476 }
1477 #endif
1478 /* Fit the data to the range. */
1479 seqData ->data = &seqData->data[range->begin - 1];
1480 *seqData->data++ = '\0';
1481 seqData ->length = range->end - range->begin;
1482 }
1483
1484 /**
1485 * Obtain the sequence data that lies within the given range.
1486 * Note: BLASTX search is not fully supported yet
1487 *
1488 * @param self sequence information [in]
1489 * @param s_range the range, in amino acid coordinates, of data
1490 * to get [in]
1491 * @param seqData the sequence data obtained [out]
1492 * @param query the query sequence [in]
1493 * @param q_range the range, in amino acid coordinates, of data
1494 * to get [in]
1495 * @param queryData the query data obtained [out]
1496 * @param align information about the alignment between query and subject
1497 * @param shouldTestIdentical did alignment pass a preliminary test in
1498 * redo_alignment.c that indicates the sequence
1499 * pieces may be near identical
1500 *
1501 * @returns always 0 (posts a fatal error on failure rather than
1502 * returning an error code.)
1503 */
1504 static int
Kappa_SequenceGetRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * s_range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * query,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1505 Kappa_SequenceGetRange(
1506 const BlastCompo_MatchingSequence * self,
1507 const BlastCompo_SequenceRange * s_range,
1508 BlastCompo_SequenceData * seqData,
1509 const BlastCompo_SequenceData * query,
1510 const BlastCompo_SequenceRange * q_range,
1511 BlastCompo_SequenceData * queryData,
1512 const Uint8* query_words,
1513 const BlastCompo_Alignment *align,
1514 const Boolean shouldTestIdentical,
1515 const ECompoAdjustModes compo_adjust_mode,
1516 const Boolean isSmithWaterman,
1517 Boolean* subject_maybe_biased)
1518 {
1519 Int4 idx;
1520 Kappa_SequenceLocalData * local_data = self->local_data;
1521 Uint1 *origData = query->data + q_range->begin;
1522 /* Copy the query sequence (necessary for SEG filtering.) */
1523 queryData->length = q_range->end - q_range->begin;
1524 queryData->buffer = calloc((queryData->length + 2), sizeof(Uint1));
1525 queryData->data = queryData->buffer + 1;
1526
1527 for (idx = 0; idx < queryData->length; idx++) {
1528 /* Copy the sequence data, replacing occurences of amino acid
1529 * number 24 (Selenocysteine) with number 21 (Undetermined or
1530 * atypical). */
1531 queryData->data[idx] = (origData[idx] != 24) ? origData[idx] : 21;
1532 }
1533 if(local_data->prog_number == blast_type_tblastn) {
1534 /* The sequence must be translated. */
1535 Kappa_SequenceGetTranslatedRange(self, s_range, seqData, q_range, queryData,
1536 align, shouldTestIdentical);
1537 } else {
1538 Kappa_SequenceGetProteinRange(self, s_range, seqData, q_range, queryData,
1539 align, shouldTestIdentical);
1540 }
1541 return 0;
1542 }
1543
1544
1545
1546 /**
1547 * Converts an HSP, obtained from a blast search, to a GapAlignBlk that
1548 * is in a state in which it has all information necessary to redo the
1549 * computation of a traceback.
1550 *
1551 * @param gap_align the GapAlignBlk to be modified [out]
1552 * @param search general information about the search [in]
1553 * @param hsp the HSP to be converted [in]
1554 * @param queryOrigin origin of the query participating in this
1555 * alignment within the concatenated query [in]
1556 * @param subject_range, the subject_range used to compute the traceback,
1557 * in amino acid coordinates [in]
1558 * @param query, the query data [in]
1559 * @param subject the subject data [in]
1560 */
1561 static void
Kappa_HitToGapAlign(GapAlignBlkPtr gap_align,BlastSearchBlkPtr search,BLAST_HSPPtr hsp,int queryOrigin,BlastCompo_SequenceRange * subject_range,BlastCompo_SequenceData * query,BlastCompo_SequenceData * subject)1562 Kappa_HitToGapAlign(
1563 GapAlignBlkPtr gap_align,
1564 BlastSearchBlkPtr search,
1565 BLAST_HSPPtr hsp,
1566 int queryOrigin,
1567 BlastCompo_SequenceRange * subject_range,
1568 BlastCompo_SequenceData * query,
1569 BlastCompo_SequenceData * subject)
1570 {
1571 gap_align->query = query->data;
1572 gap_align->query_length = query->length;
1573 gap_align->query_frame = 0;
1574
1575 gap_align->subject = subject->data;
1576 gap_align->subject_length = subject->length;
1577 gap_align->subject_frame = hsp->subject.frame;
1578
1579 /* Shift the hsp to use query/subject local coordinates. */
1580 hsp->query.offset -= queryOrigin;
1581 hsp->query.gapped_start -= queryOrigin;
1582 hsp->query.end -= queryOrigin;
1583 hsp->subject.offset -= subject_range->begin;
1584 hsp->subject.gapped_start -= subject_range->begin;
1585 hsp->subject.end -= subject_range->begin;
1586 if(CheckStartForGappedAlignment(search, hsp, gap_align->query,
1587 gap_align->subject, search->sbp->matrix)) {
1588 /* We may use the starting point supplied by the HSP. */
1589 gap_align->q_start = hsp->query.gapped_start;
1590 gap_align->s_start = hsp->subject.gapped_start;
1591 } else {
1592 /* We must recompute the start for the gapped alignment, as the
1593 one in the HSP was unacceptable.*/
1594 gap_align->q_start =
1595 GetStartForGappedAlignment(search, hsp, gap_align->query,
1596 gap_align->subject, search->sbp->matrix);
1597 gap_align->s_start =
1598 (hsp->subject.offset - hsp->query.offset) + gap_align->q_start;
1599 }
1600 }
1601
1602
1603 /**
1604 * Reads a GapAlignBlk that has been used to compute a traceback, and
1605 * return a BlastCompo_Alignment representing the alignment.
1606 *
1607 * @param gap_align the GapAlignBlk
1608 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1609 * @param queryRange range of the query data in the
1610 * concatenated query
1611 * @param ccat_query_length length of the concatenated query
1612 * @param subjectRange range of the subject data in the full
1613 * database sequence, expressed in amino acid
1614 * coordinates
1615 * @param subjectLength original. untranslated length of the
1616 * subject sequence
1617 */
1618 static BlastCompo_Alignment *
Kappa_NewAlignFromGapAlign(GapAlignBlkPtr gap_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceRange * subject_range,int subjectLength)1619 Kappa_NewAlignFromGapAlign(
1620 GapAlignBlkPtr gap_align,
1621 EMatrixAdjustRule matrix_adjust_rule,
1622 BlastCompo_SequenceRange * query_range,
1623 int ccat_query_length,
1624 BlastCompo_SequenceRange * subject_range,
1625 int subjectLength)
1626 {
1627 int query_index, translation_frame;
1628 BlastCompo_Alignment * obj; /* the new alignment */
1629
1630 /* The gap_align is in coordinates that are relative to the
1631 query/subject subject_range. Shift to global coordinates. */
1632 int queryStart = gap_align->query_start + query_range->begin;
1633 int queryEnd = gap_align->query_stop + query_range->begin + 1;
1634 int matchStart = gap_align->subject_start + subject_range->begin;
1635 int matchEnd = gap_align->subject_stop + subject_range->begin + 1;
1636
1637 if(gap_align->edit_block != NULL) {
1638 gap_align->edit_block->start1 += query_range->begin;
1639 gap_align->edit_block->length1 += query_range->begin;
1640 gap_align->edit_block->start2 += subject_range->begin;
1641 gap_align->edit_block->length2 += subject_range->begin;
1642 gap_align->edit_block->original_length1 = ccat_query_length;
1643 gap_align->edit_block->original_length2 = subjectLength;
1644 }
1645 query_index = query_range->context;
1646 translation_frame = subject_range->context;
1647 obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
1648 queryStart, queryEnd, query_index,
1649 matchStart, matchEnd,
1650 translation_frame,
1651 gap_align->edit_block);
1652 if (obj == NULL) {
1653 ErrPostEx(SEV_FATAL, E_NoMemory, 0,
1654 "Failed to allocate a new alignment");
1655 }
1656 gap_align->edit_block = NULL; /* set to NULL to avoid aliasing */
1657
1658 return obj;
1659 }
1660
1661
1662 /** A callback used when performing SmithWaterman alignments:
1663 * Calculate the traceback for one alignment by performing an x-drop
1664 * alignment in the forward direction, possibly increasing the x-drop
1665 * parameter until the desired score is attained.
1666 *
1667 * The start, end and score of the alignment should be obtained
1668 * using the Smith-Waterman algorithm before this routine is called.
1669 *
1670 * @param *palign the new alignment
1671 * @param *pqueryEnd on entry, the end of the alignment in the
1672 * query, as computed by the Smith-Waterman
1673 * algorithm. On exit, the end as computed by
1674 * the x-drop algorithm
1675 * @param *pmatchEnd like as *pqueryEnd, but for the subject
1676 * sequence
1677 * @param queryStart the starting point in the query
1678 * @param matchStart the starting point in the subject
1679 * @param score the score of the alignment, as computed by
1680 * the Smith-Waterman algorithm
1681 * @param query query sequence data
1682 * @param query_range range of this query in the concatenated
1683 * query
1684 * @param ccat_query_length total length of the concatenated query
1685 * @param subject subject sequence data
1686 * @param subject_range range of subject_data in the translated
1687 * query, in amino acid coordinates
1688 * @param full_subject_length length of the full subject sequence
1689 * @param gapping_params parameters used to compute gapped
1690 * alignments
1691 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1692 * @returns 0 (posts a fatal error if it fails)
1693 * @sa new_xdrop_align_type
1694 */
1695 static int
Kappa_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,int * pqueryEnd,int * pmatchEnd,int queryStart,int matchStart,int score,BlastCompo_SequenceData * query,BlastCompo_SequenceRange * query_range,int queryLength,BlastCompo_SequenceData * subject,BlastCompo_SequenceRange * subject_range,int subjectLength,BlastCompo_GappingParams * gapping_params,EMatrixAdjustRule matrix_adjust_rule)1696 Kappa_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
1697 int * pqueryEnd, int *pmatchEnd,
1698 int queryStart, int matchStart, int score,
1699 BlastCompo_SequenceData * query,
1700 BlastCompo_SequenceRange * query_range,
1701 int queryLength,
1702 BlastCompo_SequenceData * subject,
1703 BlastCompo_SequenceRange * subject_range,
1704 int subjectLength,
1705 BlastCompo_GappingParams * gapping_params,
1706 EMatrixAdjustRule matrix_adjust_rule)
1707 {
1708 /* General search parameters */
1709 BlastSearchBlkPtr search = gapping_params->context;
1710 /* Specific parameters used for gapped alignments */
1711 GapAlignBlkPtr gap_align = search->gap_align;
1712 /* the translation frame */
1713 int frame = subject_range->context;
1714
1715 gap_align->x_parameter = gapping_params->x_dropoff;
1716
1717 Kappa_SWFindFinalEndsUsingXdrop(query, queryStart, *pqueryEnd,
1718 subject, matchStart, *pmatchEnd,
1719 frame, gap_align, score);
1720 *pqueryEnd = gap_align->query_stop + 1;
1721 *pmatchEnd = gap_align->subject_stop + 1;
1722 *pnewAlign = Kappa_NewAlignFromGapAlign(gap_align, matrix_adjust_rule,
1723 query_range, queryLength,
1724 subject_range, subjectLength);
1725 gap_align->x_parameter = gapping_params->x_dropoff;
1726
1727 return 0;
1728 }
1729
1730
1731 /**
1732 * A callback: calculate the traceback for one alignment by
1733 * performing an x-drop alignment in both directions
1734 *
1735 * @param in_align the existing alignment, without traceback
1736 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1737 * @param query_data query sequence data
1738 * @param query_range range of this query in the concatenated
1739 * query
1740 * @param ccat_query_length total length of the concatenated query
1741 * @param subject_data subject sequence data
1742 * @param subject_range range of subject_data in the translated
1743 * query, in amino acid coordinates
1744 * @param full_subject_length length of the full subject sequence
1745 * @param gapping_params parameters used to compute gapped
1746 * alignments
1747 * @sa redo_one_alignment_type
1748 */
1749 static BlastCompo_Alignment *
Kappa_RedoOneAlignment(BlastCompo_Alignment * in_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceData * query_data,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceData * subject_data,BlastCompo_SequenceRange * subject_range,int full_subject_length,BlastCompo_GappingParams * gapping_params)1750 Kappa_RedoOneAlignment(BlastCompo_Alignment * in_align,
1751 EMatrixAdjustRule matrix_adjust_rule,
1752 BlastCompo_SequenceData * query_data,
1753 BlastCompo_SequenceRange * query_range,
1754 int ccat_query_length,
1755 BlastCompo_SequenceData * subject_data,
1756 BlastCompo_SequenceRange * subject_range,
1757 int full_subject_length,
1758 BlastCompo_GappingParams * gapping_params)
1759 {
1760 /* New alignment to be returned */
1761 BlastCompo_Alignment * newAlign;
1762 /* The HSP to be redone */
1763 BLASTResultHspPtr hsp = in_align->context;
1764 /* A copy of hsp as a BLAST_HSP; needed to fix a type mismatch */
1765 BLAST_HSPPtr temp_hsp = MemNew(sizeof(BLAST_HSP));
1766 /* General search information */
1767 BlastSearchBlkPtr search = gapping_params->context;
1768
1769 CopyResultHspToHSP(hsp, temp_hsp);
1770 Kappa_HitToGapAlign(search->gap_align, search, temp_hsp, query_range->begin,
1771 subject_range, query_data, subject_data);
1772 search->gap_align->x_parameter = gapping_params->x_dropoff;
1773
1774 PerformGappedAlignmentWithTraceback(search->gap_align);
1775
1776 newAlign =
1777 Kappa_NewAlignFromGapAlign(search->gap_align, matrix_adjust_rule,
1778 query_range, ccat_query_length,
1779 subject_range, full_subject_length);
1780 MemFree(temp_hsp);
1781
1782 return newAlign;
1783 }
1784
1785
1786 /**
1787 * A Kappa_SearchParameters represents the data needed by
1788 * RedoAlignmentCore to adjust the parameters of a search, including
1789 * the original value of these parameters
1790 */
1791 typedef struct Kappa_SearchParameters {
1792 int gap_open; /**< a penalty for the existence of a gap */
1793 int gapExtend; /**< a penalty for each residue (or
1794 nucleotide) in the gap */
1795 int gapDecline; /**< a penalty for declining to align a pair
1796 of residues */
1797 int gap_x_dropoff_final; /**< value of the x-drop parameter
1798 for the final gapped alignment
1799 with traceback */
1800 BLAST_Score **origMatrix; /**< The original matrix values */
1801 /** a matrix. Each row represents a query and each column
1802 represents the probabilities of a particular residue */
1803 GapAlignBlkPtr orig_gap_align;
1804 Nlm_FloatHi original_expect_value; /**< expect value on entry */
1805 BLAST_KarlinBlkPtr kbp_gap_orig; /**< copy of the original gapped
1806 Karlin-Altschul block corresponding to
1807 the first context */
1808 BLAST_KarlinBlkPtr * orig_kbp_gap_array; /**< pointer to the array of gapped
1809 Karlin-Altschul block for all
1810 contexts; needed to restore
1811 the search to its original
1812 configuration. */
1813 Boolean use_mq_flag; /**< the initial value of the
1814 search->mult_queries->use_mq flag;
1815 or false if search->mult_queries ==
1816 NULL */
1817 } Kappa_SearchParameters;
1818
1819
1820 /**
1821 * Release the data associated with a Kappa_SearchParameters and
1822 * delete the object
1823 * @param searchParams the object to be deleted [in][out]
1824 */
1825 static void
Kappa_SearchParametersFree(Kappa_SearchParameters ** searchParams)1826 Kappa_SearchParametersFree(Kappa_SearchParameters ** searchParams)
1827 {
1828 /* for convenience, remove one level of indirection from searchParams */
1829 Kappa_SearchParameters *sp = *searchParams;
1830
1831 if(sp->kbp_gap_orig) BlastKarlinBlkDestruct(sp->kbp_gap_orig);
1832
1833 Nlm_Int4MatrixFree(&sp->origMatrix);
1834
1835 Nlm_Free(*searchParams);
1836 *searchParams = NULL;
1837 }
1838
1839
1840 /**
1841 * Create a new instance of Kappa_SearchParameters
1842 *
1843 * @param rows number of rows in the scoring matrix
1844 * @param compo_adjust_mode if >0, use composition-based statistics
1845 * @param numQueries the number of queries in the concatenated
1846 * query
1847 * @param positionBased if true, the search is position-based
1848 */
1849 static Kappa_SearchParameters *
Kappa_SearchParametersNew(int rows,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)1850 Kappa_SearchParametersNew(
1851 int rows,
1852 ECompoAdjustModes compo_adjust_mode,
1853 Boolean positionBased)
1854 {
1855 Kappa_SearchParameters *sp; /* the new object */
1856 sp = MemNew(sizeof(Kappa_SearchParameters));
1857
1858 sp->orig_kbp_gap_array = NULL;
1859 sp->kbp_gap_orig = NULL;
1860 sp->origMatrix = NULL;
1861
1862 sp->kbp_gap_orig = BlastKarlinBlkCreate();
1863 if (compo_adjust_mode != eNoCompositionBasedStats) {
1864 if (positionBased) {
1865 sp->origMatrix = Nlm_Int4MatrixNew(rows, PROTEIN_ALPHABET);
1866 } else {
1867 sp->origMatrix = Nlm_Int4MatrixNew(PROTEIN_ALPHABET, PROTEIN_ALPHABET);
1868 }
1869 if (sp->origMatrix == NULL) {
1870 ErrPostEx(SEV_FATAL, E_NoMemory, 0,
1871 "Failed to allocate a scoring matrix");
1872 }
1873 }
1874 /* end if(compo_adjust_mode) */
1875
1876 return sp;
1877 }
1878
1879
1880 /**
1881 * Record the initial value of the search parameters that are to be
1882 * adjusted.
1883 *
1884 * @param searchParams holds the recorded values [out]
1885 * @param search the search parameters [in]
1886 * @param compo_adjust_mode mode of composition adjustment [in]
1887 */
1888 static void
Kappa_RecordInitialSearch(Kappa_SearchParameters * searchParams,BlastSearchBlkPtr search,ECompoAdjustModes compo_adjust_mode)1889 Kappa_RecordInitialSearch(Kappa_SearchParameters * searchParams,
1890 BlastSearchBlkPtr search,
1891 ECompoAdjustModes compo_adjust_mode)
1892 {
1893 BLAST_KarlinBlkPtr kbp; /* statistical parameters used to evaluate a
1894 * query-subject pair */
1895 BLAST_Score **matrix; /* matrix used to score a local
1896 query-subject alignment */
1897
1898 searchParams->gap_x_dropoff_final = search->pbp->gap_x_dropoff_final;
1899 searchParams->original_expect_value = search->pbp->cutoff_e;
1900
1901 if (search->mult_queries != NULL) {
1902 searchParams->use_mq_flag = search->mult_queries->use_mq;
1903 } else {
1904 searchParams->use_mq_flag = 0;
1905 }
1906 searchParams->gap_open = search->pbp->gap_open;
1907 searchParams->gapExtend = search->pbp->gap_extend;
1908 searchParams->gapDecline = search->pbp->decline_align;
1909
1910 searchParams->orig_kbp_gap_array = search->sbp->kbp_gap;
1911 if(search->positionBased) {
1912 kbp = search->sbp->kbp_gap_psi[0];
1913 matrix = search->sbp->posMatrix;
1914 } else {
1915 kbp = search->sbp->kbp_gap_std[0];
1916 matrix = search->sbp->matrix;
1917 }
1918 searchParams->kbp_gap_orig->Lambda = kbp->Lambda;
1919 searchParams->kbp_gap_orig->K = kbp->K;
1920 searchParams->kbp_gap_orig->logK = kbp->logK;
1921 searchParams->kbp_gap_orig->H = kbp->H;
1922
1923 searchParams->orig_gap_align = search->gap_align;
1924 search->gap_align = NULL; /* break aliasing */
1925
1926 if(compo_adjust_mode != eNoCompositionBasedStats) {
1927 int i, j; /* iteration indices */
1928 int rows;
1929 if (search->positionBased) {
1930 rows = search->context[0].query->length;
1931 } else {
1932 rows = PROTEIN_ALPHABET;
1933 }
1934 for(i = 0; i < rows; i++) {
1935 for(j = 0; j < PROTEIN_ALPHABET; j++) {
1936 searchParams->origMatrix[i][j] = matrix[i][j];
1937 }
1938 }
1939 }
1940 }
1941
1942
1943 /**
1944 * Rescale the search parameters in the search object to obtain more
1945 * precision.
1946 *
1947 * @param search the search block to be rescaled
1948 * @param localScalingFactor the scale factor
1949 * @param options certain command-line options to BLAST
1950 */
1951 static void
Kappa_RescaleSearch(BlastSearchBlkPtr search,double localScalingFactor,BLAST_OptionsBlkPtr options)1952 Kappa_RescaleSearch(BlastSearchBlkPtr search,
1953 double localScalingFactor,
1954 BLAST_OptionsBlkPtr options)
1955 {
1956 BLAST_KarlinBlkPtr kbp; /* the statistical parameters used to
1957 * evaluate alignments of a
1958 * query-subject pair */
1959 if(search->positionBased) {
1960 kbp = search->sbp->kbp_gap_psi[0];
1961 } else {
1962 kbp = search->sbp->kbp_gap_std[0];
1963 }
1964 kbp->Lambda /= localScalingFactor;
1965 kbp->logK = log(kbp->K);
1966
1967 if (search->mult_queries != NULL) {
1968 search->mult_queries->use_mq = 0;
1969 }
1970 search->pbp->cutoff_e = options->kappa_expect_value;
1971 search->pbp->gap_open =
1972 Nlm_Nint(search->pbp->gap_open * localScalingFactor);
1973 search->pbp->gap_extend =
1974 Nlm_Nint(search->pbp->gap_extend * localScalingFactor);
1975 if(search->pbp->decline_align < INT2_MAX) {
1976 search->pbp->decline_align =
1977 Nlm_Nint(search->pbp->decline_align * localScalingFactor);
1978 }
1979 search->gap_align = GapAlignBlkNew(1, 1);
1980 search->gap_align->matrix = search->sbp->matrix;
1981 search->gap_align->positionBased = search->positionBased;
1982 if(search->positionBased) {
1983 search->gap_align->posMatrix = search->sbp->posMatrix;
1984 }
1985 search->gap_align->gap_open = search->pbp->gap_open;
1986 search->gap_align->gap_extend = search->pbp->gap_extend;
1987 search->gap_align->decline_align = search->pbp->decline_align;
1988 search->gap_align->translate2 =
1989 (Boolean) (search->prog_number == blast_type_tblastn);
1990
1991 search->gap_align->x_parameter =
1992 (int) (options->gap_x_dropoff_final * NCBIMATH_LN2 / kbp->Lambda);
1993 }
1994
1995
1996 /**
1997 * Restore the parameters that were adjusted to their original values
1998 * @param search the search to be restored [out]
1999 * @param matrix the scoring matrix to be restored [out]
2000 * @param searchParams a record of the original values [in]
2001 * @param SmithWaterman if true, we have performed a Smith-Waterman
2002 * alignment with these search parameters [in]
2003 */
2004 static void
Kappa_RestoreSearch(BlastSearchBlkPtr search,BLAST_Score ** matrix,Kappa_SearchParameters * searchParams,ECompoAdjustModes compo_adjust_mode)2005 Kappa_RestoreSearch(
2006 BlastSearchBlkPtr search,
2007 BLAST_Score ** matrix,
2008 Kappa_SearchParameters * searchParams,
2009 ECompoAdjustModes compo_adjust_mode)
2010 {
2011 BLAST_KarlinBlkPtr kbp; /* statistical parameters used to
2012 evaluate the significance of
2013 alignment of a query-subject
2014 pair */
2015 int i, j; /* iteration indices */
2016 search->pbp->gap_x_dropoff_final = searchParams->gap_x_dropoff_final;
2017 search->pbp->cutoff_e = searchParams->original_expect_value;
2018 search->pbp->gap_open = searchParams->gap_open;
2019 search->pbp->gap_extend = searchParams->gapExtend;
2020 search->pbp->decline_align = searchParams->gapDecline;
2021 if (search->mult_queries != NULL) {
2022 search->mult_queries->use_mq = searchParams->use_mq_flag;
2023 }
2024 GapAlignBlkDelete(search->gap_align);
2025 search->gap_align = searchParams->orig_gap_align;
2026 search->sbp->kbp_gap = searchParams->orig_kbp_gap_array;
2027
2028 if(search->positionBased) {
2029 kbp = search->sbp->kbp_gap_psi[0];
2030 } else {
2031 kbp = search->sbp->kbp_gap_std[0];
2032 }
2033 kbp->Lambda = searchParams->kbp_gap_orig->Lambda;
2034 kbp->K = searchParams->kbp_gap_orig->K;
2035 kbp->logK = searchParams->kbp_gap_orig->logK;
2036 kbp->H = searchParams->kbp_gap_orig->H;
2037
2038 if(compo_adjust_mode != eNoCompositionBasedStats) {
2039 int rows; /* the number of rows in matrix */
2040 if (search->positionBased) {
2041 rows = search->context[0].query->length;
2042 } else {
2043 rows = PROTEIN_ALPHABET;
2044 }
2045 for(i = 0; i < rows; i++) {
2046 for(j = 0; j < PROTEIN_ALPHABET; j++) {
2047 matrix[i][j] = searchParams->origMatrix[i][j];
2048 }
2049 }
2050 }
2051 }
2052
2053
2054 /** Initialize an object of type Blast_MatrixInfo.
2055 * @param self object to initialize
2056 * @param search search data
2057 * @param localScalingFactor amount by which this search is scaled
2058 * @param matrixName name of the matrix */
2059 static void
Kappa_MatrixInfoInit(Blast_MatrixInfo * self,double localScalingFactor,BlastSearchBlkPtr search,const char * matrixName)2060 Kappa_MatrixInfoInit(Blast_MatrixInfo * self,
2061 double localScalingFactor,
2062 BlastSearchBlkPtr search,
2063 const char * matrixName)
2064 {
2065 Uint1Ptr query; /* the query sequence */
2066 int queryLength; /* the length of the query sequence */
2067 Nlm_FloatHi initialUngappedLambda;
2068
2069 query = search->context[0].query->sequence;
2070 queryLength = search->context[0].query->length;
2071
2072 if(self->positionBased) {
2073 if(search->sbp->posFreqs == NULL) {
2074 search->sbp->posFreqs =
2075 allocatePosFreqs(queryLength, PROTEIN_ALPHABET);
2076 }
2077 Kappa_GetStartFreqRatios(self->startFreqRatios, search, query, matrixName,
2078 search->sbp->posFreqs, queryLength, TRUE);
2079 Kappa_ScalePosMatrix(self->startMatrix, search->sbp->matrix, matrixName,
2080 search->sbp->posFreqs, query, queryLength,
2081 search->sbp, localScalingFactor);
2082 initialUngappedLambda = search->sbp->kbp_psi[0]->Lambda;
2083 } else {
2084 Kappa_GetStartFreqRatios(self->startFreqRatios, search, query,
2085 matrixName, NULL, PROTEIN_ALPHABET,
2086 FALSE);
2087 initialUngappedLambda = search->sbp->kbp_ideal->Lambda;
2088 }
2089 self->ungappedLambda = initialUngappedLambda / localScalingFactor;
2090 if(!search->positionBased) {
2091 FreqRatios * freqRatios; /* frequency ratios for the matrix */
2092
2093 freqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
2094 if (freqRatios == NULL) {
2095 ErrPostEx(SEV_FATAL, 1, 0, "blastpgp: Cannot adjust parameters "
2096 "for matrix %s\n", matrixName);
2097 }
2098 Blast_Int4MatrixFromFreq(self->startMatrix, self->cols, freqRatios->data,
2099 self->ungappedLambda);
2100 freqRatios = PSIMatrixFrequencyRatiosFree(freqRatios);
2101 }
2102 self->matrixName = strdup(matrixName);
2103 }
2104
2105
2106 /**
2107 * Record information about all queries in the concatenated query.
2108 *
2109 * @param *pquery a new list of BlastCompo_QueryInfo objects
2110 * @param *pnumQueries the length of *pquery
2111 * @param *maxLength the length of the longest query
2112 * @param search a search block, which is used to obtain the
2113 * query information */
2114 static void
Kappa_GetQueryInfo(BlastCompo_QueryInfo ** pquery,int * pnumQueries,int * maxLength,BlastSearchBlkPtr search)2115 Kappa_GetQueryInfo(BlastCompo_QueryInfo **pquery,
2116 int * pnumQueries,
2117 int *maxLength, BlastSearchBlkPtr search)
2118 {
2119 int query_index;
2120 int numQueries = search->mult_queries ? search->mult_queries->NumQueries : 1;
2121 BlastCompo_QueryInfo * query =
2122 Nlm_Calloc(numQueries, sizeof(BlastCompo_QueryInfo));
2123
2124 *pnumQueries = numQueries;
2125 *maxLength = 0;
2126 *pquery = query;
2127 if (search->mult_queries) {
2128 for (query_index = 0; query_index < numQueries; query_index++) {
2129 query[query_index].eff_search_space =
2130 search->mult_queries->SearchSpEff[query_index];
2131 }
2132 } else {
2133 query[0].eff_search_space = search->searchsp_eff;
2134 }
2135 if (search->mult_queries != NULL) {
2136 /* Query concatenation is in use; find each individual query
2137 within the concatenated query */
2138 Uint1 * ccat_query = /* the concatenated query */
2139 search->context[0].query->sequence;
2140 int length;
2141
2142 for (query_index = 0; query_index < numQueries; query_index++) {
2143 query[query_index].origin =
2144 search->mult_queries->QueryStarts[query_index];
2145 query[query_index].seq.data = &ccat_query[query[query_index].origin];
2146 length = search->mult_queries->QueryEnds[query_index] -
2147 query[query_index].origin + 1;
2148 query[query_index].seq.length = length;
2149 if (length > *maxLength) {
2150 *maxLength = length;
2151 }
2152 }
2153 } else {
2154 /* Query concatenation is not in use; just use the entire query */
2155 query[0].seq.data = search->context[0].query->sequence;
2156 query[0].seq.length = search->context[0].query->length;
2157 query[0].origin = 0;
2158 *maxLength = query[0].seq.length;
2159 }
2160 for (query_index = 0; query_index < numQueries; query_index++) {
2161 Blast_ReadAaComposition(&query[query_index].composition,
2162 PROTEIN_ALPHABET,
2163 query[query_index].seq.data,
2164 query[query_index].seq.length);
2165 }
2166 }
2167
2168
2169 /**
2170 * Initialize an object that contains the parameters needed to perform
2171 * a gapped alignment.
2172 *
2173 * @param gapping_params the object to be initialized
2174 * @param search holds search parameters
2175 * @param options holds BLAST command-line options
2176 * @param Lambda the statistical parameter Lambda;
2177 * gives the scale of the scoring system.
2178 */
2179 static void
Kappa_GappingParamsInit(BlastCompo_GappingParams * gapping_params,BlastSearchBlkPtr search)2180 Kappa_GappingParamsInit(BlastCompo_GappingParams * gapping_params,
2181 BlastSearchBlkPtr search)
2182 {
2183 gapping_params->gap_open = search->gap_align->gap_open;
2184 gapping_params->gap_extend = search->gap_align->gap_extend;
2185 gapping_params->decline_align = search->gap_align->decline_align;
2186 gapping_params->x_dropoff = search->gap_align->x_parameter;
2187 gapping_params->context = search;
2188 }
2189
2190
2191 /**
2192 * A callback: Free GapXEditBlock that is pointed to by a (void *).
2193 */
Kappa_FreeEditBlock(void * traceback)2194 static void Kappa_FreeEditBlock(void * traceback)
2195 {
2196 if (traceback != NULL)
2197 GapXEditBlockDelete((GapXEditBlockPtr)traceback);
2198 }
2199
2200
2201 /**
2202 * Callbacks used by Blast_RedoOneMatch and
2203 * Blast_RedoOneMatchSmithWaterman */
2204 static const Blast_RedoAlignCallbacks
2205 redo_align_callbacks = {
2206 Kappa_CalcLambda,
2207 Kappa_SequenceGetRange,
2208 Kappa_RedoOneAlignment,
2209 Kappa_NewAlignmentUsingXdrop,
2210 Kappa_FreeEditBlock
2211 };
2212
2213
2214 /** Get a set of alignment parameters, for use by Blast_RedoOneMatch or
2215 * Blast_RedoOneMatchSmithWaterman.
2216 * @param search search parameters
2217 * @param options BLAST command line options
2218 * @param compo_adjust_mode composition adjustment mode
2219 * @param localScalingFactor factor by which scores are scaled */
2220 static Blast_RedoAlignParams *
Kappa_GetAlignParams(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options,ECompoAdjustModes compo_adjust_mode,double localScalingFactor)2221 Kappa_GetAlignParams(BlastSearchBlkPtr search,
2222 BLAST_OptionsBlkPtr options,
2223 ECompoAdjustModes compo_adjust_mode,
2224 double localScalingFactor)
2225 {
2226 int ccat_query_length;
2227 int rows;
2228 int cutoff_s;
2229 double cutoff_e;
2230 BlastCompo_GappingParams * gapping_params = NULL;
2231 Blast_MatrixInfo * scaledMatrixInfo;
2232 Blast_RedoAlignParams * params;
2233 int query_is_translated = search->prog_number == blast_type_blastx;
2234 int subject_is_translated = search->prog_number == blast_type_tblastn;
2235 int do_link_hsps = search->pbp->do_sum_stats;
2236 /* Negative cutoff means that the old rules for CBS segging will be used */
2237 double near_identical_cutoff = -1.0;
2238
2239 if(do_link_hsps) {
2240 cutoff_s = (int) (search->pbp->cutoff_s1 * localScalingFactor);
2241 } else {
2242 /* There is no cutoff score; we consider e-values instead */
2243 cutoff_s = 0;
2244 }
2245 cutoff_e = search->pbp->cutoff_e;
2246 ccat_query_length = search->context[0].query->length;
2247 rows = search->positionBased ? ccat_query_length : PROTEIN_ALPHABET;
2248 if (compo_adjust_mode == eNoCompositionBasedStats) {
2249 /* We cannot compute a value for scaledMatrixInfo if
2250 * compo_adjust_mode is off. There are side effects to the
2251 * computation (specifically a call to updateLambdaK that alters
2252 * the gapped K and ungapped lambda for position-based searches)
2253 * that are undesirable and deeply embedded in the code. */
2254 scaledMatrixInfo = NULL;
2255 } else {
2256 scaledMatrixInfo = Blast_MatrixInfoNew(rows, PROTEIN_ALPHABET,
2257 search->positionBased);
2258 if (scaledMatrixInfo == NULL) {
2259 ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2260 }
2261 Kappa_MatrixInfoInit(scaledMatrixInfo, localScalingFactor, search,
2262 options->matrix);
2263 }
2264 gapping_params = malloc(sizeof(BlastCompo_GappingParams));
2265 Kappa_GappingParamsInit(gapping_params, search);
2266 params =
2267 Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
2268 compo_adjust_mode, search->positionBased,
2269 query_is_translated, subject_is_translated, ccat_query_length,
2270 cutoff_s, cutoff_e, do_link_hsps,
2271 &redo_align_callbacks, near_identical_cutoff);
2272 if (params == NULL) {
2273 ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2274 }
2275 return params;
2276 }
2277
2278
2279 /**
2280 * Convert a Blast_CompoHeap to a flat list of SeqAligns. Note that there may
2281 * be more than one alignment per element in the heap. The new list
2282 * preserves the order of the SeqAligns associated with each
2283 * HeapRecord.
2284 *
2285 * @param self a Blast_CompoHeap
2286 */
2287 static SeqAlignPtr
Kappa_CompoHeapToFlatList(BlastCompo_Heap * self)2288 Kappa_CompoHeapToFlatList(BlastCompo_Heap * self)
2289 {
2290 SeqAlignPtr list = NULL;
2291 SeqAlignPtr result;
2292
2293 while (NULL != (result = BlastCompo_HeapPop(self))) {
2294 SeqAlignPtr oldList = list;
2295 list = result;
2296 while (NULL != result->next) {
2297 result = result->next;
2298 }
2299 result->next = oldList;
2300 }
2301 return list;
2302 }
2303
2304
2305 /**
2306 * Top level routine to recompute alignments for each
2307 * match found by the gapped BLAST algorithm
2308 *
2309 * @param search is the structure with all the information about
2310 * the search
2311 * @param options is used to pass certain command line options
2312 * taken in by BLAST
2313 * @param hitlist_count is the number of old matches
2314 * @param compo_adjust_mode determines whether we are to adjust the
2315 * Karlin-Altschul parameters and score matrix
2316 * @param SmithWaterman determines whether the new local alignments
2317 * should be computed by the optimal Smith-Waterman
2318 * algorithm; SmithWaterman false means that
2319 * alignments will be recomputed by the current
2320 * X-drop algorithm as implemented in the procedure
2321 * ALIGN.
2322 * @return a array of lists of SeqAlign; each element
2323 * in the array is a list of SeqAligns for
2324 * one query in the concatenated query.
2325 * It is assumed that at least one of compo_adjust_mode and
2326 * SmithWaterman is >0 or true when this procedure is called A linked list
2327 * of alignments is returned; the alignments are sorted according to
2328 * the lowest E-value of the best alignment for each matching
2329 * sequence; alignments for the same matching sequence are in the
2330 * list consecutively regardless of the E-value of the secondary
2331 * alignments. Ties in sorted order are much rarer than for the
2332 * standard BLAST method, but are broken deterministically based on
2333 * the index of the matching sequences in the database.
2334 */
2335 SeqAlignPtr *
RedoAlignmentCore(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options,int hitlist_count,int adjustParameters,Boolean SmithWaterman)2336 RedoAlignmentCore(BlastSearchBlkPtr search,
2337 BLAST_OptionsBlkPtr options,
2338 int hitlist_count,
2339 int adjustParameters,
2340 Boolean SmithWaterman)
2341 {
2342 int status = 0; /* error status returned by routines */
2343 int match_index; /* index over matches */
2344 SeqAlignPtr * results = NULL; /* an array of lists of SeqAligns to
2345 return */
2346 Nlm_FloatHi localScalingFactor; /* the factor by which to
2347 * scale the scoring system in
2348 * order to obtain greater
2349 * precision */
2350 BLAST_Score **matrix; /* score matrix */
2351 Kappa_SearchParameters *searchParams; /* the values of the search
2352 * parameters that will be
2353 * recorded, altered in the
2354 * search structure in this
2355 * routine, and then restored
2356 * before the routine
2357 * exits. */
2358 Blast_ForbiddenRanges forbidden; /* forbidden ranges for each
2359 * database position (used in
2360 * Smith-Waterman alignments)
2361 */
2362 BlastCompo_Heap * significantMatches; /* a collection of alignments for each
2363 * query sequence with sequences from
2364 * the database */
2365 Blast_CompositionWorkspace
2366 *NRrecord = NULL; /* stores all fields needed for
2367 * computing a compositionally adjusted
2368 * score matrix using Newton's method */
2369 int query_index; /* loop index */
2370 int numQueries; /* number of queries in the
2371 concatenated query */
2372 int ccat_query_length; /* length of the concatenated query, or
2373 of the sole query if query
2374 concatenation is not in use */
2375 int maxQueryLength; /* the greatest length among all queries */
2376 BlastCompo_Alignment * incoming_aligns; /* existing alignments for a match */
2377 BlastCompo_QueryInfo * query_info = NULL;
2378 Blast_RedoAlignParams * redo_align_params;
2379 double Lambda, logK;
2380 double pvalueForThisPair = (-1); /*p-value for this match
2381 for composition; -1 == no adjustment*/
2382 double LambdaRatio; /*lambda ratio*/
2383 int compositionTestIndex = options->unified_p;
2384 /*which test function do we use to see if
2385 a composition-adjusted p-value is desired*/
2386
2387 /* the composition adjustment mode, obtained from adjustParameters
2388 (we don't make adjustParameters itself an enum so that only
2389 kappa.c depends on the header that defines ECompoAdjustModes) */
2390 ECompoAdjustModes compo_adjust_mode;
2391
2392 if (search->positionBased) {
2393 /* Can't use a compositional P-value with position based searches */
2394 compositionTestIndex = 0;
2395 }
2396 /**** Validate parameters *************/
2397 if (0 > adjustParameters || eNumCompoAdjustModes <= adjustParameters) {
2398 /* Unknown composition adjustment mode */
2399 return NULL;
2400 } else {
2401 compo_adjust_mode = (ECompoAdjustModes) adjustParameters;
2402 }
2403 if(0 == strcmp(options->matrix, "BLOSUM62_20") &&
2404 eNoCompositionBasedStats == compo_adjust_mode) {
2405 /* BLOSUM62_20 only makes sense if compo_adjust_mode is on */
2406 return NULL;
2407 }
2408 if (search->positionBased && search->sbp->posMatrix == NULL) {
2409 Char* msg = "Cannot perform position-specific search without a PSSM";
2410 BlastConstructErrorMessage("RedoAlignmentCore", msg, 3,
2411 &(search->error_return));
2412 return NULL;
2413 }
2414 if (search->positionBased && compo_adjust_mode > 1) {
2415 /* Only mode 1 (or 0) works with position-based searches, silently
2416 * change the value. */
2417 compo_adjust_mode = eCompositionBasedStats;
2418 }
2419 if (compo_adjust_mode > 1 &&
2420 !Blast_FrequencyDataIsAvailable(options->matrix)) {
2421 Char* msg = "Unsupported matrix for composition-based"
2422 " matrix adjustment";
2423 BlastConstructErrorMessage("RedoAlignmentCore", msg, 3,
2424 &(search->error_return));
2425 return NULL;
2426 }
2427 /**** End validate parameters *************/
2428
2429 ccat_query_length = search->context[0].query->length;
2430 searchParams =
2431 Kappa_SearchParametersNew(ccat_query_length, compo_adjust_mode,
2432 search->positionBased);
2433 Kappa_RecordInitialSearch(searchParams, search, compo_adjust_mode);
2434 if (compo_adjust_mode == eNoCompositionBasedStats) {
2435 localScalingFactor = 1.0;
2436 } else {
2437 localScalingFactor = SCALING_FACTOR;
2438 }
2439 if((0 == strcmp(options->matrix, "BLOSUM62_20"))) {
2440 localScalingFactor /= 10;
2441 }
2442 Kappa_RescaleSearch(search, localScalingFactor, options);
2443 if(search->positionBased) {
2444 matrix = search->sbp->posMatrix;
2445 } else {
2446 matrix = search->sbp->matrix;
2447 }
2448 redo_align_params = Kappa_GetAlignParams(search, options, compo_adjust_mode,
2449 localScalingFactor);
2450 {
2451 /* Get appropriate values for Lambda and logK */
2452 BLAST_KarlinBlkPtr kbp;
2453 if(search->positionBased) {
2454 kbp = search->sbp->kbp_gap_psi[0];
2455 } else {
2456 kbp = search->sbp->kbp_gap_std[0];
2457 }
2458 Lambda = kbp->Lambda;
2459 logK = kbp->logK;
2460 }
2461 Kappa_GetQueryInfo(&query_info, &numQueries, &maxQueryLength, search);
2462 if(SmithWaterman) {
2463 status = Blast_ForbiddenRangesInitialize(&forbidden, maxQueryLength);
2464 if (status != 0)
2465 ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2466 }
2467 significantMatches = Nlm_Calloc(numQueries, sizeof(BlastCompo_Heap));
2468 ASSERT(options->ethresh != 0.0);
2469 for (query_index = 0; query_index < numQueries; query_index++) {
2470 status =
2471 BlastCompo_HeapInitialize(&significantMatches[query_index],
2472 options->hitlist_size, options->ethresh);
2473 if (status != 0) {
2474 ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2475 }
2476 }
2477 if (compo_adjust_mode != eNoCompositionBasedStats) {
2478 NRrecord = Blast_CompositionWorkspaceNew();
2479 if (NRrecord == NULL) {
2480 ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2481 }
2482 /* We already checked that the frequency data is available for
2483 * options->matrix, so this call can't fail */
2484 (void) Blast_CompositionWorkspaceInit(NRrecord, options->matrix);
2485 }
2486 for(match_index = 0; match_index < hitlist_count; match_index++) {
2487 /* for all matching sequences */
2488 BlastCompo_MatchingSequence matchingSeq; /* the data for a matching
2489 * database sequence */
2490 BLASTResultHitlistPtr thisMatch; /* alignment data for the
2491 * current query-subject
2492 * match */
2493 BlastCompo_Alignment ** alignments; /* array of lists of
2494 * alignments for each
2495 * query to this subject */
2496 alignments = Nlm_Calloc(numQueries, sizeof(BlastCompo_Alignment *));
2497
2498 thisMatch = search->result_struct->results[match_index];
2499 if(thisMatch->hsp_array == NULL) {
2500 continue;
2501 }
2502 if (BlastCompo_EarlyTermination(thisMatch->best_evalue,
2503 significantMatches, numQueries)) {
2504 break;
2505 }
2506 /* Get the sequence for this match */
2507 Kappa_MatchingSequenceInitialize(&matchingSeq, search,
2508 thisMatch->subject_id);
2509 incoming_aligns =
2510 Kappa_ResultHspToDistinctAlign(search, thisMatch->hsp_array,
2511 thisMatch->hspcnt, localScalingFactor);
2512 if (SmithWaterman) {
2513 status =
2514 Blast_RedoOneMatchSmithWaterman(alignments, redo_align_params,
2515 incoming_aligns, thisMatch->hspcnt,
2516 Lambda, logK,
2517 &matchingSeq, query_info, numQueries,
2518 matrix, PROTEIN_ALPHABET,
2519 NRrecord, &forbidden,
2520 significantMatches,
2521 &pvalueForThisPair,
2522 compositionTestIndex, &LambdaRatio);
2523 } else {
2524 status =
2525 Blast_RedoOneMatch(alignments, redo_align_params, incoming_aligns,
2526 thisMatch->hspcnt, Lambda, &matchingSeq,
2527 ccat_query_length, query_info, numQueries,
2528 matrix, PROTEIN_ALPHABET,
2529 NRrecord, &pvalueForThisPair,
2530 compositionTestIndex, &LambdaRatio);
2531 }
2532 if (status != 0) {
2533 ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2534 }
2535 for (query_index = 0; query_index < numQueries; query_index++) {
2536 /* Loop over queries */
2537 if( alignments[query_index] != NULL) { /* alignments were found */
2538 Nlm_FloatHi bestEvalue; /* best e-value among alignments in the
2539 hitlist */
2540 int bestScore; /* best score among alignments in the
2541 hitlist */
2542 BLAST_HitListPtr hitlist; /* a hitlist containing the newly-computed
2543 * alignments */
2544 Kappa_SortedHitlistFromAligns(search, &alignments[query_index]);
2545 hitlist = search->current_hitlist;
2546 if (hitlist->hspcnt > 1 &&
2547 !(SmithWaterman && search->prog_number == blast_type_blastp)) {
2548 /* Eliminate HSPs that are contained in a higher-scoring
2549 * HSP. With blastp and SmithWaterman alignments, the
2550 * forbidden ranges rule does not allow one alignment to be
2551 * contained in another. */
2552 BLASTCheckHSPInclusion(hitlist->hsp_array, hitlist->hspcnt,
2553 FALSE);
2554 hitlist->hspcnt =
2555 HspArrayPurge(hitlist->hsp_array, hitlist->hspcnt, FALSE);
2556 }
2557 Kappa_HitlistEvaluateAndPurge(&bestScore, &bestEvalue,
2558 matchingSeq.length,
2559 search, redo_align_params->do_link_hsps,
2560 pvalueForThisPair, LambdaRatio,
2561 thisMatch->subject_id);
2562 if (bestEvalue <= search->pbp->cutoff_e &&
2563 BlastCompo_HeapWouldInsert(&significantMatches[query_index],
2564 bestEvalue, bestScore,
2565 thisMatch->subject_id)) {
2566 /* If the best alignment is significant, then create and save
2567 * a list of SeqAligns. */
2568 /* SeqAligns for this query-subject pair */
2569 SeqAlignPtr aligns;
2570 void * discardedAligns;
2571 SeqIdPtr query_id;
2572 Kappa_SequenceLocalData * local_data = matchingSeq.local_data;
2573
2574 if (search->mult_queries) {
2575 query_id = search->mult_queries->FakeBsps[query_index]->id;
2576 } else {
2577 query_id = search->query_id;
2578 }
2579 aligns =
2580 Kappa_SeqAlignsFromHitlist(hitlist, local_data->bsp_db->id,
2581 query_id,
2582 query_info[query_index].origin,
2583 query_info[query_index].seq.length,
2584 Lambda, logK, localScalingFactor);
2585 status =
2586 BlastCompo_HeapInsert(&significantMatches[query_index],
2587 aligns, bestEvalue, bestScore,
2588 thisMatch->subject_id,
2589 &discardedAligns);
2590 if (status != 0) {
2591 ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2592 }
2593 if (discardedAligns != NULL) {
2594 SeqAlignSetFree(discardedAligns);
2595 }
2596 } /* end if the best alignment is significant */
2597 } /* end if any alignments were found */
2598 } /* end loop over queries */
2599 Kappa_MatchingSequenceRelease(&matchingSeq);
2600 MemFree(alignments);
2601 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
2602 }
2603 /* end for all matching sequences */
2604 results = Nlm_Calloc(numQueries, sizeof(SeqAlignPtr));
2605 for (query_index = 0; query_index < numQueries; query_index++) {
2606 results[query_index] =
2607 Kappa_CompoHeapToFlatList(&significantMatches[query_index]);
2608 }
2609 /* Clean up */
2610 free(query_info);
2611 Blast_RedoAlignParamsFree(&redo_align_params);
2612 if (search->current_hitlist) {
2613 search->current_hitlist->hspcnt_max = search->current_hitlist->hspcnt;
2614 search->current_hitlist = BlastHitListDestruct(search->current_hitlist);
2615 }
2616 for (query_index = 0; query_index < numQueries; query_index++) {
2617 BlastCompo_HeapRelease(&significantMatches[query_index]);
2618 }
2619 MemFree(significantMatches); significantMatches = NULL;
2620 if(SmithWaterman) {
2621 Blast_ForbiddenRangesRelease(&forbidden);
2622 }
2623 Kappa_RestoreSearch(search, matrix, searchParams, compo_adjust_mode);
2624 Kappa_SearchParametersFree(&searchParams);
2625 if (NULL != NRrecord) {
2626 Blast_CompositionWorkspaceFree(&NRrecord);
2627 }
2628 return (results);
2629 }
2630