1 static char const rcsid[] = "$Id: kappa.c,v 6.94 2013/11/04 15:42:10 boratyng Exp $";
2 
3 /* $Id: kappa.c,v 6.94 2013/11/04 15:42:10 boratyng Exp $
4 *   ==========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================*/
27 
28 /*****************************************************************************
29 
30 File name: kappa.c
31 
32 Authors: Alejandro Schaffer, Mike Gertz
33 
34 Contents: Utilities for doing Smith-Waterman alignments and adjusting
35     the scoring system for each match in blastpgp
36 
37 Functions in this file call functions in the composition adjustment
38 module to perform adjustment and recalculation of alignments.  This
39 file is intended to be (thin) glue code that:
40 
41 - adapts C toolkit data structures by the composition adjustment module;
42 - calls functions in the composition adjustment module;
43 - converts the output of the composition adjustment module into the
44   format needed by the C toolkit.
45 
46 This file is very similar to the blast_kappa.c file, which is glue
47 code the performs the same function for the C++ toolkit.  The sole
48 external symbol from this file should be RedoAlignmentCore. Static
49 functions in this file have a 'Kappa_' prefix.  Please adhere to this
50 convention to avoid a name clash with functions in blast_kappa.c (the
51 name clash can matter in debuggers and search engines.)
52 
53  $Revision: 6.94 $
54 
55  Revision 6.75  2005/11/07 15:28:56  coulouri
56  From Mike Gertz:
57 
58  Record the composition mode used for each aligment; formerly the
59  composition mode was recorded for each subject sequence.  For blastp,
60  the composition is only done once per subject sequence so the former
61  behavior was correct, but it was incorrect for tblastn.
62 
63  Fixed a bug when the query frame is checked in an intermediate
64  containment test.  For tblastn, this effectively disabled the test.
65  This bug can effect tblastn alignments when SmithWaterman is not used
66  in the rare occasion that omitting the intermediate containment test
67  causes a different alignment to be found.
68 
69  Added a sort key to location_compare windows to sort first by query
70  index.  For tblastn alignments when query concatination is used, not
71  having this sort key may on rare occassion cause a different alignment
72  to be reported.
73 
74  Cast a few Nlm_FloatHi values to Int4 to suppress compiler warnings.
75 
76  Revision 6.74  2005/09/29 17:40:18  coulouri
77  from mike gertz:
78      - Many changes to support query concatenation; including changing
79        the API of RedoAlignementCore.
80      - In RedoAlignmentCore, only call BlastLinkHsps if
81        search->pbp->do_sum_stats is true.
82      - In RedoAlignmentCore, in the non-Smith-Waterman case, compute a
83        separate composition for each HSP, even if multiple HSPs are in
84        the same window.
85 
86  Revision 6.73  2005/09/13 16:23:01  coulouri
87  correct include paths for windows
88 
89  Revision 6.72  2005/09/13 14:55:06  coulouri
90  From Mike Gertz: use libblastcompadj
91 
92  Revision 6.71  2005/09/08 19:48:36  coulouri
93  From Mike Gertz:
94  Changed the behavior of a SWheap so that space for only a small number
95  (100) of hits is allocated initially, but the SWheap will grow to hold
96  as many hits as are needed.
97 
98  Revision 6.70  2005/09/08 14:01:49  coulouri
99  From Mike Gertz:
100    - Fixed a bug in which the extent of the subject match was being
101      calculated incorrectly setting forbidden ranges for a
102      Smith-Waterman alignment.
103 
104  Revision 6.69  2005/08/31 20:33:25  coulouri
105      New code to use the value of options->kappa_expect_value as the cutoff
106      e-value within RedoAlignmentCore. The code saves and restores the
107      value of search->pbp->cutoff_e.
108 
109  Revision 6.68  2005/08/30 18:20:46  coulouri
110  From Mike Gertz:
111      Changes to routines used in mode > 1 of compositional adjustment of
112      scoring matrices:
113        - Removed REscaleInitialScores as it is now redundant; rescaling
114          is now done as part of the call s_ScatterScores.
115        - Refactored Kappa_AdjustBXZ into the routines
116          Kappa_SetNonstandardAaScores and Kappa_AverageScores.
117        - Kappa_SetNonstandardAaScores sets scores for U, * and -,
118          unlike its predecessor Kappa_AdjustBXZ.
119        - Fixed a bug in the formula used to compute the score of (B,B)
120          matches.
121        - In s_RoundScoreMatrix, check whether the unrounded value is <
122          BLAST_SCORE_MIN and if so set the rounded value to
123          BLAST_SCORE_MIN; do not read the incoming value of the "matrix"
124          parameter.
125 
126      Changes to mode 1 (traditional) composition-based statistics:
127        - scaleMatrix renamed s_ScaleMatrx with slightly different
128          parameter list.
129        - s_ScaleMatrix does not scale substitution scores for the
130          nonstandard X, U, - and * characters (but does scale B and Z).
131          The previous version scaled scores for X and *.
132 
133      Made minor changes to Kappa_CompositionBasedStats and
134      Kappa_AdjustSearch to call the new routines described above.
135 
136  Revision 6.67  2005/08/05 12:04:53  coulouri
137  From Mike Gertz:
138  - Move setting gap_align->translate2 to Kappa_RecordInitialSearch;
139    for tblastn and some option settings it would not otherwise get set.
140  - Remove a now redundant setting of gap_align->translate2 in the
141    HitToGapAlign routine.
142  - Fixes to comments.
143 
144  Revision 6.66  2005/08/02 14:40:29  coulouri
145  From Mike Gertz:
146  - Fixes to comments
147  - Added enum constant eGapChar; renamed eStarChar to eStopChar.
148  - Change the integer type of some variables to suppress warnings about
149    assigning a wider type to a narrower type.
150  - Made the routines BLbasicSmithWatermanScoreOnly and BLSmithWatermanFindStart
151    static.
152  - Renamed s_ScatterFreqRatios -> s_ScatterScores; renamed parameters.
153  - In NewAlignmentUsingXdrop, use the translate2 field from gap_align
154    to set the same field in the edit block.
155  - Changed the Kappa_WindowInfo datatype to hold a list of HSPs in the
156    window; added logic in several places, notably WindowsFromHSPs, to
157    generate and maintain these lists.
158  - Refactored Kappa_AdjustSearch.  Use a more sophisticated rule,
159    implemented in the new Kappa_GetSubjectComposition routine, to
160    determine the subject sequence composition for tblastn.
161  - Removed unused parameters from several routines.
162  - In RedoAlignmentCore, delete NRrecord to avoid a memory leak.
163 
164  Revision 6.65  2005/07/28 14:34:06  coulouri
165  From Mike Gertz:
166  - Made minor fixes to whitespace and formatting.
167  - Fixed the comment to SWheap to accurately reflect recent changes in
168    the way SWheapRecordCompare is used.
169  - Changed some #define'd constants to enums or "static const"
170    variables.  Renamed these constants appropriately.
171  - Made the array alphaConvert and the routine scalePosMatrix static.
172  - Made sure that gap_align->translate2 is set for tblastn.
173  - Fixed a bug Kappa_SequenceGetTranslatedWindow: the wrong formula had
174    been used to compute num_nucleotides.
175 
176  Revision 6.64  2005/07/26 13:07:20  coulouri
177  - Changed #include "NRdefs.h" to #include <NRdefs.h> to be consistent
178    with toolbox conventions.
179 
180  - Removed the unused kScoreMatrixRange constant
181 
182  - Extended comments for some routines.  Fixed spelling and other
183    errors in comments.
184 
185  - Fixed white space and formatting in several locations.
186 
187  - Renamed functions:
188    o permuteLetterProbs -> s_GatherLetterProbs
189    o permuteFreqRatios  -> s_ScatterFreqRatios
190    o adjustBXZ -> Kappa_AdjustBXZ
191    o roundScoreMatrix -> s_RoundScoreMatrix
192 
193  - Eliminated the use of a temporary variable in REscaleInitialScores.
194 
195  - In sScatterFreqRatios, replaced the NRrecord parameter by a pointer
196    to the frequency ratio matrix contained in the record.
197 
198  Revision 6.63  2005/07/14 20:19:45  coulouri
199     - In Kappa_SequenceGetWindow, change all selenocysteine (U)
200       residues in the subject sequence to X.
201     - In scaleMatrix, do not compute the log of frequency ratios that are
202       zero, set the matrix element to BLAST_SCORE_MIN instead.
203     - Removed the startMatrix parameter to scaleMatrix, as it is no
204       longer used.
205 
206  Revision 6.61  2005/06/08 19:31:31  papadopo
207  From Michael Gertz:
208  1. The use of the score for comparing collections of alignments
209     was removed in March 2004; revert to the previous rule
210  2. Added a new field "bestScore" to the SWheapRecord structure
211  3. Various additional changes to support the use of score as a
212     key in a SWheapRecord
213  4. The comparison function is now used to determine whether new HSPs
214     may be added to the heap.  Previously, the evalue only was used
215  5. Removed a complex test that sometimes terminated computation of
216     Smith-Waterman alignments for a given subject sequence. It is more
217     consistent with all other modes of operation to omit this test
218  6. Removed the "capacity" parameter to SWheapInitialize and used the
219     heapThreshold parameter to set the capacity for the heap
220 
221  Revision 6.60  2005/05/18 21:27:33  papadopo
222  make fillResidueProbability unconditional in Kappa_AdjustSearch
223 
224  Revision 6.59  2005/05/16 18:10:13  kans
225  include Mode_condition.h to get prototype for chooseMode
226 
227  Revision 6.58  2005/05/16 17:46:48  papadopo
228  From Mike Gertz/Alejandro Schaffer: Added support for compositional
229  adjustment of score matrices, both conditionally and unconditionally.
230  The adjustParameters argument to RedoAlignmentCore is no longer Boolean
231  because there are now 4 allowed options:
232        0 no adjustment
233        1 composition-based statistics
234        2 conditional compositional adjustment
235        3 unconditional compositional adjustment
236 
237  Revision 6.57  2005/04/13 17:18:02  coulouri
238  changes to WindowsFromHSPs for tblastn composition-based statistics
239 
240  Revision 6.56  2005/01/24 15:52:54  camacho
241  doxygen fixes from Mike Gertz
242 
243  Revision 6.55  2005/01/20 16:28:59  camacho
244  doxygen fixes
245 
246  Revision 6.54  2005/01/18 14:54:13  camacho
247  Change in tie-breakers for score comparison, suggestion by Mike Gertz
248 
249  Revision 6.53  2004/11/24 15:42:33  coulouri
250  do not dereference null pointer
251 
252  Revision 6.52  2004/11/23 21:29:02  camacho
253  Changes from Mike Gertz:
254  - If no alignments are found, I no longer create an empty HSP list and
255    process it; I just move on to the next match.
256  - For tblastn, culling by containment now occurs before link_hsps as
257    it should. This can not have any effect on the current output, as
258    tblastn + RedoAlignmentCore is not enabled.
259  - I changed some Nlm_Mallocs, etc. to MemNew.
260  - Other cosmetic changes, doxygen friendly comments
261 
262  Revision 6.51  2004/11/01 20:43:48  camacho
263  Added error handling for missing PSSM
264 
265  Revision 6.50  2004/10/27 21:00:02  camacho
266  Change the order of elements in Score* returned by GetScoreSetFromBlastHsp to
267  be consistent with score ordering in other contexts of BLAST.
268 
269  Revision 6.49  2004/09/09 13:47:47  papadopo
270  from Michael Gertz: remove unnecessary check for the presence of Selenocysteine in translated sequences
271 
272  Revision 6.48  2004/08/23 19:37:38  papadopo
273  From Michael Gertz: fix memory leak in getStartFreqRatios
274 
275  Revision 6.47  2004/08/23 17:12:21  papadopo
276  From Michael Gertz: Backported some changes to
277  getStartFreqRatios and computeScaledStandardMatrix
278  from algo/blast/core/blast_kappa.c code. Also a few
279  straightforward changes to getStartFreqRatios,
280  because the code has diverged slightly
281 
282  Revision 6.46  2004/07/28 17:19:54  kans
283  HeapSort callbacks are int LIBCALLBACK for compilation on the PC
284 
285  Revision 6.45  2004/07/27 19:59:00  papadopo
286  Changes by Michael Gertz to allow RedoAlignmentCore to be used
287  for tblastn searches. The current version of the code uses two heuristics:
288  1) a heuristic that skips doing re-alignment for an HSP if the old
289     alignment of the HSP is contained in a higher-scoring HSP that
290     has already been re-aligned.
291  2) a heuristic that attempts to ensure that all alignments in the list
292     of redone alignments has distinct ends.
293  The original code also used these heuristics, but the code that
294  implements the heuristics has been rewritten. As a result,
295  RedoAlignmentCore will occasionally report better-scoring alignments.
296  This is still a heuristic; which HSPs are reported still depends on
297  the order in which they are re-aligned.
298 
299  Revision 6.44  2004/06/23 14:53:58  camacho
300  Use renamed FreqRatios structure from posit.h
301 
302  Revision 6.43  2004/06/22 14:16:56  camacho
303  Use SFreqRatios structure from algo/blast/core to obtain underlying matrices' frequency ratios.
304 
305  Revision 6.42  2004/06/18 15:50:25  madden
306  For secondary alignments with Smith-Waterman on, use the E-value from the X-drop alignment computed by ALIGN that has more flexibility in which positions are allowed in the dynamic program.
307 
308  Add a sort of alignments for the same query-subject pair because the use of X-drop alignments occasionally reorders such alignments.
309 
310  Changes from Mike Gertz, submitted by Alejandro Schaffer.
311 
312  Revision 6.41  2004/06/14 21:11:05  papadopo
313  From Michael Gertz:
314  - Added several casts where casts occur in blast_kappa.c.  These casts
315       should have no real effect; the log of blast_kappa.c indicates that
316       they suppress compiler warnings.
317  - Changed the type of one variable that holds a score from
318       Nlm_FloatHi to Int4.
319  - moved the definition Kappa_ForbiddenRanges and relevant
320       routines earlier in the file.
321  - fixed some comments.
322  - made a few (~5) changes in whitespace.
323 
324  Revision 6.40  2004/06/03 16:10:50  dondosha
325  Fix in Kappa_SearchParametersNew: allocate correct number of rows for matrices
326 
327  Revision 6.39  2004/03/31 18:12:13  papadopo
328  Mike Gertz' refactoring of RedoAlignmentCore
329 
330  Revision 6.38  2004/02/06 19:25:16  camacho
331  Alejandro Schaffer's corrections to RedoAlignmentCore pointed out by
332  Mike Gertz:
333  1. Corrected the rule for assigning newSW->isfirstAlignment
334  2. Changed upper bound on assignment to forbiddenRanges
335  3. Assigned a value to newScore earlier
336  4. Eliminated use of skipThis
337  5. Restored value of search->gapAlign at the end
338 
339  Revision 6.37  2004/01/27 20:31:52  madden
340  remove extra setting of kbp_gap
341 
342  Revision 6.36  2004/01/06 17:48:44  dondosha
343  Do not free Karlin block in RedoAlignmentCore, because its pointer is passed outside
344 
345  Revision 6.35  2003/12/01 19:15:27  madden
346  Add one byte to filteredMatchingSequence to prevent ABR/ABW
347 
348  Revision 6.34  2003/10/22 20:37:19  madden
349  Set kbp to rescaled values, use upper-case for SCALING_FACTOR define
350 
351  Revision 6.33  2003/10/02 19:59:34  kans
352  BlastGetGapAlgnTbck needed FALSE instead of NULL in two parameters - Mac compiler complaint
353 
354  Revision 6.32  2003/10/02 19:31:24  madden
355  In RedoAlignmentCore, call procedure BlastGetGapAlgnTbck instead of ALIGN
356  to redo alignments; this allows the endpoints of the alignment to change.
357  Because  BlastGetGapAlgnTbck returns a list of SeqAlign's while ALIGN
358  passes back a single-alignment, the post-processing of the redone
359  alignments is changed including the addition of the procedure
360  concatenateListOfSeqaligns.
361 
362  Revision 6.30  2003/05/30 17:25:36  coulouri
363  add rcsid
364 
365  Revision 6.29  2003/05/13 16:02:53  coulouri
366  make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
367 
368  Revision 6.28  2002/12/19 14:40:35  kans
369  changed C++-style comment to C-style
370 
371  Revision 6.27  2002/12/10 22:58:42  bealer
372  Keep mappings to sequences from readdb so that "num_ident" code does not
373  segfault with multiple databases.
374 
375  Revision 6.26  2002/11/06 20:31:14  dondosha
376  Added recalculation of the number of identities when computing seqalign from SWResults
377 
378  Revision 6.25  2002/10/16 13:33:58  madden
379  Corrected initialization of initialUngappedLambda in RedoAlignmentCore
380 
381  Revision 6.24  2002/09/03 13:55:14  kans
382  changed NULL to 0 for Mac compiler
383 
384  Revision 6.23  2002/08/29 15:47:49  camacho
385  Changed RedoAlignmentCore to work without readdb facility
386 
387  Revision 6.22  2002/08/20 15:43:08  camacho
388  Fixed memory leak in getStartFreqRatios
389 
390  Revision 6.21  2002/05/23 20:14:21  madden
391  Correction for last checkin to cover SmithWaterman type alignment
392 
393  Revision 6.20  2001/12/28 18:02:33  dondosha
394  Keep score and scoreThisAlign for each local alignment, so as to allow tie-breaking by score
395 
396  Revision 6.19  2001/07/26 12:52:25  madden
397  Fix memory leaks
398 
399  Revision 6.18  2001/07/09 15:12:47  shavirin
400  Functions BLbasicSmithWatermanScoreOnly() and BLSmithWatermanFindStart()
401  used to calculate Smith-waterman alignments on low level become external.
402 
403  Revision 6.17  2001/05/25 19:40:46  vakatov
404  Nested comment typo fixed
405 
406  Revision 6.16  2001/04/13 20:47:36  madden
407  Eliminated use of PRO_K_MULTIPLIER in adjusting E-values Added allocateStartFreqs and freeStartFreqs and getStartFreqRatios to enable the score matrix scaling to work entirely with frequency ratios and round to integers only at the very end of the scaling calculation.
408 
409  Revision 6.15  2001/03/20 15:07:34  madden
410  Fix from AS for (near) exact matches
411 
412  Revision 6.14  2001/01/04 13:48:44  madden
413  Correction for 3-parameter gapping
414 
415  Revision 6.13  2000/12/05 19:31:33  madden
416  Avoid duplicate insertion into heap when ((doThis == FALSE) && (currentState = SWPurging))
417 
418  Revision 6.12  2000/10/16 21:08:05  madden
419  segResult takes BioseqPtr as argument, produced from readdb_get_bioseq
420 
421  Revision 6.11  2000/10/13 19:32:58  madden
422  Fix for bug that caused crash
423 
424  Revision 6.10  2000/10/10 21:46:03  shavirin
425  Added support for BLOSUM50, BLOSUM90, PAM250 with -t T
426 
427  Revision 6.9  2000/10/10 19:45:51  shavirin
428  Checked for NULL hsp_array in the function RedoAlignmentCore().
429 
430  Revision 6.8  2000/08/18 21:28:24  madden
431  support for BLOSUM62_20A and BLOSUM62_20B, prevent LambdaRatio from getting above 1.0
432 
433  Revision 6.7  2000/08/09 21:10:15  shavirin
434  Added paramter discontinuous to the function newConvertSWalignsToSeqAligns()
435 
436  Revision 6.6  2000/08/08 21:45:04  shavirin
437  Removed initialization of decline_aline parameter to INT2_MAX.
438 
439  Revision 6.5  2000/08/03 22:20:10  shavirin
440  Added creation of the default posFreqs array if it is empty in
441  RedoAlignmentCore(). Fixed some memory leaks.
442 
443  Revision 6.4  2000/08/02 18:00:34  shavirin
444  Fixed memory leak in RedoAlignmentCore. Fixed a problem specific
445  to BLOSUM45 and BLOSUM80 in the procedure scalePosMatrix().
446 
447  Revision 6.3  2000/07/26 19:34:13  kans
448  removed unix-only headers
449 
450  Revision 6.2  2000/07/26 17:06:31  lewisg
451  add LIBCALLs
452 
453  Revision 6.1  2000/07/25 17:40:05  shavirin
454  Initial revision.
455 
456 
457 ******************************************************************************/
458 
459 #include<assert.h>
460 #include<string.h>
461 
462 #include <ncbi.h>
463 #include <blast.h>
464 #include <objseq.h>
465 #include <objsset.h>
466 #include <sequtil.h>
467 #include <seqport.h>
468 #include <tofasta.h>
469 #include <blastpri.h>
470 #include <txalign.h>
471 #include <simutil.h>
472 #include <posit.h>
473 #include <gapxdrop.h>
474 #include <fcntl.h>
475 #include <profiles.h>
476 
477 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
478 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
479 #include <algo/blast/composition_adjustment/composition_adjustment.h>
480 #include <algo/blast/composition_adjustment/compo_heap.h>
481 #include <algo/blast/composition_adjustment/smith_waterman.h>
482 #include <algo/blast/composition_adjustment/redo_alignment.h>
483 #include <algo/blast/composition_adjustment/unified_pvalues.h>
484 
485 
486 /** Define KAPPA_PRINT_DIAGNOSTICS to be true to turn on printing of
487  * diagnostic information from some routines.
488  *
489  * This macro is usually used as part of a C-conditional
490  * @code
491  * if (KAPPA_PRINT_DIAGNOSTICS) {
492  *     perform expensive tests
493  * }
494  * @endcode
495  *
496  * The C compiler will then validate the code that prints the
497  * diagnostics, but will almost always strip the code if
498  * KAPPA_PRINT_DIAGNOSTICS is false.
499  */
500 #ifndef KAPPA_PRINT_DIAGNOSTICS
501 #define KAPPA_PRINT_DIAGNOSTICS 0
502 #endif
503 
504 /**
505  * Create a score set from the data in an HSP.
506  *
507  * @param hsp               the HSP of interest [in]
508  * @param logK              Karlin-Altschul statistical parameter [in]
509  * @param lambda            Karlin-Altschul statistical parameter [in]
510  * @param scoreDivisor      the value by which reported scores are to be
511  *                          divided [in]
512  */
513 static ScorePtr
Kappa_GetScoreSetFromBlastHsp(const BLAST_HSPPtr hsp,Nlm_FloatHi lambda,Nlm_FloatHi logK,Nlm_FloatHi scoreDivisor)514 Kappa_GetScoreSetFromBlastHsp(
515   const BLAST_HSPPtr hsp,
516   Nlm_FloatHi lambda,
517   Nlm_FloatHi logK,
518   Nlm_FloatHi scoreDivisor)
519 {
520   ScorePtr      score_set = NULL;       /* the new score set */
521   int          score;          /* the score, scaled using scoreDivisor */
522   Nlm_FloatHi   bit_score;      /* the integer-valued score, in bits */
523   Nlm_FloatHi   evalue;         /* the e-value, with numbers too close to zero
524                                    set to zero */
525 
526   score = Nlm_Nint(((Nlm_FloatHi) hsp->score) / scoreDivisor);
527   if (score > 0) {
528     MakeBlastScore(&score_set, "score", 0.0, score);
529   }
530 
531   evalue = (hsp->evalue >= 1.0e-180) ? hsp->evalue : 0;
532   if (hsp->num > 1) {
533     MakeBlastScore(&score_set, "sum_n", 0.0, hsp->num);
534     MakeBlastScore(&score_set, "sum_e", evalue, 0);
535     if( hsp->ordering_method == BLAST_SMALL_GAPS) {
536       MakeBlastScore(&score_set, "small_gap", 0.0, 1);
537     }
538   } else {
539     MakeBlastScore(&score_set, "e_value", evalue, 0);
540   }
541 
542   /* Compute the bit score using the newly computed scaled score. */
543   bit_score = (score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
544   if (bit_score >= 0.) {
545     MakeBlastScore(&score_set, "bit_score", bit_score, 0);
546   }
547 
548   if (hsp->ordering_method > 3) {
549     /* In new tblastn this means splice junction was found */
550     MakeBlastScore(&score_set, "splice_junction", 0.0, 1);
551   }
552 
553   if (hsp->num_ident > 0) {
554     MakeBlastScore(&score_set, "num_ident", 0.0, hsp->num_ident);
555   }
556 
557   MakeBlastScore(&score_set, "comp_adjustment_method",0.0,
558                  hsp->comp_adjustment_method);
559   return score_set;
560 }
561 
562 
563 /**
564  * Converts a collection of alignments represented by a BLAST_HitList
565  * into a collection of instances of SeqAlign. Returns the new
566  * collection.
567  *
568  * @param hitlist           the hitlist to convert to SeqAligns [in]
569  * @param subject_id        the identifier of the subject sequence [in]
570  * @param query_id          the identifier of the query sequence [in]
571  * @param queryOrigin       origin of this specific query in the
572  *                          concatenated query [in]
573  * @param queryLength       length of this specific query [in]
574  * @param lambda            Karlin-Altschul statistical parameter [in]
575  * @param logK              Karlin-Altschul statistical parameter [in]
576  * @param scoreDivisor      value by which reported scores are to be
577  *                          divided [in]
578  * @param bestEvalue)       the best (smallest) e-value of all the alignments
579  *                          in the hitlist [out]
580  */
581 static SeqAlignPtr
Kappa_SeqAlignsFromHitlist(BLAST_HitListPtr hitlist,SeqIdPtr subject_id,SeqIdPtr query_id,int queryOrigin,int queryLength,Nlm_FloatHi lambda,Nlm_FloatHi logK,Nlm_FloatHi scoreDivisor)582 Kappa_SeqAlignsFromHitlist(
583   BLAST_HitListPtr hitlist,
584   SeqIdPtr subject_id,
585   SeqIdPtr query_id,
586   int queryOrigin,
587   int queryLength,
588   Nlm_FloatHi lambda,
589   Nlm_FloatHi logK,
590   Nlm_FloatHi scoreDivisor)
591 {
592   SeqAlignPtr aligns = NULL;  /* list of SeqAligns to be returned */
593   int        hsp_index;
594 
595   for( hsp_index = hitlist->hspcnt - 1; hsp_index >= 0; hsp_index-- ) {
596     /* iterate in reverse order over all HSPs in the hitlist */
597     BLAST_HSPPtr hsp;           /* HSP for this iteration */
598     SeqAlignPtr seqAlign;       /* the new SeqAlign */
599 
600     hsp      = hitlist->hsp_array[hsp_index];
601 
602     /* Shift the edit block to query local coordinates */
603     hsp->gap_info->start1           -= queryOrigin;
604     hsp->gap_info->length1          -= queryOrigin;
605     hsp->gap_info->original_length1  = queryLength;
606 
607     seqAlign = GapXEditBlockToSeqAlign(hsp->gap_info, subject_id, query_id);
608     seqAlign->score =
609         Kappa_GetScoreSetFromBlastHsp(hsp, lambda, logK, scoreDivisor);
610 
611     seqAlign->next  = aligns;
612     aligns          = seqAlign;
613   }
614   return aligns;
615 }
616 
617 
618 /**
619  * Adjusts the E-values in a BLAST_HitList to be composites of
620  * a composition-based P-value and a score/alignment-based P-value
621  *
622  * @param hitlist           the hitlist whose E-values need to be adjusted
623  * @param comp_p_value      P-value from sequence composition
624  * @param search            is the structure with all the information about
625  *                          the search
626  * @param LambdaRatio       the ratio between the observed value of Lambda
627  *                          and the predicted value of lambda (used to print
628  *                          diagnostics)
629  * @param subject_id        the subject id of this sequence (used to print
630  *                          diagnostics)
631  **/
632 static void
Kappa_AdjustEvaluesForComposition(BLAST_HitListPtr hitlist,Nlm_FloatHi comp_p_value,BlastSearchBlkPtr search,Nlm_FloatHi LambdaRatio,int subject_id)633 Kappa_AdjustEvaluesForComposition(
634   BLAST_HitListPtr hitlist,
635   Nlm_FloatHi comp_p_value,
636   BlastSearchBlkPtr search,
637   Nlm_FloatHi LambdaRatio,
638   int subject_id)
639 {
640   int        hsp_index;
641 
642   for (hsp_index = 0;  hsp_index < hitlist->hspcnt;  hsp_index++) {
643     /* for all HSPs */
644     BLAST_HSPPtr hsp;         /* HSP for this iteration */
645     double align_p_value;     /* P-value for the alignment score */
646     double combined_p_value;  /* combination of two P-values */
647     double old_e_value;       /* alignment E-value */
648     double db_to_sequence_scale;  /* scale factor for converting a database
649                                      e-value to a sequence e-value */
650 
651     hsp = hitlist->hsp_array[hsp_index];
652     old_e_value = hsp->evalue;
653     /* Convert the database E-value to the sequence E-value */
654     db_to_sequence_scale =
655       (MAX((search->subject->length - search->length_adjustment), 1.0))/
656       search->dblen_eff;
657     hsp->evalue *= db_to_sequence_scale;
658 
659     align_p_value = BlastKarlinEtoP(hsp->evalue);
660     combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
661     hsp->evalue = BlastKarlinPtoE(combined_p_value);
662     hsp->evalue /= db_to_sequence_scale;
663 
664     if (KAPPA_PRINT_DIAGNOSTICS) {
665       int    sequence_gi; /*GI of a sequence*/
666       int    context_index; /* query context index of this HSP */
667       context_index = hsp->context;
668 
669       if (search->rdfp) {
670         SeqIdPtr sip; /*used to extract sequence from database*/
671         char    *sname = NULL; /*string for defline*/
672         char    *buff = NULL; /*string for defline*/
673         char    *string_descriptor = NULL; /*string for defline*/
674 
675         sname = (char *) malloc(SeqIdBufferSize * sizeof(Char));
676         buff = (char *) malloc(SeqIdBufferSize * sizeof(Char));
677         string_descriptor =
678           (char *) malloc(SeqIdBufferSize * sizeof(Char));
679         sip = NULL;
680         readdb_get_defline(search->rdfp,subject_id,&sname);
681         readdb_get_descriptor(search->rdfp, subject_id, &sip,
682                               &string_descriptor);
683         if (sip)
684           SeqIdWrite(sip, buff, PRINTID_FASTA_LONG, sizeof(buff));
685         (void) GetAccessionFromSeqId(SeqIdFindBest(sip,SEQID_GI),
686                                      &sequence_gi, &sname);
687         sip = SeqIdSetFree(sip);
688         free(string_descriptor);
689         free(buff);
690         free(sname);
691       } else {
692         sequence_gi = (-1);
693       }
694       printf("GI %d Lambda ratio %e comp. p-value %e; "
695              "adjust E-value of query length %d match length "
696              "%d from %e to %e\n",
697              sequence_gi, LambdaRatio, comp_p_value,
698              search->context[context_index].query->length,
699              search->subject->length, old_e_value, hsp->evalue);
700     } /* end if (KAPPA_PRINT_DIAGNOSTICS) */
701   } /* end for all HSPs */
702 }
703 
704 /**
705  * Converts a list of objects of type BlastCompo_Alignment to an
706  * new object of type BLAST_HitList and returns the result. Conversion
707  * in this direction is lossless.  The list passed to this routine is
708  * freed to ensure that there is no aliasing of fields between the
709  * list of BlastCompo_Alignments and the new hitlist.
710  *
711  * @param search            general search information
712  * @param alignments        a list of distinct alignments
713  */
714 static BLAST_HitListPtr
Kappa_SortedHitlistFromAligns(BlastSearchBlkPtr search,BlastCompo_Alignment ** alignments)715 Kappa_SortedHitlistFromAligns(
716   BlastSearchBlkPtr search,
717   BlastCompo_Alignment ** alignments)
718 {
719   /* The context of the query is always zero in current
720    * implementations of RedoAlignmentCore. */
721   const Int2 context = 0;
722 
723   BLAST_HitListPtr hitlist;             /* the new hitlist */
724   BlastCompo_Alignment * align;      /* represents the current
725                                          * alignment in loops */
726   if(search->current_hitlist != NULL) {
727     search->current_hitlist->hspcnt_max = search->current_hitlist->hspcnt;
728     BlastHitListPurge(search->current_hitlist);
729   } else {
730     search->current_hitlist =  BlastHitListNew(search);
731   }
732   hitlist = search->current_hitlist;
733   hitlist->do_not_reallocate = FALSE;
734 
735   for(align = *alignments; align != NULL; align = align->next) {
736     BLAST_HSPPtr hsp;           /* the new HSP for this alignment */
737     /* queryExtent and matchExtent represent the extent of the
738        alignment in the query and subject sequences respectively. */
739     int queryExtent = align->queryEnd - align->queryStart;
740     int matchExtent = align->matchEnd - align->matchStart;
741 
742     BlastSaveCurrentHsp(search, align->score, align->queryStart,
743                         align->matchStart, matchExtent, context);
744 
745     hsp = hitlist->hsp_array[hitlist->hspcnt - 1];
746 
747     hsp->query.length   = queryExtent;
748     hsp->query.end      = align->queryEnd;
749 
750     hsp->subject.length = matchExtent;
751     hsp->subject.end    = align->matchEnd;
752 
753     hsp->subject.frame  = (Int2) align->frame;
754     hsp->evalue         = 0.0;  /* E-values are computed after the full
755                                    hitlist has been created. */
756     hsp->gap_info       = (GapXEditBlockPtr) align->context;
757     switch (align->matrix_adjust_rule) {
758     case eDontAdjustMatrix:
759         hsp->comp_adjustment_method = eNoCompositionBasedStats;
760       break;
761     case eCompoScaleOldMatrix:
762         hsp->comp_adjustment_method = eCompositionBasedStats;
763       break;
764     default:
765         hsp->comp_adjustment_method = eCompositionMatrixAdjust;
766       break;
767     }
768     /* Break the aliasing between align->context and hsp->gap_info. */
769     align->context = NULL;
770   }
771   BlastCompo_AlignmentsFree(alignments, NULL);
772   if (hitlist->hspcnt > 0) {
773     HeapSort(hitlist->hsp_array, hitlist->hspcnt, sizeof(BLAST_HSPPtr),
774              score_compare_hsps);
775   }
776   return hitlist;
777 }
778 
779 
780 /**
781  * Calculate expect values for a list of alignments and remove those
782  * that don't meet an e-value cutoff from the list.
783  *
784  * @param *pbestScore      the largest score observed in the list
785  * @param *pbestEvalue     the best (smallest) e-value observed in the
786  *                         list
787  * @param full_subject_length   the untranslated length of the subject
788  *                              sequence
789  * @param search           contains search parameters used to compute
790  *                         e-values and the e-value cutoff
791  * @param do_link_hsps     use hsp linking when computing e-values
792  * @param pvalueForThisPair  composition p-value
793  * @param LambdaRatio        lambda ratio, if available
794  * @param subject_id         index of subject
795  */
796 static void
Kappa_HitlistEvaluateAndPurge(int * pbestScore,double * pbestEvalue,int full_subject_length,BlastSearchBlkPtr search,int do_link_hsps,double pvalueForThisPair,double LambdaRatio,int subject_id)797 Kappa_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
798                              int full_subject_length,
799                              BlastSearchBlkPtr search,
800                              int do_link_hsps,
801                              double pvalueForThisPair,
802                              double LambdaRatio,
803                              int subject_id)
804 {
805   BLAST_HitListPtr hitlist; /* a hitlist containing the newly-computed
806                                    * alignments */
807   Nlm_FloatHi bestEvalue;   /* best e-value among alignments in the
808                                      hitlist */
809   int bestScore;           /* best score among alignments in the
810                                hitlist */
811   int hsp_index;           /* index of the current HSP */
812 
813   hitlist = search->current_hitlist;
814   search->subject->length  = full_subject_length;
815   if (do_link_hsps) {
816     BlastLinkHsps(search);
817   } else {
818     BlastGetNonSumStatsEvalue(search);
819   }
820   if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1))
821     Kappa_AdjustEvaluesForComposition(hitlist, pvalueForThisPair,
822                                       search, LambdaRatio, subject_id);
823   BlastReapHitlistByEvalue(search);
824   /* Find the e-value of the best alignment in the list -- the list
825    * is typically sorted by score and not by e-value, so a search is
826    * necessary. */
827   bestEvalue = DBL_MAX;
828   bestScore  = 0;
829   for (hsp_index = 0;  hsp_index < hitlist->hspcnt;  hsp_index++) {
830     if( hitlist->hsp_array[hsp_index]->evalue < bestEvalue ) {
831       bestEvalue = hitlist->hsp_array[hsp_index]->evalue;
832     }
833     if( hitlist->hsp_array[hsp_index]->score > bestScore ) {
834       bestScore = hitlist->hsp_array[hsp_index]->score;
835     }
836   }
837   *pbestScore  = bestScore;
838   *pbestEvalue = bestEvalue;
839 }
840 
841 
842 /**
843  * A callback routine: compute lambda for the given score frequencies.
844  * (@sa calc_lambda_type).
845  */
846 static double
Kappa_CalcLambda(double probs[],int min_score,int max_score,double lambda0)847 Kappa_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
848 {
849   int i, n;
850   double avg;
851   BLAST_ScoreFreq freq;
852 
853   n = max_score - min_score + 1;
854   avg = 0.0;
855   for (i = 0;  i < n;  i++) {
856     avg += (min_score + i) * probs[i];
857   }
858   freq.obs_min = min_score;
859   freq.obs_max = max_score;
860   freq.sprob = &probs[-min_score];
861   freq.score_avg = avg;
862 
863   return impalaKarlinLambdaNR(&freq, lambda0);
864 }
865 
866 
867 /**
868  * Get a matrix of the frequency ratios that underlie the score
869  * matrix being used on this pass. The returned matrix is
870  * position-specific, so if we are in the first pass, use query to
871  * convert the 20x20 standard matrix into a position-specific
872  * variant. matrixName is the name of the underlying 20x20 score
873  * matrix used. numPositions is the length of the query;
874  * startNumerator is the matrix of frequency ratios as stored in
875  * posit.h. It needs to be divided by the frequency of the second
876  * character to get the intended ratio
877  *
878  * @param returnRatios      an allocated matrix to hold the frequency
879  *                          ratios [out]
880  * @param search            general search information [in]
881  * @param query             the query sequence [in]
882  * @param matrixName        name of the underlying matrix [in]
883  * @param startNumerator    matrix of frequency ratios [in]
884  * @param numPositions      length of the query [in]
885  * @param positionSpecific  is this a position-specific search? [in]
886  */
887 static void
Kappa_GetStartFreqRatios(Nlm_FloatHi ** returnRatios,BlastSearchBlkPtr search,Uint1Ptr query,const char * matrixName,Nlm_FloatHi ** startNumerator,int numPositions,Boolean positionSpecific)888 Kappa_GetStartFreqRatios(Nlm_FloatHi ** returnRatios,
889                    BlastSearchBlkPtr search,
890                    Uint1Ptr query,
891                    const char *matrixName,
892                    Nlm_FloatHi **startNumerator,
893                    int numPositions,
894                    Boolean positionSpecific)
895 {
896    int i,j;
897    FreqRatios * stdFreqRatios = NULL;
898    /* a small cutoff used to determine whether it is necessary
899     * to reverse the multiplication done in posit.c */
900    const double posEpsilon = 0.0001;
901 
902    stdFreqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
903    if (positionSpecific) {
904     for(i = 0; i < numPositions; i++) {
905        for(j = 0; j < PROTEIN_ALPHABET; j++) {
906          returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
907        }
908      }
909    } else {
910      for (i = 0; i < PROTEIN_ALPHABET; i++) {
911        for (j = 0; j < PROTEIN_ALPHABET; j++) {
912          returnRatios[i][j] = stdFreqRatios->data[i][j];
913        }
914      }
915    }
916    stdFreqRatios = PSIMatrixFrequencyRatiosFree(stdFreqRatios);
917 
918    if (positionSpecific) {
919      Nlm_FloatHi *standardProb; /*probabilities of each letter*/
920      BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
921 
922      stdrfp = BlastResFreqNew(search->sbp);
923      BlastResFreqStdComp(search->sbp,stdrfp);
924      standardProb = &stdrfp->prob[0];
925 
926      /*reverse multiplication done in posit.c*/
927      for(i = 0; i < numPositions; i++) {
928        for(j = 0; j < PROTEIN_ALPHABET; j++) {
929          if ((standardProb[query[i]] > posEpsilon) &&
930              (standardProb[j] > posEpsilon) &&
931              (j != eStopChar) && (j != Xchar) &&
932              (startNumerator[i][j] > posEpsilon))
933          {
934            returnRatios[i][j] = startNumerator[i][j]/standardProb[j];
935          }
936        }
937      }
938      stdrfp = BlastResFreqDestruct(stdrfp);
939    }
940 }
941 
942 
943 /** SCALING_FACTOR is a multiplicative factor used to get more bits of
944  * precision in the integer matrix scores. It cannot be arbitrarily
945  * large because we do not want total alignment scores to exceed
946  * -(INT4_MIN) */
947 #define SCALING_FACTOR 32
948 
949 
950 /**
951  * produce a scaled-up version of the position-specific matrix
952  * starting from posFreqs
953  *
954  * @param fillPosMatrix     is the matrix to be filled
955  * @param nonposMatrix      is the underlying position-independent matrix,
956  *                          used to fill positions where frequencies are
957  *                          irrelevant
958  * @param matrixName        name of the position-independent matrix
959  * @param posFreq           frequency ratios for the position-dependent
960  *                          matrix
961  * @param query             query sequence data
962  * @param queryLength       length of the query
963  * @param sbp               stores various parameters of the search
964  */
965 static void
Kappa_ScalePosMatrix(BLAST_Score ** fillPosMatrix,BLAST_Score ** nonposMatrix,const Char * matrixName,Nlm_FloatHi ** posFreqs,Uint1 * query,int queryLength,BLAST_ScoreBlkPtr sbp,Nlm_FloatHi localScalingFactor)966 Kappa_ScalePosMatrix(BLAST_Score **fillPosMatrix, BLAST_Score **nonposMatrix,
967                      const Char *matrixName, Nlm_FloatHi **posFreqs,
968                      Uint1 *query, int queryLength, BLAST_ScoreBlkPtr sbp,
969                      Nlm_FloatHi localScalingFactor)
970 {
971 
972      posSearchItems *posSearch; /*used to pass data into scaling routines*/
973      compactSearchItems *compactSearch; /*used to pass data into scaling routines*/
974      int i; /*loop index*/
975      BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
976      int a; /*index over characters*/
977 
978 
979      posSearch = (posSearchItems *) MemNew (1 * sizeof(posSearchItems));
980      compactSearch = (compactSearchItems *) MemNew (1 * sizeof(compactSearchItems));
981      posSearch->posMatrix = (BLAST_Score **) MemNew((queryLength + 1) * sizeof(BLAST_Score *));
982      posSearch->posPrivateMatrix = fillPosMatrix;
983      posSearch->posFreqs = posFreqs;
984      posSearch->stdFreqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
985      for(i = 0; i <= queryLength; i++)
986        posSearch->posMatrix[i] = (BLAST_Score *) MemNew(PROTEIN_ALPHABET * sizeof(BLAST_Score));
987 
988      compactSearch->query = (Uint1Ptr) query;
989      compactSearch->qlength = queryLength;
990      compactSearch->alphabetSize = PROTEIN_ALPHABET;
991      compactSearch->gapped_calculation = TRUE;
992      compactSearch->matrix = nonposMatrix;
993      compactSearch->lambda =  sbp->kbp_gap_std[0]->Lambda;
994      compactSearch->kbp_std = sbp->kbp_std;
995      compactSearch->kbp_psi = sbp->kbp_psi;
996      compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
997      compactSearch->kbp_gap_std = sbp->kbp_gap_std;
998      compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
999      compactSearch->K_ideal = sbp->kbp_ideal->K;
1000 
1001      stdrfp = BlastResFreqNew(sbp);
1002      BlastResFreqStdComp(sbp,stdrfp);
1003      compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
1004      for(a = 0; a < compactSearch->alphabetSize; a++)
1005        compactSearch->standardProb[a] = stdrfp->prob[a];
1006      stdrfp = BlastResFreqDestruct(stdrfp);
1007 
1008      posFreqsToMatrix(posSearch,compactSearch);
1009      impalaScaling(posSearch, compactSearch, localScalingFactor, FALSE);
1010 
1011      for(i = 0; i <= queryLength; i++)
1012        MemFree(posSearch->posMatrix[i]);
1013 
1014      MemFree(compactSearch->standardProb);
1015      MemFree(posSearch->posMatrix);
1016      PSIMatrixFrequencyRatiosFree(posSearch->stdFreqRatios);
1017      MemFree(posSearch);
1018      MemFree(compactSearch);
1019 }
1020 
1021 
1022 /** Convert an array of HSPs into a list of BlastCompo_Alignment
1023  * objects.
1024  *
1025  * @param search              search parameters
1026  * @param hsp_array           existing array of HSPs
1027  * @param hspcnt              length of hsp_array
1028  * @param localScalingFactor  factor by which scores are scaled in the
1029  *                            new list of alignments
1030  * @return                    the new list of alignments.
1031  */
1032 static BlastCompo_Alignment *
Kappa_ResultHspToDistinctAlign(BlastSearchBlkPtr search,BLASTResultHsp hsp_array[],int hspcnt,double localScalingFactor)1033 Kappa_ResultHspToDistinctAlign(BlastSearchBlkPtr search,
1034                                BLASTResultHsp hsp_array[], int hspcnt,
1035                                double localScalingFactor)
1036 {
1037   BlastCompo_Alignment *aligns;  /* list of alignments to be returned */
1038   BlastCompo_Alignment **tail;   /* next location to store a pointer to
1039                                        a new alignment */
1040   int hsp_index;                    /* loop index */
1041 
1042   aligns = NULL;
1043   tail = &aligns;
1044   for (hsp_index = 0;  hsp_index < hspcnt;  hsp_index++) {
1045     BlastCompo_Alignment *new_align;  /* a new alignment */
1046     int queryIndex, queryEnd, matchEnd;
1047     BLASTResultHsp * hsp = &hsp_array[hsp_index];
1048     queryEnd = hsp->query_offset + hsp->query_length;
1049     matchEnd = hsp->subject_offset + hsp->subject_length;
1050     if(search->mult_queries != NULL) {
1051       queryIndex = GetQueryNum(search->mult_queries,
1052                                hsp->query_offset, queryEnd - 1, 0);
1053     } else {
1054       queryIndex = 0;
1055     }
1056     new_align =
1057       BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
1058                                  eDontAdjustMatrix,
1059                                  hsp->query_offset, queryEnd,
1060                                  queryIndex, hsp->subject_offset,
1061                                  matchEnd, hsp->subject_frame, hsp);
1062     if (new_align == NULL) {
1063       ErrPostEx(SEV_FATAL, E_NoMemory, 0,
1064                 "Failed to allocate a new alignment");
1065     }
1066     *tail = new_align;
1067     tail = &new_align->next;
1068   }
1069   return aligns;
1070 }
1071 
1072 
1073 /**
1074  * Redo a S-W alignment using an x-drop alignment.  The result will
1075  * usually be the same as the S-W alignment. The call to ALIGN
1076  * attempts to force the endpoints of the alignment to match the
1077  * optimal endpoints determined by the Smith-Waterman algorithm.
1078  * ALIGN is used, so that if the data structures for storing BLAST
1079  * alignments are changed, the code will not break
1080  *
1081  * @param query         the query data
1082  * @param queryStart    start of the alignment in the query sequence
1083  * @param queryEnd      end of the alignment in the query sequence,
1084  *                      as computed by the Smith-Waterman algorithm
1085  * @param subject       the subject (database) sequence
1086  * @param matchStart    start of the alignment in the subject sequence
1087  * @param matchEnd      end of the alignment in the query sequence,
1088  *                      as computed by the Smith-Waterman algorithm
1089  * @param frame         translation frame of the database sequence (0
1090  *                      if not translated)
1091  * @param gap_align     parameters for a gapped alignment
1092  * @param score         score computed by the Smith-Waterman algorithm
1093  */
1094 static void
Kappa_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,int queryStart,int queryEnd,BlastCompo_SequenceData * subject,int matchStart,int matchEnd,int frame,GapAlignBlkPtr gap_align,int score)1095 Kappa_SWFindFinalEndsUsingXdrop(
1096   BlastCompo_SequenceData * query,
1097   int queryStart,
1098   int queryEnd,
1099   BlastCompo_SequenceData * subject,
1100   int matchStart,
1101   int matchEnd,
1102   int frame,
1103   GapAlignBlkPtr gap_align,
1104   int score)
1105 {
1106   int XdropAlignScore;         /* alignment score obtained using X-dropoff
1107                                  * method rather than Smith-Waterman */
1108   int doublingCount = 0;       /* number of times X-dropoff had to be
1109                                  * doubled */
1110   int *alignScript, *dummy;    /* the alignment script that will be
1111                                    generated below by the ALIGN
1112                                    routine. */
1113   GapXEditBlockPtr editBlock;   /* traceback info for this alignment */
1114   /* Extent of the alignment as computed by an x-drop alignment
1115    * (usually the same as (queryEnd - queryStart) and (matchEnd -
1116    * matchStart)) */
1117   int queryExtent, matchExtent;
1118 
1119   gap_align->query_start = queryStart;
1120   gap_align->subject_start = matchStart;
1121   do {
1122     alignScript =
1123       (int *) MemNew((subject->length + query->length + 3) * sizeof(int));
1124 
1125     XdropAlignScore =
1126       ALIGN(&query->data[queryStart - 1], &subject->data[matchStart - 1],
1127             queryEnd - queryStart + 1, matchEnd - matchStart + 1,
1128             alignScript, &queryExtent, &matchExtent, &dummy,
1129             gap_align, queryStart - 1, FALSE);
1130     gap_align->score = XdropAlignScore;
1131     gap_align->query_stop = gap_align->query_start + queryExtent - 1;
1132     gap_align->subject_stop = gap_align->subject_start + matchExtent - 1;
1133 
1134     gap_align->x_parameter *= 2;
1135     doublingCount++;
1136     if((XdropAlignScore < score) && (doublingCount < 3)) {
1137       MemFree(alignScript);
1138     }
1139   } while((XdropAlignScore < score) && (doublingCount < 3));
1140   editBlock =
1141     TracebackToGapXEditBlock(NULL, NULL, queryExtent, matchExtent,
1142                              alignScript, queryStart, matchStart);
1143   alignScript = MemFree(alignScript);
1144   editBlock->length1          = query->length;
1145   editBlock->length2          = subject->length;
1146   editBlock->discontinuous    = gap_align->discontinuous;
1147   editBlock->translate2       = gap_align->translate2;
1148   editBlock->frame2           = (Int2) frame;
1149 
1150   gap_align->edit_block = editBlock;
1151 }
1152 
1153 
1154 typedef struct Kappa_SequenceLocalData {
1155   Uint1         prog_number;    /**< identifies the type of blast search being
1156                                      performed. The type of search determines
1157                                      how sequence data should be obtained. */
1158   CharPtr       genetic_code;   /**< genetic code for translated searches */
1159   BioseqPtr     bsp_db;         /**< An object that represents this sequence
1160                                    in low level routines. */
1161   Boolean       bioseq_needs_unlock;    /**< true if the bsp_db must be
1162                                            disposed of by a call to
1163                                            BioseqUnlock, rather than
1164                                            BioseqFree */
1165   ReadDBFILEPtr rdfp;           /**< A pointer to a database from which
1166                                      sequences may be obtained */
1167 } Kappa_SequenceLocalData;
1168 
1169 
1170 /**
1171  * Initialize a new matching sequence, obtaining information about the
1172  * sequence from the search.
1173  *
1174  * @param self              object to be initialized
1175  * @param search            search information
1176  * @param subject_index     index of the matching sequence in the database
1177  */
1178 static void
Kappa_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,BlastSearchBlkPtr search,int subject_index)1179 Kappa_MatchingSequenceInitialize(
1180   BlastCompo_MatchingSequence * self,
1181   BlastSearchBlkPtr search,
1182   int subject_index)
1183 {
1184   Kappa_SequenceLocalData * local_data;
1185 
1186   local_data = self->local_data = MemNew(sizeof(Kappa_SequenceLocalData));
1187   self->index = subject_index;
1188 
1189   local_data->prog_number  = search->prog_number;
1190   local_data->rdfp         = search->rdfp;
1191   if(local_data->prog_number ==  blast_type_tblastn) {
1192     local_data ->genetic_code = search->db_genetic_code;
1193   } else {
1194     /* the sequence will not be translated */
1195     local_data ->genetic_code = NULL;
1196   }
1197   if(local_data->rdfp) {
1198     local_data->rdfp->parameters |= READDB_KEEP_HDR_AND_SEQ;
1199     local_data->bsp_db = readdb_get_bioseq(local_data->rdfp, self->index );
1200     local_data->bioseq_needs_unlock = FALSE;
1201   } else {
1202     local_data->bsp_db = BioseqLockById(search->subject_info->sip);
1203     local_data->bioseq_needs_unlock = TRUE;
1204   }
1205   self->length = local_data->bsp_db->length;
1206 }
1207 
1208 
1209 /** Release the resources associated with a matching sequence. */
1210 static void
Kappa_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)1211 Kappa_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
1212 {
1213   Kappa_SequenceLocalData * local_data = self->local_data;
1214   if(local_data->bsp_db) {
1215     if(local_data->bioseq_needs_unlock) {
1216       BioseqUnlock(local_data->bsp_db);
1217       local_data->bsp_db = NULL;
1218     } else {
1219       local_data->bsp_db = BioseqFree(local_data->bsp_db);
1220     }
1221   }
1222   MemFree(local_data);
1223   self->local_data = NULL;
1224 }
1225 
1226 #define MINUMUM_FRACTION_NEAR_IDENTICAL 0.98
1227 
1228 /**
1229  * Test whether the aligned parts of two sequences that
1230  * have a high-scoring gapless alignment are nearly identical
1231  *
1232  * @param seqData           subject sequence
1233  * @param queryData         query sequence
1234  * @param queryOffset   offset for align if there are multiple queries
1235  * @param align             information about the alignment
1236  * @param rangeOffset       offset for subject sequence (used for tblastn)
1237  *
1238  * @return                  TRUE if the aligned portions are nearly identical
1239  */
testNearIdentical(const BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * queryData,const int queryOffset,const BlastCompo_Alignment * align,const int rangeOffset)1240 static Boolean testNearIdentical(const BlastCompo_SequenceData *seqData,
1241 				 const BlastCompo_SequenceData *queryData,
1242 				 const int queryOffset,
1243 				 const BlastCompo_Alignment *align,
1244 				 const int rangeOffset)
1245 {
1246   int numIdentical = 0;
1247   double fractionIdentical;
1248   int qPos, sPos; /*positions in query and subject;*/
1249   int qEnd; /*end of query*/
1250 
1251   qPos = align->queryStart - queryOffset;
1252   qEnd = align->queryEnd - queryOffset;
1253   sPos = align->matchStart - rangeOffset;
1254   while (qPos < qEnd)  {
1255       if (queryData->data[qPos] == seqData->data[sPos])
1256 	numIdentical++;
1257       sPos++;
1258       qPos++;
1259   }
1260   fractionIdentical = ((double) numIdentical/
1261   (double) (align->queryEnd - align->queryStart));
1262   if (fractionIdentical >= MINUMUM_FRACTION_NEAR_IDENTICAL)
1263     return(TRUE);
1264   else
1265     return(FALSE);
1266 }
1267 
1268 
1269 /** Character to use for masked residues */
1270 #define BLASTP_MASK_RESIDUE eXchar
1271 /** Default instructions and mask residue for SEG filtering */
1272 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
1273 
1274 
1275 /**
1276  * Obtain a string of translated data
1277  *
1278  * @param self          the sequence from which to obtain the data [in]
1279  * @param range         the range, in amino acid coordinates, of data
1280  *                      to get; includes the translation frame [in]
1281  * @param seqData       the resulting data [out]
1282  * @param q_range       the range, in amino acid coordinates, of data
1283  *                      to get; includes the translation frame [in]
1284  * @param queryData     the query sequence [in]
1285  * @param align          information about the alignment between query and subject
1286  * @param shouldTestIdentical did alignment pass a preliminary test in
1287  *                       redo_alignment.c that indicates the sequence
1288  *                        pieces may be near identical
1289  */
1290 static void
Kappa_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,const BlastCompo_SequenceData * queryData,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical)1291 Kappa_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
1292                                  const BlastCompo_SequenceRange * range,
1293                                  BlastCompo_SequenceData * seqData,
1294                                  const BlastCompo_SequenceRange * q_range,
1295 				 const BlastCompo_SequenceData * queryData,
1296 				 const BlastCompo_Alignment *align,
1297 				 const Boolean shouldTestIdentical)
1298 {
1299   int i;
1300   int nucleotide_start;        /* position of the first nucleotide to be
1301                                  * translated */
1302   int num_nucleotides;         /* number of nucleotides to translate */
1303   Int2 translation_frame = (Int2) range->context;
1304   Kappa_SequenceLocalData * local_data = self->local_data;
1305 
1306   nucleotide_start   = 3 * range->begin;
1307   num_nucleotides    =
1308     3 * (range->end - range->begin) + ABS(translation_frame) - 1;
1309   { /* scope of nucleotide_data */
1310     Uint1Ptr nucleotide_data = MemNew((num_nucleotides + 1) * sizeof(Uint1));
1311     { /* scope of spp */
1312       SeqPortPtr spp;       /* a SeqPort used to read the sequence data */
1313       Uint1 strand;         /* a flag indicating which strand to read */
1314       Uint1 nucleotide;     /* an individual nucleotide */
1315 
1316       if (translation_frame >= 0) {
1317           strand = Seq_strand_plus;
1318       } else {
1319           strand = Seq_strand_minus;
1320       }
1321       spp    = SeqPortNew(local_data->bsp_db, FIRST_RESIDUE, LAST_RESIDUE,
1322                           strand, Seq_code_ncbi4na);
1323       SeqPortSeek(spp, nucleotide_start, SEEK_SET);
1324 
1325       for(i = 0;
1326           i < num_nucleotides &&
1327             (nucleotide = SeqPortGetResidue(spp)) != SEQPORT_EOF;
1328           i++ ) {
1329         /* for all nucleotides in the translation range */
1330         nucleotide_data[i] = nucleotide;
1331       }
1332 
1333       spp = SeqPortFree(spp);
1334     }  /* end scope of spp */
1335     seqData->buffer =
1336       GetTranslation(nucleotide_data, num_nucleotides, translation_frame,
1337                      &seqData->length, local_data->genetic_code);
1338     seqData->data = seqData->buffer + 1; /* This is a protein sequence,
1339                                             so the first byte is nil. */
1340     MemFree(nucleotide_data);
1341   } /* end scope of nucleotide_data */
1342 
1343 #ifndef KAPPA_NO_SEG_SEQUENCE_TBLASTN
1344   { /* scope of variables used for SEG filtering */
1345     BioseqPtr bsp_temp; /* a Bioseq for the translated sequence */
1346     ObjectIdPtr oip;    /* a unique ObjectId for the translated sequence */
1347     SeqLocPtr seg_slp;  /* a SeqLoc for SEG filtering */
1348     bsp_temp     = BlastMakeTempProteinBioseq(seqData->data, seqData->length,
1349                                               Seq_code_ncbistdaa);
1350     bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1351 
1352     oip = (ObjectIdPtr) UniqueLocalId();
1353     ValNodeAddPointer(&(bsp_temp->id), SEQID_LOCAL, oip);
1354     SeqMgrAddToBioseqIndex(bsp_temp);
1355 
1356     seg_slp = BlastBioseqFilter(bsp_temp, BLASTP_MASK_INSTRUCTIONS);
1357     if (seg_slp) {
1358       HackSeqLocId(seg_slp, local_data->bsp_db->id);
1359       if ((!shouldTestIdentical) ||
1360             (shouldTestIdentical &&
1361 	     (!testNearIdentical(seqData, queryData, q_range->begin, align, range->begin))))
1362 	 BlastMaskTheResidues(seqData->data, seqData->length,
1363                            BLASTP_MASK_RESIDUE, seg_slp, FALSE, 0);
1364       seg_slp = SeqLocSetFree(seg_slp);
1365     }
1366 
1367     SeqMgrDeleteFromBioseqIndex(bsp_temp);
1368     bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1369     bsp_temp     = BioseqFree(bsp_temp);
1370   } /* end scope of variables used for SEG filtering */
1371 #endif
1372 }
1373 
1374 
1375 /**
1376  * Obtain a string of protein data
1377  *
1378  * @param self          the sequence from which to obtain the data [in]
1379  * @param range         the range, in amino acid coordinates, of data
1380  *                      to get; includes the translation frame [in]
1381  * @param seqData       the resulting data [out]
1382  * @param range         the range, in amino acid coordinates, of data
1383  *                      to get; includes the translation frame [in]
1384  * @param queryData     the query sequence [in]
1385  * @param align          information about the alignment between query and subject
1386  * @param shouldTestIdentical did alignment pass a preliminary test in
1387  *                       redo_alignment.c that indicates the sequence
1388  *                        pieces may be near identical
1389  */
1390 static void
Kappa_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,const BlastCompo_SequenceData * queryData,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical)1391 Kappa_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,
1392                                  const BlastCompo_SequenceRange * range,
1393                                  BlastCompo_SequenceData * seqData,
1394                                  const BlastCompo_SequenceRange * q_range,
1395 				 const BlastCompo_SequenceData * queryData,
1396 				 const BlastCompo_Alignment *align,
1397 				 const Boolean shouldTestIdentical)
1398 {
1399   int i;
1400   Kappa_SequenceLocalData * local_data = self->local_data;
1401 
1402     /* The sequence does not need to be translated. */
1403     /* Obtain the entire sequence (necessary for SEG filtering.) */
1404     if(local_data->rdfp != NULL) {
1405       Uint1Ptr origData;        /* data obtained from readdb_get_sequence;
1406                                  * this data cannot be modified, so we copy
1407                                  * it. */
1408       int       idx;
1409       seqData->length    = readdb_get_sequence(local_data->rdfp, self->index,
1410                                                (Uint1Ptr PNTR) & origData );
1411       seqData->buffer    = MemNew((seqData->length + 2) * sizeof(Uint1));
1412       seqData->buffer[0] = '\0';
1413       seqData->data      = &seqData->buffer[1];
1414       for( idx = 0; idx < seqData->length; idx++ ) {
1415         /* Copy the sequence data, replacing occurrences of amino acid
1416          * number 24 (Selenocysteine) with number 21 (Undetermined or
1417          * atypical). */
1418         if(origData[idx] != eSelenocysteine) {
1419             seqData->data[idx] = origData[idx];
1420         } else {
1421           seqData->data[idx] = eXchar;
1422           fprintf(stderr, "Selenocysteine (U) at position %ld"
1423                   " replaced by X\n",
1424                   (long) idx + 1);
1425         }
1426       }
1427       seqData->data[seqData->length] = '\0';
1428     } else { /* self->rdfp is NULL */
1429       SeqPortPtr spp = NULL;      /* a SeqPort used to read the
1430                                      sequence data */
1431       Uint1      residue;         /* an individual residue */
1432       int       idx;
1433 
1434       seqData->length    = local_data->bsp_db->length;
1435       seqData->buffer    = MemNew((seqData->length + 2) * sizeof(Uint1));
1436       seqData->buffer[0] = '\0';
1437       seqData->data      = seqData->buffer + 1;
1438 
1439       spp =
1440         SeqPortNew(local_data->bsp_db, FIRST_RESIDUE, LAST_RESIDUE,
1441                    Seq_strand_unknown, Seq_code_ncbistdaa);
1442 
1443       idx = 0;
1444       while((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
1445         if(IS_residue(residue)) {
1446           /* Replace occurrences of amino acid number 24
1447              (Selenocysteine) with number 21 (Undetermined or
1448              atypical). */
1449           if(residue == eSelenocysteine) {
1450             residue = eXchar;
1451             fprintf(stderr, "Selenocysteine (U) at position %ld"
1452                     " replaced by X\n",
1453                     (long) idx + 1);
1454           }
1455           seqData->data[idx++] = residue;
1456         }
1457       }
1458       seqData->data[idx] = 0;    /* terminate the string */
1459       spp = SeqPortFree(spp);
1460     } /* end else self->rdfp is NULL */
1461 
1462 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
1463     {
1464       SeqLocPtr seg_slp;  /*pointer to structure for SEG filtering*/
1465 
1466       seg_slp =
1467         BlastBioseqFilter(local_data->bsp_db, BLASTP_MASK_INSTRUCTIONS);
1468       if (seg_slp) {
1469       if ((!shouldTestIdentical) ||
1470             (shouldTestIdentical &&
1471 	     (!testNearIdentical(seqData, queryData, q_range->begin, align, 0))))
1472 	    BlastMaskTheResidues(seqData->data, seqData->length,
1473 				 BLASTP_MASK_RESIDUE, seg_slp, FALSE, 0);
1474         seg_slp = SeqLocSetFree(seg_slp);
1475       }
1476     }
1477 #endif
1478     /* Fit the data to the range. */
1479     seqData ->data    = &seqData->data[range->begin - 1];
1480     *seqData->data++  = '\0';
1481     seqData ->length  = range->end - range->begin;
1482 }
1483 
1484 /**
1485  * Obtain the sequence data that lies within the given range.
1486  * Note: BLASTX search is not fully supported yet
1487  *
1488  * @param self          sequence information [in]
1489  * @param s_range       the range, in amino acid coordinates, of data
1490  *                      to get [in]
1491  * @param seqData       the sequence data obtained [out]
1492  * @param query         the query sequence [in]
1493  * @param q_range       the range, in amino acid coordinates, of data
1494  *                      to get [in]
1495  * @param queryData     the query data obtained [out]
1496  * @param align          information about the alignment between query and subject
1497  * @param shouldTestIdentical did alignment pass a preliminary test in
1498  *                       redo_alignment.c that indicates the sequence
1499  *                        pieces may be near identical
1500  *
1501  * @returns   always 0 (posts a fatal error on failure rather than
1502  *            returning an error code.)
1503  */
1504 static int
Kappa_SequenceGetRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * s_range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * query,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1505 Kappa_SequenceGetRange(
1506   const BlastCompo_MatchingSequence * self,
1507   const BlastCompo_SequenceRange * s_range,
1508   BlastCompo_SequenceData * seqData,
1509   const BlastCompo_SequenceData * query,
1510   const BlastCompo_SequenceRange * q_range,
1511   BlastCompo_SequenceData * queryData,
1512   const Uint8* query_words,
1513   const BlastCompo_Alignment *align,
1514   const Boolean shouldTestIdentical,
1515   const ECompoAdjustModes compo_adjust_mode,
1516   const Boolean isSmithWaterman,
1517   Boolean* subject_maybe_biased)
1518 {
1519   Int4 idx;
1520   Kappa_SequenceLocalData * local_data = self->local_data;
1521   Uint1 *origData = query->data + q_range->begin;
1522   /* Copy the query sequence (necessary for SEG filtering.) */
1523   queryData->length = q_range->end - q_range->begin;
1524   queryData->buffer = calloc((queryData->length + 2), sizeof(Uint1));
1525   queryData->data   = queryData->buffer + 1;
1526 
1527   for (idx = 0; idx < queryData->length; idx++) {
1528       /* Copy the sequence data, replacing occurences of amino acid
1529        * number 24 (Selenocysteine) with number 21 (Undetermined or
1530        * atypical). */
1531       queryData->data[idx] = (origData[idx] != 24) ? origData[idx] : 21;
1532   }
1533   if(local_data->prog_number ==  blast_type_tblastn) {
1534     /* The sequence must be translated. */
1535     Kappa_SequenceGetTranslatedRange(self, s_range, seqData, q_range, queryData,
1536 				     align, shouldTestIdentical);
1537   } else {
1538     Kappa_SequenceGetProteinRange(self, s_range, seqData, q_range, queryData,
1539                                      align, shouldTestIdentical);
1540   }
1541   return 0;
1542 }
1543 
1544 
1545 
1546 /**
1547  * Converts an HSP, obtained from a blast search, to a GapAlignBlk that
1548  * is in a state in which it has all information necessary to redo the
1549  * computation of a traceback.
1550  *
1551  * @param gap_align     the GapAlignBlk to be modified  [out]
1552  * @param search        general information about the search [in]
1553  * @param hsp           the HSP to be converted [in]
1554  * @param queryOrigin   origin of the query participating in this
1555  *                      alignment within the concatenated query [in]
1556  * @param subject_range,       the subject_range used to compute the traceback,
1557  *                             in amino acid coordinates [in]
1558  * @param query,        the query data [in]
1559  * @param subject       the subject data [in]
1560  */
1561 static void
Kappa_HitToGapAlign(GapAlignBlkPtr gap_align,BlastSearchBlkPtr search,BLAST_HSPPtr hsp,int queryOrigin,BlastCompo_SequenceRange * subject_range,BlastCompo_SequenceData * query,BlastCompo_SequenceData * subject)1562 Kappa_HitToGapAlign(
1563   GapAlignBlkPtr gap_align,
1564   BlastSearchBlkPtr search,
1565   BLAST_HSPPtr       hsp,
1566   int queryOrigin,
1567   BlastCompo_SequenceRange * subject_range,
1568   BlastCompo_SequenceData * query,
1569   BlastCompo_SequenceData * subject)
1570 {
1571   gap_align->query           = query->data;
1572   gap_align->query_length    = query->length;
1573   gap_align->query_frame     = 0;
1574 
1575   gap_align->subject         = subject->data;
1576   gap_align->subject_length  = subject->length;
1577   gap_align->subject_frame   = hsp->subject.frame;
1578 
1579   /* Shift the hsp to use query/subject local coordinates. */
1580   hsp->query.offset         -= queryOrigin;
1581   hsp->query.gapped_start   -= queryOrigin;
1582   hsp->query.end            -= queryOrigin;
1583   hsp->subject.offset       -= subject_range->begin;
1584   hsp->subject.gapped_start -= subject_range->begin;
1585   hsp->subject.end          -= subject_range->begin;
1586   if(CheckStartForGappedAlignment(search, hsp, gap_align->query,
1587                                    gap_align->subject, search->sbp->matrix)) {
1588     /* We may use the starting point supplied by the HSP. */
1589     gap_align->q_start = hsp->query.gapped_start;
1590     gap_align->s_start = hsp->subject.gapped_start;
1591   } else {
1592     /* We must recompute the start for the gapped alignment, as the
1593        one in the HSP was unacceptable.*/
1594     gap_align->q_start =
1595       GetStartForGappedAlignment(search, hsp, gap_align->query,
1596                                  gap_align->subject, search->sbp->matrix);
1597     gap_align->s_start =
1598       (hsp->subject.offset - hsp->query.offset) + gap_align->q_start;
1599   }
1600 }
1601 
1602 
1603 /**
1604  * Reads a GapAlignBlk that has been used to compute a traceback, and
1605  * return a BlastCompo_Alignment representing the alignment.
1606  *
1607  * @param gap_align         the GapAlignBlk
1608  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1609  * @param queryRange        range of the query data in the
1610  *                          concatenated query
1611  * @param ccat_query_length length of the concatenated query
1612  * @param subjectRange      range of the subject data in the full
1613  *                          database sequence, expressed in amino acid
1614  *                          coordinates
1615  * @param subjectLength     original. untranslated length of the
1616  *                          subject sequence
1617  */
1618 static BlastCompo_Alignment *
Kappa_NewAlignFromGapAlign(GapAlignBlkPtr gap_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceRange * subject_range,int subjectLength)1619 Kappa_NewAlignFromGapAlign(
1620   GapAlignBlkPtr gap_align,
1621   EMatrixAdjustRule matrix_adjust_rule,
1622   BlastCompo_SequenceRange * query_range,
1623   int ccat_query_length,
1624   BlastCompo_SequenceRange * subject_range,
1625   int subjectLength)
1626 {
1627   int query_index, translation_frame;
1628   BlastCompo_Alignment * obj; /* the new alignment */
1629 
1630   /* The gap_align is in coordinates that are relative to the
1631      query/subject subject_range.  Shift to global coordinates. */
1632   int queryStart = gap_align->query_start   + query_range->begin;
1633   int queryEnd   = gap_align->query_stop    + query_range->begin   + 1;
1634   int matchStart = gap_align->subject_start + subject_range->begin;
1635   int matchEnd   = gap_align->subject_stop  + subject_range->begin + 1;
1636 
1637   if(gap_align->edit_block != NULL) {
1638     gap_align->edit_block->start1           += query_range->begin;
1639     gap_align->edit_block->length1          += query_range->begin;
1640     gap_align->edit_block->start2           += subject_range->begin;
1641     gap_align->edit_block->length2          += subject_range->begin;
1642     gap_align->edit_block->original_length1  = ccat_query_length;
1643     gap_align->edit_block->original_length2  = subjectLength;
1644   }
1645   query_index = query_range->context;
1646   translation_frame = subject_range->context;
1647   obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
1648                                    queryStart, queryEnd, query_index,
1649                                    matchStart, matchEnd,
1650                                    translation_frame,
1651                                    gap_align->edit_block);
1652   if (obj == NULL) {
1653       ErrPostEx(SEV_FATAL, E_NoMemory, 0,
1654                 "Failed to allocate a new alignment");
1655   }
1656   gap_align->edit_block = NULL; /* set to NULL to avoid aliasing */
1657 
1658   return obj;
1659 }
1660 
1661 
1662 /** A callback used when performing SmithWaterman alignments:
1663  * Calculate the traceback for one alignment by performing an x-drop
1664  * alignment in the forward direction, possibly increasing the x-drop
1665  * parameter until the desired score is attained.
1666  *
1667  * The start, end and score of the alignment should be obtained
1668  * using the Smith-Waterman algorithm before this routine is called.
1669  *
1670  * @param *palign          the new alignment
1671  * @param *pqueryEnd       on entry, the end of the alignment in the
1672  *                         query, as computed by the Smith-Waterman
1673  *                         algorithm.  On exit, the end as computed by
1674  *                         the x-drop algorithm
1675  * @param *pmatchEnd       like as *pqueryEnd, but for the subject
1676  *                         sequence
1677  * @param queryStart       the starting point in the query
1678  * @param matchStart       the starting point in the subject
1679  * @param score            the score of the alignment, as computed by
1680  *                         the Smith-Waterman algorithm
1681  * @param query            query sequence data
1682  * @param query_range      range of this query in the concatenated
1683  *                         query
1684  * @param ccat_query_length   total length of the concatenated query
1685  * @param subject          subject sequence data
1686  * @param subject_range    range of subject_data in the translated
1687  *                         query, in amino acid coordinates
1688  * @param full_subject_length   length of the full subject sequence
1689  * @param gapping_params        parameters used to compute gapped
1690  *                              alignments
1691  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1692  * @returns 0   (posts a fatal error if it fails)
1693  * @sa new_xdrop_align_type
1694  */
1695 static int
Kappa_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,int * pqueryEnd,int * pmatchEnd,int queryStart,int matchStart,int score,BlastCompo_SequenceData * query,BlastCompo_SequenceRange * query_range,int queryLength,BlastCompo_SequenceData * subject,BlastCompo_SequenceRange * subject_range,int subjectLength,BlastCompo_GappingParams * gapping_params,EMatrixAdjustRule matrix_adjust_rule)1696 Kappa_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
1697                              int * pqueryEnd, int *pmatchEnd,
1698                              int queryStart, int matchStart, int score,
1699                              BlastCompo_SequenceData * query,
1700                              BlastCompo_SequenceRange * query_range,
1701                              int queryLength,
1702                              BlastCompo_SequenceData * subject,
1703                              BlastCompo_SequenceRange * subject_range,
1704                              int subjectLength,
1705                              BlastCompo_GappingParams * gapping_params,
1706                              EMatrixAdjustRule matrix_adjust_rule)
1707 {
1708   /* General search parameters */
1709   BlastSearchBlkPtr search = gapping_params->context;
1710   /* Specific parameters used for gapped alignments */
1711   GapAlignBlkPtr gap_align = search->gap_align;
1712   /* the translation frame */
1713   int frame = subject_range->context;
1714 
1715   gap_align->x_parameter = gapping_params->x_dropoff;
1716 
1717   Kappa_SWFindFinalEndsUsingXdrop(query,   queryStart, *pqueryEnd,
1718                                   subject, matchStart, *pmatchEnd,
1719                                   frame, gap_align, score);
1720   *pqueryEnd = gap_align->query_stop + 1;
1721   *pmatchEnd = gap_align->subject_stop + 1;
1722   *pnewAlign = Kappa_NewAlignFromGapAlign(gap_align, matrix_adjust_rule,
1723                                           query_range, queryLength,
1724                                           subject_range, subjectLength);
1725   gap_align->x_parameter = gapping_params->x_dropoff;
1726 
1727   return 0;
1728 }
1729 
1730 
1731 /**
1732  * A callback: calculate the traceback for one alignment by
1733  * performing an x-drop alignment in both directions
1734  *
1735  * @param in_align         the existing alignment, without traceback
1736  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1737  * @param query_data       query sequence data
1738  * @param query_range      range of this query in the concatenated
1739  *                         query
1740  * @param ccat_query_length   total length of the concatenated query
1741  * @param subject_data     subject sequence data
1742  * @param subject_range    range of subject_data in the translated
1743  *                         query, in amino acid coordinates
1744  * @param full_subject_length   length of the full subject sequence
1745  * @param gapping_params        parameters used to compute gapped
1746  *                              alignments
1747  * @sa redo_one_alignment_type
1748  */
1749 static BlastCompo_Alignment *
Kappa_RedoOneAlignment(BlastCompo_Alignment * in_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceData * query_data,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceData * subject_data,BlastCompo_SequenceRange * subject_range,int full_subject_length,BlastCompo_GappingParams * gapping_params)1750 Kappa_RedoOneAlignment(BlastCompo_Alignment * in_align,
1751                        EMatrixAdjustRule matrix_adjust_rule,
1752                        BlastCompo_SequenceData * query_data,
1753                        BlastCompo_SequenceRange * query_range,
1754                        int ccat_query_length,
1755                        BlastCompo_SequenceData * subject_data,
1756                        BlastCompo_SequenceRange * subject_range,
1757                        int full_subject_length,
1758                        BlastCompo_GappingParams * gapping_params)
1759 {
1760   /* New alignment to be returned */
1761   BlastCompo_Alignment * newAlign;
1762   /* The HSP to be redone */
1763   BLASTResultHspPtr hsp = in_align->context;
1764   /* A copy of hsp as a BLAST_HSP; needed to fix a type mismatch */
1765   BLAST_HSPPtr temp_hsp = MemNew(sizeof(BLAST_HSP));
1766   /* General search information */
1767   BlastSearchBlkPtr search = gapping_params->context;
1768 
1769   CopyResultHspToHSP(hsp, temp_hsp);
1770   Kappa_HitToGapAlign(search->gap_align, search, temp_hsp, query_range->begin,
1771                       subject_range, query_data, subject_data);
1772   search->gap_align->x_parameter = gapping_params->x_dropoff;
1773 
1774   PerformGappedAlignmentWithTraceback(search->gap_align);
1775 
1776   newAlign =
1777       Kappa_NewAlignFromGapAlign(search->gap_align, matrix_adjust_rule,
1778                                  query_range, ccat_query_length,
1779                                  subject_range, full_subject_length);
1780   MemFree(temp_hsp);
1781 
1782   return newAlign;
1783 }
1784 
1785 
1786 /**
1787  * A Kappa_SearchParameters represents the data needed by
1788  * RedoAlignmentCore to adjust the parameters of a search, including
1789  * the original value of these parameters
1790  */
1791 typedef struct Kappa_SearchParameters {
1792   int          gap_open;        /**< a penalty for the existence of a gap */
1793   int          gapExtend;      /**< a penalty for each residue (or
1794                                       nucleotide) in the gap */
1795   int          gapDecline;     /**< a penalty for declining to align a pair
1796                                      of residues */
1797   int           gap_x_dropoff_final;  /**< value of the x-drop parameter
1798                                            for the final gapped alignment
1799                                            with traceback */
1800   BLAST_Score **origMatrix;     /**< The original matrix values */
1801   /** a matrix.  Each row represents a query and each column
1802        represents the probabilities of a particular residue */
1803   GapAlignBlkPtr orig_gap_align;
1804   Nlm_FloatHi original_expect_value;    /**< expect value on entry */
1805   BLAST_KarlinBlkPtr kbp_gap_orig;  /**< copy of the original gapped
1806                                       Karlin-Altschul block corresponding to
1807                                       the first context */
1808   BLAST_KarlinBlkPtr * orig_kbp_gap_array;  /**< pointer to the array of gapped
1809                                               Karlin-Altschul block for all
1810                                               contexts; needed to restore
1811                                               the search to its original
1812                                               configuration.  */
1813   Boolean    use_mq_flag;     /**< the initial value of the
1814                                    search->mult_queries->use_mq flag;
1815                                    or false if search->mult_queries ==
1816                                    NULL */
1817 } Kappa_SearchParameters;
1818 
1819 
1820 /**
1821  * Release the data associated with a Kappa_SearchParameters and
1822  * delete the object
1823  * @param searchParams the object to be deleted [in][out]
1824  */
1825 static void
Kappa_SearchParametersFree(Kappa_SearchParameters ** searchParams)1826 Kappa_SearchParametersFree(Kappa_SearchParameters ** searchParams)
1827 {
1828   /* for convenience, remove one level of indirection from searchParams */
1829   Kappa_SearchParameters *sp = *searchParams;
1830 
1831   if(sp->kbp_gap_orig) BlastKarlinBlkDestruct(sp->kbp_gap_orig);
1832 
1833   Nlm_Int4MatrixFree(&sp->origMatrix);
1834 
1835   Nlm_Free(*searchParams);
1836   *searchParams = NULL;
1837 }
1838 
1839 
1840 /**
1841  * Create a new instance of Kappa_SearchParameters
1842  *
1843  * @param rows              number of rows in the scoring matrix
1844  * @param compo_adjust_mode  if >0, use composition-based statistics
1845  * @param numQueries        the number of queries in the concatenated
1846  *                          query
1847  * @param positionBased     if true, the search is position-based
1848  */
1849 static Kappa_SearchParameters *
Kappa_SearchParametersNew(int rows,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)1850 Kappa_SearchParametersNew(
1851   int rows,
1852   ECompoAdjustModes compo_adjust_mode,
1853   Boolean positionBased)
1854 {
1855   Kappa_SearchParameters *sp;   /* the new object */
1856   sp = MemNew(sizeof(Kappa_SearchParameters));
1857 
1858   sp->orig_kbp_gap_array = NULL;
1859   sp->kbp_gap_orig       = NULL;
1860   sp->origMatrix         = NULL;
1861 
1862   sp->kbp_gap_orig = BlastKarlinBlkCreate();
1863   if (compo_adjust_mode != eNoCompositionBasedStats) {
1864     if (positionBased) {
1865       sp->origMatrix = Nlm_Int4MatrixNew(rows, PROTEIN_ALPHABET);
1866     } else {
1867       sp->origMatrix = Nlm_Int4MatrixNew(PROTEIN_ALPHABET, PROTEIN_ALPHABET);
1868     }
1869     if (sp->origMatrix == NULL) {
1870       ErrPostEx(SEV_FATAL, E_NoMemory, 0,
1871                 "Failed to allocate a scoring matrix");
1872     }
1873   }
1874   /* end if(compo_adjust_mode) */
1875 
1876   return sp;
1877 }
1878 
1879 
1880 /**
1881  * Record the initial value of the search parameters that are to be
1882  * adjusted.
1883  *
1884  * @param searchParams       holds the recorded values [out]
1885  * @param search             the search parameters [in]
1886  * @param compo_adjust_mode   mode of composition adjustment [in]
1887  */
1888 static void
Kappa_RecordInitialSearch(Kappa_SearchParameters * searchParams,BlastSearchBlkPtr search,ECompoAdjustModes compo_adjust_mode)1889 Kappa_RecordInitialSearch(Kappa_SearchParameters * searchParams,
1890                           BlastSearchBlkPtr search,
1891                           ECompoAdjustModes compo_adjust_mode)
1892 {
1893   BLAST_KarlinBlkPtr kbp;     /* statistical parameters used to evaluate a
1894                                * query-subject pair */
1895   BLAST_Score **matrix;       /* matrix used to score a local
1896                                  query-subject alignment */
1897 
1898   searchParams->gap_x_dropoff_final   = search->pbp->gap_x_dropoff_final;
1899   searchParams->original_expect_value = search->pbp->cutoff_e;
1900 
1901   if (search->mult_queries != NULL) {
1902     searchParams->use_mq_flag = search->mult_queries->use_mq;
1903   } else {
1904     searchParams->use_mq_flag = 0;
1905   }
1906   searchParams->gap_open   = search->pbp->gap_open;
1907   searchParams->gapExtend  = search->pbp->gap_extend;
1908   searchParams->gapDecline = search->pbp->decline_align;
1909 
1910   searchParams->orig_kbp_gap_array   = search->sbp->kbp_gap;
1911   if(search->positionBased) {
1912     kbp    = search->sbp->kbp_gap_psi[0];
1913     matrix = search->sbp->posMatrix;
1914   } else {
1915     kbp    = search->sbp->kbp_gap_std[0];
1916     matrix = search->sbp->matrix;
1917   }
1918   searchParams->kbp_gap_orig->Lambda = kbp->Lambda;
1919   searchParams->kbp_gap_orig->K      = kbp->K;
1920   searchParams->kbp_gap_orig->logK   = kbp->logK;
1921   searchParams->kbp_gap_orig->H      = kbp->H;
1922 
1923   searchParams->orig_gap_align = search->gap_align;
1924   search->gap_align = NULL; /* break aliasing */
1925 
1926   if(compo_adjust_mode != eNoCompositionBasedStats) {
1927     int i, j;                  /* iteration indices */
1928     int rows;
1929     if (search->positionBased) {
1930       rows = search->context[0].query->length;
1931     } else {
1932       rows = PROTEIN_ALPHABET;
1933     }
1934     for(i = 0; i < rows; i++) {
1935       for(j = 0; j < PROTEIN_ALPHABET; j++) {
1936         searchParams->origMatrix[i][j] = matrix[i][j];
1937       }
1938     }
1939   }
1940 }
1941 
1942 
1943 /**
1944  * Rescale the search parameters in the search object to obtain more
1945  * precision.
1946  *
1947  * @param search               the search block to be rescaled
1948  * @param localScalingFactor   the scale factor
1949  * @param options              certain command-line options to BLAST
1950  */
1951 static void
Kappa_RescaleSearch(BlastSearchBlkPtr search,double localScalingFactor,BLAST_OptionsBlkPtr options)1952 Kappa_RescaleSearch(BlastSearchBlkPtr search,
1953                 double localScalingFactor,
1954                 BLAST_OptionsBlkPtr options)
1955 {
1956   BLAST_KarlinBlkPtr kbp;     /* the statistical parameters used to
1957                                * evaluate alignments of a
1958                                * query-subject pair */
1959   if(search->positionBased) {
1960     kbp = search->sbp->kbp_gap_psi[0];
1961   } else {
1962     kbp = search->sbp->kbp_gap_std[0];
1963   }
1964   kbp->Lambda /= localScalingFactor;
1965   kbp->logK = log(kbp->K);
1966 
1967   if (search->mult_queries != NULL) {
1968     search->mult_queries->use_mq = 0;
1969   }
1970   search->pbp->cutoff_e = options->kappa_expect_value;
1971   search->pbp->gap_open =
1972     Nlm_Nint(search->pbp->gap_open * localScalingFactor);
1973   search->pbp->gap_extend =
1974     Nlm_Nint(search->pbp->gap_extend * localScalingFactor);
1975   if(search->pbp->decline_align < INT2_MAX) {
1976     search->pbp->decline_align =
1977       Nlm_Nint(search->pbp->decline_align * localScalingFactor);
1978   }
1979   search->gap_align                = GapAlignBlkNew(1, 1);
1980   search->gap_align->matrix        = search->sbp->matrix;
1981   search->gap_align->positionBased = search->positionBased;
1982   if(search->positionBased) {
1983     search->gap_align->posMatrix   = search->sbp->posMatrix;
1984   }
1985   search->gap_align->gap_open      = search->pbp->gap_open;
1986   search->gap_align->gap_extend    = search->pbp->gap_extend;
1987   search->gap_align->decline_align = search->pbp->decline_align;
1988   search->gap_align->translate2 =
1989       (Boolean) (search->prog_number == blast_type_tblastn);
1990 
1991   search->gap_align->x_parameter   =
1992     (int) (options->gap_x_dropoff_final * NCBIMATH_LN2 / kbp->Lambda);
1993 }
1994 
1995 
1996 /**
1997  * Restore the parameters that were adjusted to their original values
1998  * @param search            the search to be restored [out]
1999  * @param matrix            the scoring matrix to be restored [out]
2000  * @param searchParams      a record of the original values [in]
2001  * @param SmithWaterman     if true, we have performed a Smith-Waterman
2002  *                          alignment with these search parameters [in]
2003  */
2004 static void
Kappa_RestoreSearch(BlastSearchBlkPtr search,BLAST_Score ** matrix,Kappa_SearchParameters * searchParams,ECompoAdjustModes compo_adjust_mode)2005 Kappa_RestoreSearch(
2006   BlastSearchBlkPtr search,
2007   BLAST_Score ** matrix,
2008   Kappa_SearchParameters * searchParams,
2009   ECompoAdjustModes compo_adjust_mode)
2010 {
2011   BLAST_KarlinBlkPtr kbp;     /* statistical parameters used to
2012                                  evaluate the significance of
2013                                  alignment of a query-subject
2014                                    pair */
2015   int i, j;                  /* iteration indices */
2016   search->pbp->gap_x_dropoff_final = searchParams->gap_x_dropoff_final;
2017   search->pbp->cutoff_e      = searchParams->original_expect_value;
2018   search->pbp->gap_open      = searchParams->gap_open;
2019   search->pbp->gap_extend    = searchParams->gapExtend;
2020   search->pbp->decline_align = searchParams->gapDecline;
2021   if (search->mult_queries != NULL) {
2022       search->mult_queries->use_mq = searchParams->use_mq_flag;
2023   }
2024   GapAlignBlkDelete(search->gap_align);
2025   search->gap_align    = searchParams->orig_gap_align;
2026   search->sbp->kbp_gap = searchParams->orig_kbp_gap_array;
2027 
2028   if(search->positionBased) {
2029     kbp = search->sbp->kbp_gap_psi[0];
2030   } else {
2031     kbp = search->sbp->kbp_gap_std[0];
2032   }
2033   kbp->Lambda = searchParams->kbp_gap_orig->Lambda;
2034   kbp->K      = searchParams->kbp_gap_orig->K;
2035   kbp->logK   = searchParams->kbp_gap_orig->logK;
2036   kbp->H      = searchParams->kbp_gap_orig->H;
2037 
2038   if(compo_adjust_mode != eNoCompositionBasedStats) {
2039     int rows;    /* the number of rows in matrix */
2040     if (search->positionBased) {
2041       rows = search->context[0].query->length;
2042     } else {
2043       rows = PROTEIN_ALPHABET;
2044     }
2045     for(i = 0; i < rows; i++) {
2046       for(j = 0; j < PROTEIN_ALPHABET; j++) {
2047         matrix[i][j] = searchParams->origMatrix[i][j];
2048       }
2049     }
2050   }
2051 }
2052 
2053 
2054 /** Initialize an object of type Blast_MatrixInfo.
2055  * @param self                 object to initialize
2056  * @param search               search data
2057  * @param localScalingFactor   amount by which this search is scaled
2058  * @param matrixName           name of the matrix */
2059 static void
Kappa_MatrixInfoInit(Blast_MatrixInfo * self,double localScalingFactor,BlastSearchBlkPtr search,const char * matrixName)2060 Kappa_MatrixInfoInit(Blast_MatrixInfo * self,
2061                      double localScalingFactor,
2062                      BlastSearchBlkPtr search,
2063                      const char * matrixName)
2064 {
2065   Uint1Ptr query;             /* the query sequence */
2066   int queryLength;           /* the length of the query sequence */
2067   Nlm_FloatHi initialUngappedLambda;
2068 
2069   query       = search->context[0].query->sequence;
2070   queryLength = search->context[0].query->length;
2071 
2072   if(self->positionBased) {
2073     if(search->sbp->posFreqs == NULL) {
2074       search->sbp->posFreqs =
2075         allocatePosFreqs(queryLength, PROTEIN_ALPHABET);
2076     }
2077     Kappa_GetStartFreqRatios(self->startFreqRatios, search, query, matrixName,
2078                              search->sbp->posFreqs, queryLength, TRUE);
2079     Kappa_ScalePosMatrix(self->startMatrix, search->sbp->matrix, matrixName,
2080                          search->sbp->posFreqs, query, queryLength,
2081                          search->sbp, localScalingFactor);
2082     initialUngappedLambda = search->sbp->kbp_psi[0]->Lambda;
2083   } else {
2084     Kappa_GetStartFreqRatios(self->startFreqRatios, search, query,
2085                              matrixName, NULL, PROTEIN_ALPHABET,
2086                              FALSE);
2087     initialUngappedLambda = search->sbp->kbp_ideal->Lambda;
2088   }
2089   self->ungappedLambda = initialUngappedLambda / localScalingFactor;
2090   if(!search->positionBased) {
2091     FreqRatios * freqRatios;  /* frequency ratios for the matrix */
2092 
2093     freqRatios = PSIMatrixFrequencyRatiosNew(matrixName);
2094     if (freqRatios == NULL) {
2095       ErrPostEx(SEV_FATAL, 1, 0, "blastpgp: Cannot adjust parameters "
2096                 "for matrix %s\n", matrixName);
2097     }
2098     Blast_Int4MatrixFromFreq(self->startMatrix, self->cols, freqRatios->data,
2099                              self->ungappedLambda);
2100     freqRatios = PSIMatrixFrequencyRatiosFree(freqRatios);
2101   }
2102   self->matrixName = strdup(matrixName);
2103 }
2104 
2105 
2106 /**
2107  * Record information about all queries in the concatenated query.
2108  *
2109  * @param *pquery       a new list of BlastCompo_QueryInfo objects
2110  * @param *pnumQueries  the length of *pquery
2111  * @param *maxLength    the length of the longest query
2112  * @param search        a search block, which is used to obtain the
2113  *                      query information */
2114 static void
Kappa_GetQueryInfo(BlastCompo_QueryInfo ** pquery,int * pnumQueries,int * maxLength,BlastSearchBlkPtr search)2115 Kappa_GetQueryInfo(BlastCompo_QueryInfo **pquery,
2116                    int * pnumQueries,
2117                    int *maxLength, BlastSearchBlkPtr search)
2118 {
2119   int query_index;
2120   int numQueries = search->mult_queries ? search->mult_queries->NumQueries : 1;
2121   BlastCompo_QueryInfo * query =
2122       Nlm_Calloc(numQueries, sizeof(BlastCompo_QueryInfo));
2123 
2124   *pnumQueries = numQueries;
2125   *maxLength = 0;
2126   *pquery = query;
2127   if (search->mult_queries) {
2128     for (query_index = 0;  query_index < numQueries;  query_index++) {
2129       query[query_index].eff_search_space =
2130         search->mult_queries->SearchSpEff[query_index];
2131     }
2132   } else {
2133     query[0].eff_search_space = search->searchsp_eff;
2134   }
2135   if (search->mult_queries != NULL) {
2136     /* Query concatenation is in use; find each individual query
2137        within the concatenated query */
2138     Uint1 * ccat_query = /* the concatenated query */
2139       search->context[0].query->sequence;
2140     int length;
2141 
2142     for (query_index = 0;  query_index < numQueries;  query_index++) {
2143       query[query_index].origin =
2144         search->mult_queries->QueryStarts[query_index];
2145       query[query_index].seq.data = &ccat_query[query[query_index].origin];
2146       length = search->mult_queries->QueryEnds[query_index] -
2147           query[query_index].origin + 1;
2148       query[query_index].seq.length = length;
2149       if (length > *maxLength) {
2150           *maxLength = length;
2151       }
2152     }
2153   } else {
2154     /* Query concatenation is not in use; just use the entire query */
2155     query[0].seq.data   = search->context[0].query->sequence;
2156     query[0].seq.length = search->context[0].query->length;
2157     query[0].origin = 0;
2158     *maxLength = query[0].seq.length;
2159   }
2160   for (query_index = 0;  query_index < numQueries;  query_index++) {
2161     Blast_ReadAaComposition(&query[query_index].composition,
2162                             PROTEIN_ALPHABET,
2163                             query[query_index].seq.data,
2164                             query[query_index].seq.length);
2165   }
2166 }
2167 
2168 
2169 /**
2170  * Initialize an object that contains the parameters needed to perform
2171  * a gapped alignment.
2172  *
2173  * @param gapping_params    the object to be initialized
2174  * @param search            holds search parameters
2175  * @param options           holds BLAST command-line options
2176  * @param Lambda            the statistical parameter Lambda;
2177  *                          gives the scale of the scoring system.
2178  */
2179 static void
Kappa_GappingParamsInit(BlastCompo_GappingParams * gapping_params,BlastSearchBlkPtr search)2180 Kappa_GappingParamsInit(BlastCompo_GappingParams * gapping_params,
2181                         BlastSearchBlkPtr search)
2182 {
2183   gapping_params->gap_open = search->gap_align->gap_open;
2184   gapping_params->gap_extend = search->gap_align->gap_extend;
2185   gapping_params->decline_align = search->gap_align->decline_align;
2186   gapping_params->x_dropoff = search->gap_align->x_parameter;
2187   gapping_params->context = search;
2188 }
2189 
2190 
2191 /**
2192  * A callback: Free GapXEditBlock that is pointed to by a (void *).
2193  */
Kappa_FreeEditBlock(void * traceback)2194 static void Kappa_FreeEditBlock(void * traceback)
2195 {
2196     if (traceback != NULL)
2197         GapXEditBlockDelete((GapXEditBlockPtr)traceback);
2198 }
2199 
2200 
2201 /**
2202  * Callbacks used by Blast_RedoOneMatch and
2203  * Blast_RedoOneMatchSmithWaterman */
2204 static const Blast_RedoAlignCallbacks
2205 redo_align_callbacks = {
2206     Kappa_CalcLambda,
2207     Kappa_SequenceGetRange,
2208     Kappa_RedoOneAlignment,
2209     Kappa_NewAlignmentUsingXdrop,
2210     Kappa_FreeEditBlock
2211     };
2212 
2213 
2214 /** Get a set of alignment parameters, for use by Blast_RedoOneMatch or
2215  * Blast_RedoOneMatchSmithWaterman.
2216  * @param search               search parameters
2217  * @param options              BLAST command line options
2218  * @param compo_adjust_mode     composition adjustment mode
2219  * @param localScalingFactor   factor by which scores are scaled */
2220 static Blast_RedoAlignParams *
Kappa_GetAlignParams(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options,ECompoAdjustModes compo_adjust_mode,double localScalingFactor)2221 Kappa_GetAlignParams(BlastSearchBlkPtr search,
2222                      BLAST_OptionsBlkPtr options,
2223                      ECompoAdjustModes compo_adjust_mode,
2224                      double localScalingFactor)
2225 {
2226   int ccat_query_length;
2227   int rows;
2228   int cutoff_s;
2229   double cutoff_e;
2230   BlastCompo_GappingParams * gapping_params = NULL;
2231   Blast_MatrixInfo * scaledMatrixInfo;
2232   Blast_RedoAlignParams * params;
2233   int query_is_translated = search->prog_number == blast_type_blastx;
2234   int subject_is_translated = search->prog_number == blast_type_tblastn;
2235   int do_link_hsps = search->pbp->do_sum_stats;
2236   /* Negative cutoff means that the old rules for CBS segging will be used */
2237   double near_identical_cutoff = -1.0;
2238 
2239   if(do_link_hsps) {
2240     cutoff_s = (int) (search->pbp->cutoff_s1 * localScalingFactor);
2241   } else {
2242     /* There is no cutoff score; we consider e-values instead */
2243     cutoff_s = 0;
2244   }
2245   cutoff_e = search->pbp->cutoff_e;
2246   ccat_query_length = search->context[0].query->length;
2247   rows = search->positionBased ? ccat_query_length : PROTEIN_ALPHABET;
2248   if (compo_adjust_mode == eNoCompositionBasedStats) {
2249     /* We cannot compute a value for scaledMatrixInfo if
2250      * compo_adjust_mode is off.  There are side effects to the
2251      * computation (specifically a call to updateLambdaK that alters
2252      * the gapped K and ungapped lambda for position-based searches)
2253      * that are undesirable and deeply embedded in the code. */
2254     scaledMatrixInfo = NULL;
2255   } else {
2256     scaledMatrixInfo = Blast_MatrixInfoNew(rows, PROTEIN_ALPHABET,
2257                                            search->positionBased);
2258     if (scaledMatrixInfo == NULL) {
2259       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2260     }
2261     Kappa_MatrixInfoInit(scaledMatrixInfo, localScalingFactor, search,
2262                          options->matrix);
2263   }
2264   gapping_params = malloc(sizeof(BlastCompo_GappingParams));
2265   Kappa_GappingParamsInit(gapping_params, search);
2266   params =
2267     Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
2268                              compo_adjust_mode, search->positionBased,
2269                              query_is_translated, subject_is_translated, ccat_query_length,
2270                              cutoff_s, cutoff_e, do_link_hsps,
2271                              &redo_align_callbacks, near_identical_cutoff);
2272   if (params == NULL) {
2273       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2274   }
2275   return params;
2276 }
2277 
2278 
2279 /**
2280  * Convert a Blast_CompoHeap to a flat list of SeqAligns. Note that there may
2281  * be more than one alignment per element in the heap.  The new list
2282  * preserves the order of the SeqAligns associated with each
2283  * HeapRecord.
2284  *
2285  * @param self           a Blast_CompoHeap
2286  */
2287 static SeqAlignPtr
Kappa_CompoHeapToFlatList(BlastCompo_Heap * self)2288 Kappa_CompoHeapToFlatList(BlastCompo_Heap * self)
2289 {
2290     SeqAlignPtr list = NULL;
2291     SeqAlignPtr result;
2292 
2293     while (NULL != (result = BlastCompo_HeapPop(self))) {
2294         SeqAlignPtr oldList = list;
2295         list = result;
2296         while (NULL != result->next) {
2297             result = result->next;
2298         }
2299         result->next = oldList;
2300     }
2301     return list;
2302 }
2303 
2304 
2305 /**
2306  *  Top level routine to recompute alignments for each
2307  *  match found by the gapped BLAST algorithm
2308  *
2309  *  @param search           is the structure with all the information about
2310  *                          the search
2311  *  @param options          is used to pass certain command line options
2312  *                          taken in by BLAST
2313  *  @param hitlist_count    is the number of old matches
2314  *  @param compo_adjust_mode determines whether we are to adjust the
2315  *                          Karlin-Altschul parameters and score matrix
2316  *  @param SmithWaterman    determines whether the new local alignments
2317  *                          should be computed by the optimal Smith-Waterman
2318  *                          algorithm; SmithWaterman false means that
2319  *                          alignments will be recomputed by the current
2320  *                          X-drop algorithm as implemented in the procedure
2321  *                          ALIGN.
2322  *  @return                 a array of lists of SeqAlign; each element
2323  *                          in the array is a list of SeqAligns for
2324  *                          one query in the concatenated query.
2325  *  It is assumed that at least one of compo_adjust_mode and
2326  *  SmithWaterman is >0 or true when this procedure is called A linked list
2327  *  of alignments is returned; the alignments are sorted according to
2328  *  the lowest E-value of the best alignment for each matching
2329  *  sequence; alignments for the same matching sequence are in the
2330  *  list consecutively regardless of the E-value of the secondary
2331  *  alignments. Ties in sorted order are much rarer than for the
2332  *  standard BLAST method, but are broken deterministically based on
2333  *  the index of the matching sequences in the database.
2334  */
2335 SeqAlignPtr *
RedoAlignmentCore(BlastSearchBlkPtr search,BLAST_OptionsBlkPtr options,int hitlist_count,int adjustParameters,Boolean SmithWaterman)2336 RedoAlignmentCore(BlastSearchBlkPtr search,
2337                   BLAST_OptionsBlkPtr options,
2338                   int hitlist_count,
2339                   int adjustParameters,
2340                   Boolean SmithWaterman)
2341 {
2342   int status = 0;               /* error status returned by routines */
2343   int match_index;             /* index over matches */
2344   SeqAlignPtr * results = NULL; /* an array of lists of SeqAligns to
2345                                    return */
2346   Nlm_FloatHi localScalingFactor;       /* the factor by which to
2347                                          * scale the scoring system in
2348                                          * order to obtain greater
2349                                          * precision */
2350   BLAST_Score      **matrix;    /* score matrix */
2351   Kappa_SearchParameters *searchParams; /* the values of the search
2352                                          * parameters that will be
2353                                          * recorded, altered in the
2354                                          * search structure in this
2355                                          * routine, and then restored
2356                                          * before the routine
2357                                          * exits. */
2358   Blast_ForbiddenRanges  forbidden;     /* forbidden ranges for each
2359                                          * database position (used in
2360                                          * Smith-Waterman alignments)
2361                                          */
2362   BlastCompo_Heap * significantMatches; /* a collection of alignments for each
2363                                 * query sequence with sequences from
2364                                 * the database */
2365   Blast_CompositionWorkspace
2366       *NRrecord = NULL;        /* stores all fields needed for
2367                                 * computing a compositionally adjusted
2368                                 * score matrix using Newton's method */
2369   int query_index;            /* loop index */
2370   int numQueries;             /* number of queries in the
2371                                   concatenated query */
2372   int ccat_query_length;      /* length of the concatenated query, or
2373                                   of the sole query if query
2374                                   concatenation is not in use */
2375   int maxQueryLength;         /* the greatest length among all queries */
2376   BlastCompo_Alignment * incoming_aligns;  /* existing alignments for a match */
2377   BlastCompo_QueryInfo * query_info = NULL;
2378   Blast_RedoAlignParams * redo_align_params;
2379   double Lambda, logK;
2380   double pvalueForThisPair = (-1); /*p-value for this match
2381                                      for composition; -1 == no adjustment*/
2382   double LambdaRatio; /*lambda ratio*/
2383   int compositionTestIndex = options->unified_p;
2384                               /*which test function do we use to see if
2385                                a composition-adjusted p-value is desired*/
2386 
2387   /* the composition adjustment mode, obtained from adjustParameters
2388      (we don't make adjustParameters itself an enum so that only
2389      kappa.c depends on the header that defines ECompoAdjustModes) */
2390   ECompoAdjustModes compo_adjust_mode;
2391 
2392   if (search->positionBased) {
2393     /* Can't use a compositional P-value with position based searches */
2394     compositionTestIndex = 0;
2395   }
2396   /**** Validate parameters *************/
2397   if (0 > adjustParameters || eNumCompoAdjustModes <= adjustParameters) {
2398       /* Unknown composition adjustment mode */
2399       return NULL;
2400   } else {
2401       compo_adjust_mode = (ECompoAdjustModes) adjustParameters;
2402   }
2403   if(0 == strcmp(options->matrix, "BLOSUM62_20") &&
2404      eNoCompositionBasedStats == compo_adjust_mode) {
2405     /* BLOSUM62_20 only makes sense if compo_adjust_mode is on */
2406     return NULL;
2407   }
2408   if (search->positionBased && search->sbp->posMatrix == NULL) {
2409     Char* msg = "Cannot perform position-specific search without a PSSM";
2410     BlastConstructErrorMessage("RedoAlignmentCore", msg, 3,
2411                                &(search->error_return));
2412     return NULL;
2413   }
2414   if (search->positionBased && compo_adjust_mode > 1) {
2415     /* Only mode 1 (or 0) works with position-based searches, silently
2416      * change the value. */
2417     compo_adjust_mode = eCompositionBasedStats;
2418   }
2419   if (compo_adjust_mode > 1 &&
2420       !Blast_FrequencyDataIsAvailable(options->matrix)) {
2421     Char* msg = "Unsupported matrix for composition-based"
2422       " matrix adjustment";
2423     BlastConstructErrorMessage("RedoAlignmentCore", msg, 3,
2424                                &(search->error_return));
2425     return NULL;
2426   }
2427   /**** End validate parameters *************/
2428 
2429   ccat_query_length = search->context[0].query->length;
2430   searchParams =
2431       Kappa_SearchParametersNew(ccat_query_length, compo_adjust_mode,
2432                                 search->positionBased);
2433   Kappa_RecordInitialSearch(searchParams, search, compo_adjust_mode);
2434   if (compo_adjust_mode == eNoCompositionBasedStats) {
2435       localScalingFactor = 1.0;
2436   } else {
2437       localScalingFactor = SCALING_FACTOR;
2438   }
2439   if((0 == strcmp(options->matrix, "BLOSUM62_20"))) {
2440     localScalingFactor /= 10;
2441   }
2442   Kappa_RescaleSearch(search, localScalingFactor, options);
2443   if(search->positionBased) {
2444     matrix = search->sbp->posMatrix;
2445   } else {
2446     matrix = search->sbp->matrix;
2447   }
2448   redo_align_params = Kappa_GetAlignParams(search, options, compo_adjust_mode,
2449                                            localScalingFactor);
2450   {
2451       /* Get appropriate values for Lambda and logK */
2452       BLAST_KarlinBlkPtr kbp;
2453       if(search->positionBased) {
2454           kbp    = search->sbp->kbp_gap_psi[0];
2455       } else {
2456           kbp    = search->sbp->kbp_gap_std[0];
2457       }
2458       Lambda = kbp->Lambda;
2459       logK   = kbp->logK;
2460   }
2461   Kappa_GetQueryInfo(&query_info, &numQueries, &maxQueryLength, search);
2462   if(SmithWaterman) {
2463       status = Blast_ForbiddenRangesInitialize(&forbidden, maxQueryLength);
2464       if (status != 0)
2465           ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2466   }
2467   significantMatches = Nlm_Calloc(numQueries, sizeof(BlastCompo_Heap));
2468   ASSERT(options->ethresh != 0.0);
2469   for (query_index = 0;  query_index < numQueries;  query_index++) {
2470     status =
2471       BlastCompo_HeapInitialize(&significantMatches[query_index],
2472                                 options->hitlist_size, options->ethresh);
2473     if (status != 0) {
2474         ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2475     }
2476   }
2477   if (compo_adjust_mode != eNoCompositionBasedStats) {
2478     NRrecord = Blast_CompositionWorkspaceNew();
2479     if (NRrecord == NULL) {
2480       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2481     }
2482     /* We already checked that the frequency data is available for
2483      * options->matrix, so this call can't fail */
2484     (void) Blast_CompositionWorkspaceInit(NRrecord, options->matrix);
2485   }
2486   for(match_index = 0; match_index < hitlist_count; match_index++) {
2487     /* for all matching sequences */
2488     BlastCompo_MatchingSequence matchingSeq; /* the data for a matching
2489                                          * database sequence */
2490     BLASTResultHitlistPtr thisMatch;    /* alignment data for the
2491                                          * current query-subject
2492                                          * match */
2493     BlastCompo_Alignment ** alignments; /* array of lists of
2494                                             * alignments for each
2495                                             * query to this subject */
2496     alignments = Nlm_Calloc(numQueries, sizeof(BlastCompo_Alignment *));
2497 
2498     thisMatch = search->result_struct->results[match_index];
2499     if(thisMatch->hsp_array == NULL) {
2500       continue;
2501     }
2502     if (BlastCompo_EarlyTermination(thisMatch->best_evalue,
2503                                     significantMatches, numQueries)) {
2504         break;
2505     }
2506     /* Get the sequence for this match */
2507     Kappa_MatchingSequenceInitialize(&matchingSeq, search,
2508                                      thisMatch->subject_id);
2509     incoming_aligns =
2510       Kappa_ResultHspToDistinctAlign(search, thisMatch->hsp_array,
2511                                      thisMatch->hspcnt, localScalingFactor);
2512     if (SmithWaterman) {
2513       status =
2514         Blast_RedoOneMatchSmithWaterman(alignments, redo_align_params,
2515                                         incoming_aligns, thisMatch->hspcnt,
2516                                         Lambda, logK,
2517                                         &matchingSeq, query_info, numQueries,
2518                                         matrix, PROTEIN_ALPHABET,
2519                                         NRrecord, &forbidden,
2520                                         significantMatches,
2521 					&pvalueForThisPair,
2522 					compositionTestIndex, &LambdaRatio);
2523     } else {
2524       status =
2525         Blast_RedoOneMatch(alignments, redo_align_params, incoming_aligns,
2526                            thisMatch->hspcnt, Lambda, &matchingSeq,
2527                            ccat_query_length, query_info, numQueries,
2528                            matrix, PROTEIN_ALPHABET,
2529                            NRrecord, &pvalueForThisPair,
2530                            compositionTestIndex, &LambdaRatio);
2531     }
2532     if (status != 0) {
2533       ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2534     }
2535     for (query_index = 0;  query_index < numQueries;  query_index++) {
2536       /* Loop over queries */
2537       if( alignments[query_index] != NULL) { /* alignments were found */
2538         Nlm_FloatHi bestEvalue;   /* best e-value among alignments in the
2539                                      hitlist */
2540         int bestScore;           /* best score among alignments in the
2541                                      hitlist */
2542         BLAST_HitListPtr hitlist; /* a hitlist containing the newly-computed
2543                                    * alignments */
2544         Kappa_SortedHitlistFromAligns(search, &alignments[query_index]);
2545         hitlist = search->current_hitlist;
2546         if (hitlist->hspcnt > 1 &&
2547             !(SmithWaterman && search->prog_number == blast_type_blastp)) {
2548           /* Eliminate HSPs that are contained in a higher-scoring
2549            * HSP.  With blastp and SmithWaterman alignments, the
2550            * forbidden ranges rule does not allow one alignment to be
2551            * contained in another. */
2552           BLASTCheckHSPInclusion(hitlist->hsp_array, hitlist->hspcnt,
2553                                  FALSE);
2554           hitlist->hspcnt =
2555             HspArrayPurge(hitlist->hsp_array, hitlist->hspcnt, FALSE);
2556         }
2557         Kappa_HitlistEvaluateAndPurge(&bestScore, &bestEvalue,
2558                                       matchingSeq.length,
2559                                       search, redo_align_params->do_link_hsps,
2560 				      pvalueForThisPair, LambdaRatio,
2561 				      thisMatch->subject_id);
2562         if (bestEvalue <= search->pbp->cutoff_e &&
2563             BlastCompo_HeapWouldInsert(&significantMatches[query_index],
2564                                        bestEvalue, bestScore,
2565                                        thisMatch->subject_id)) {
2566           /* If the best alignment is significant, then create and save
2567            * a list of SeqAligns. */
2568           /* SeqAligns for this query-subject pair */
2569           SeqAlignPtr aligns;
2570           void * discardedAligns;
2571           SeqIdPtr query_id;
2572           Kappa_SequenceLocalData * local_data = matchingSeq.local_data;
2573 
2574           if (search->mult_queries) {
2575             query_id = search->mult_queries->FakeBsps[query_index]->id;
2576           } else {
2577             query_id = search->query_id;
2578           }
2579           aligns =
2580             Kappa_SeqAlignsFromHitlist(hitlist, local_data->bsp_db->id,
2581                                        query_id,
2582                                        query_info[query_index].origin,
2583                                        query_info[query_index].seq.length,
2584                                        Lambda, logK, localScalingFactor);
2585           status =
2586             BlastCompo_HeapInsert(&significantMatches[query_index],
2587                                   aligns, bestEvalue, bestScore,
2588                                   thisMatch->subject_id,
2589                                   &discardedAligns);
2590           if (status != 0) {
2591               ErrPostEx(SEV_FATAL, E_NoMemory, 0, "Failed to allocate memory");
2592           }
2593           if (discardedAligns != NULL) {
2594             SeqAlignSetFree(discardedAligns);
2595           }
2596         } /* end if the best alignment is significant */
2597       } /* end if any alignments were found */
2598     } /* end loop over queries */
2599     Kappa_MatchingSequenceRelease(&matchingSeq);
2600     MemFree(alignments);
2601     BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
2602   }
2603   /* end for all matching sequences */
2604   results = Nlm_Calloc(numQueries, sizeof(SeqAlignPtr));
2605   for (query_index = 0;  query_index < numQueries;  query_index++) {
2606     results[query_index] =
2607       Kappa_CompoHeapToFlatList(&significantMatches[query_index]);
2608   }
2609   /* Clean up */
2610   free(query_info);
2611   Blast_RedoAlignParamsFree(&redo_align_params);
2612   if (search->current_hitlist) {
2613     search->current_hitlist->hspcnt_max = search->current_hitlist->hspcnt;
2614     search->current_hitlist = BlastHitListDestruct(search->current_hitlist);
2615   }
2616   for (query_index = 0;  query_index < numQueries;  query_index++) {
2617     BlastCompo_HeapRelease(&significantMatches[query_index]);
2618   }
2619   MemFree(significantMatches); significantMatches = NULL;
2620   if(SmithWaterman) {
2621       Blast_ForbiddenRangesRelease(&forbidden);
2622   }
2623   Kappa_RestoreSearch(search, matrix, searchParams, compo_adjust_mode);
2624   Kappa_SearchParametersFree(&searchParams);
2625   if (NULL != NRrecord) {
2626     Blast_CompositionWorkspaceFree(&NRrecord);
2627   }
2628   return (results);
2629 }
2630