1 /* $Id: blast_kappa.c,v 1.156 2016/09/27 14:59:15 fukanchi Exp $
2  * ==========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alejandro Schaffer, Mike Gertz (ported to algo/blast by Tom Madden)
27  *
28  */
29 
30 /** @file blast_kappa.c
31  * Utilities for doing Smith-Waterman alignments and adjusting the scoring
32  * system for each match in blastpgp
33  */
34 
35 #include <float.h>
36 #include <algo/blast/core/ncbi_math.h>
37 #include <algo/blast/core/blast_hits.h>
38 #include <algo/blast/core/blast_kappa.h>
39 #include <algo/blast/core/blast_util.h>
40 #include <algo/blast/core/blast_gapalign.h>
41 #include <algo/blast/core/blast_filter.h>
42 #include <algo/blast/core/blast_traceback.h>
43 #include <algo/blast/core/link_hsps.h>
44 #include <algo/blast/core/gencode_singleton.h>
45 #include "blast_psi_priv.h"
46 #include "blast_gapalign_priv.h"
47 #include "blast_hits_priv.h"
48 #include "blast_posit.h"
49 #include "blast_hspstream_mt_utils.h"
50 #include "blast_traceback_mt_priv.h"
51 
52 #ifdef _OPENMP
53 #include <omp.h>
54 #endif
55 
56 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
57 #include <algo/blast/composition_adjustment/compo_heap.h>
58 #include <algo/blast/composition_adjustment/redo_alignment.h>
59 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
60 #include <algo/blast/composition_adjustment/unified_pvalues.h>
61 
62 /* Define KAPPA_PRINT_DIAGNOSTICS to turn on printing of
63  * diagnostic information from some routines. */
64 
65 /** Compile-time option; if set to a true value, then blastp runs
66     that use Blast_RedoAlignmentCore to compute the traceback will not
67     SEG the subject sequence */
68 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
69 #define KAPPA_BLASTP_NO_SEG_SEQUENCE 0
70 #endif
71 
72 
73 /** Compile-time option; if set to a true value, then blastp runs
74     that use Blast_RedoAlignmentCore to compute the traceback will not
75     SEG the subject sequence */
76 #ifndef KAPPA_TBLASTN_NO_SEG_SEQUENCE
77 #define KAPPA_TBLASTN_NO_SEG_SEQUENCE 0
78 #endif
79 
80 
81 /**
82  * Given a list of HSPs with (possibly) high-precision scores, rescale
83  * the scores to have standard precision and set the scale-independent
84  * bit scores.  This routine does *not* resort the list; it is assumed
85  * that the list is already sorted according to e-values that have been
86  * computed using the initial, higher-precision scores.
87  *
88  * @param hsp_list          the HSP list
89  * @param logK              Karlin-Altschul statistical parameter [in]
90  * @param lambda            Karlin-Altschul statistical parameter [in]
91  * @param scoreDivisor      the value by which reported scores are to be
92  */
93 static void
s_HSPListNormalizeScores(BlastHSPList * hsp_list,double lambda,double logK,double scoreDivisor)94 s_HSPListNormalizeScores(BlastHSPList * hsp_list,
95                          double lambda,
96                          double logK,
97                          double scoreDivisor)
98 {
99     int hsp_index;
100     for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
101         BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
102 
103         hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor);
104         /* Compute the bit score using the newly computed scaled score. */
105         hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
106     }
107 }
108 
109 
110 /**
111  * Adjusts the E-values in a BLAST_HitList to be composites of
112  * a composition-based P-value and a score/alignment-based P-value
113  *
114  * @param hsp_list           the hitlist whose E-values need to be adjusted
115  * @param comp_p_value      P-value from sequence composition
116  * @param seqSrc            a source of sequence data
117  * @param subject_length    length of database sequence
118  * @param query_context     info about this query context; needed when
119  *                          multiple queries are being used
120  * @param LambdaRatio       the ratio between the observed value of Lambda
121  *                          and the predicted value of lambda (used to print
122  *                          diagnostics)
123  * @param subject_id        the subject id of this sequence (used to print
124  *                          diagnostics)
125  **/
126 static void
s_AdjustEvaluesForComposition(BlastHSPList * hsp_list,double comp_p_value,const BlastSeqSrc * seqSrc,Int4 subject_length,const BlastContextInfo * query_context,double LambdaRatio,int subject_id)127 s_AdjustEvaluesForComposition(
128     BlastHSPList *hsp_list,
129     double comp_p_value,
130     const BlastSeqSrc* seqSrc,
131     Int4 subject_length,
132     const BlastContextInfo * query_context,
133     double LambdaRatio,
134     int subject_id)
135 {
136     /* Smallest observed evalue after adjustment */
137     double best_evalue = DBL_MAX;
138 
139     /* True length of the query */
140     int query_length =      query_context->query_length;
141     /* Length adjustment to compensate for edge effects */
142     int length_adjustment = query_context->length_adjustment;
143 
144     /* Effective lengths of the query, subject, and database */
145     double query_eff   = MAX((query_length - length_adjustment), 1);
146     double subject_eff = MAX((subject_length - length_adjustment), 1.0);
147     double dblen_eff = (double) query_context->eff_searchsp / query_eff;
148 
149     /* Scale factor to convert the database E-value to the sequence E-value */
150     double db_to_sequence_scale = subject_eff / dblen_eff;
151 
152     int        hsp_index;
153     for (hsp_index = 0;  hsp_index < hsp_list->hspcnt;  hsp_index++) {
154         /* for all HSPs */
155         double align_p_value;     /* P-value for the alignment score */
156         double combined_p_value;  /* combination of two P-values */
157 
158         /* HSP for this iteration */
159         BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
160 #ifdef KAPPA_PRINT_DIAGNOSTICS
161         /* Original E-value, saved if diagnostics are printed. */
162         double old_e_value = hsp->evalue;
163 #endif
164         hsp->evalue *= db_to_sequence_scale;
165 
166         align_p_value = BLAST_KarlinEtoP(hsp->evalue);
167         combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
168         hsp->evalue = BLAST_KarlinPtoE(combined_p_value);
169         hsp->evalue /= db_to_sequence_scale;
170 
171         if (hsp->evalue < best_evalue) {
172             best_evalue = hsp->evalue;
173         }
174 
175 #ifdef KAPPA_PRINT_DIAGNOSTICS
176         if (seqSrc){
177             int    sequence_gi; /*GI of a sequence*/
178             Blast_GiList* gi_list; /*list of GI's for a sequence*/
179             gi_list = BlastSeqSrcGetGis(seqSrc, (void *) (&subject_id));
180             if ((gi_list) && (gi_list->num_used > 0)) {
181                 sequence_gi = gi_list->data[0];
182             } else {
183                 sequence_gi = (-1);
184             }
185             printf("GI %d Lambda ratio %e comp. p-value %e; "
186                    "adjust E-value of query length %d match length "
187                    "%d from %e to %e\n",
188                    sequence_gi, LambdaRatio, comp_p_value,
189                    query_length, subject_length, old_e_value, hsp->evalue);
190             Blast_GiListFree(gi_list);
191         }
192 #endif
193     } /* end for all HSPs */
194 
195     hsp_list->best_evalue = best_evalue;
196 
197     /* suppress unused parameter warnings if diagnostics are not printed */
198     (void) seqSrc;
199     (void) query_length;
200     (void) LambdaRatio;
201     (void) subject_id;
202 }
203 
204 
205 /**
206  * Remove from a hitlist all HSPs that are completely contained in an
207  * HSP that occurs earlier in the list and that:
208  * - is on the same strand; and
209  * - has equal or greater score.  T
210  * The hitlist should be sorted by some measure of significance before
211  * this routine is called.
212  * @param hsp_array         array to be reaped
213  * @param hspcnt            length of hsp_array
214  */
215 static void
s_HitlistReapContained(BlastHSP * hsp_array[],Int4 * hspcnt)216 s_HitlistReapContained(BlastHSP * hsp_array[], Int4 * hspcnt)
217 {
218     Int4 iread;       /* iteration index used to read the hitlist */
219     Int4 iwrite;      /* iteration index used to write to the hitlist */
220     Int4 old_hspcnt;  /* number of HSPs in the hitlist on entry */
221 
222     old_hspcnt = *hspcnt;
223 
224     for (iread = 1;  iread < *hspcnt;  iread++) {
225         /* for all HSPs in the hitlist */
226         Int4      ireadBack;  /* iterator over indices less than iread */
227         BlastHSP *hsp1;       /* an HSP that is a candidate for deletion */
228 
229         hsp1 = hsp_array[iread];
230         for (ireadBack = 0;  ireadBack < iread && hsp1 != NULL;  ireadBack++) {
231             /* for all HSPs before hsp1 in the hitlist and while hsp1
232              * has not been deleted */
233             BlastHSP *hsp2;    /* an HSP that occurs earlier in hsp_array
234                                 * than hsp1 */
235             hsp2 = hsp_array[ireadBack];
236 
237             if( hsp2 == NULL ) {  /* hsp2 was deleted in a prior iteration. */
238                 continue;
239             }
240             if (hsp2->query.frame == hsp1->query.frame &&
241                 hsp2->subject.frame == hsp1->subject.frame) {
242                 /* hsp1 and hsp2 are in the same query/subject frame. */
243                 if (CONTAINED_IN_HSP
244                     (hsp2->query.offset, hsp2->query.end, hsp1->query.offset,
245                      hsp2->subject.offset, hsp2->subject.end,
246                      hsp1->subject.offset) &&
247                     CONTAINED_IN_HSP
248                     (hsp2->query.offset, hsp2->query.end, hsp1->query.end,
249                      hsp2->subject.offset, hsp2->subject.end,
250                      hsp1->subject.end)    &&
251                     hsp1->score <= hsp2->score) {
252                     hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]);
253                 }
254             } /* end if hsp1 and hsp2 are in the same query/subject frame */
255         } /* end for all HSPs before hsp1 in the hitlist */
256     } /* end for all HSPs in the hitlist */
257 
258     /* Condense the hsp_array, removing any NULL items. */
259     iwrite = 0;
260     for (iread = 0;  iread < *hspcnt;  iread++) {
261         if (hsp_array[iread] != NULL) {
262             hsp_array[iwrite++] = hsp_array[iread];
263         }
264     }
265     *hspcnt = iwrite;
266     /* Fill the remaining memory in hsp_array with NULL pointers. */
267     for ( ;  iwrite < old_hspcnt;  iwrite++) {
268         hsp_array[iwrite] = NULL;
269     }
270 }
271 
272 
273 /** A callback used to free an EditScript that has been stored in a
274  * BlastCompo_Alignment. */
s_FreeEditScript(void * edit_script)275 static void s_FreeEditScript(void * edit_script)
276 {
277     if (edit_script != NULL)
278         GapEditScriptDelete(edit_script);
279 }
280 
281 
282 /**
283  * Converts a list of objects of type BlastCompo_Alignment to an
284  * new object of type BlastHSPList and returns the result. Conversion
285  * in this direction is lossless.  The list passed to this routine is
286  * freed to ensure that there is no aliasing of fields between the
287  * list of BlastCompo_Alignments and the new hitlist.
288  *
289  * @param hsp_list   The hsp_list to populate
290  * @param alignments A list of distinct alignments; freed before return [in]
291  * @param oid        Ordinal id of a database sequence [in]
292  * @param queryInfo  information about all queries in this search [in]
293  * @param frame     query frame
294  * @return Allocated and filled BlastHSPList structure.
295  */
296 static int
s_HSPListFromDistinctAlignments(BlastHSPList * hsp_list,BlastCompo_Alignment ** alignments,int oid,const BlastQueryInfo * queryInfo,int frame)297 s_HSPListFromDistinctAlignments(BlastHSPList *hsp_list,
298                                 BlastCompo_Alignment ** alignments,
299                                 int oid,
300                                 const BlastQueryInfo* queryInfo,
301                                 int frame)
302 {
303     int status = 0;                    /* return code for any routine called */
304     static const int unknown_value = 0;   /* dummy constant to use when a
305                                              parameter value is not known */
306     BlastCompo_Alignment * align;  /* an alignment in the list */
307 
308     if (hsp_list == NULL) {
309         return -1;
310     }
311     hsp_list->oid = oid;
312 
313     for (align = *alignments;  NULL != align;  align = align->next) {
314         BlastHSP * new_hsp = NULL;
315         GapEditScript * editScript = align->context;
316         align->context = NULL;
317 
318         status = Blast_HSPInit(align->queryStart, align->queryEnd,
319                                align->matchStart, align->matchEnd,
320                                unknown_value, unknown_value,
321                                align->queryIndex,
322                                frame, (Int2) align->frame, align->score,
323                                &editScript, &new_hsp);
324         switch (align->matrix_adjust_rule) {
325         case eDontAdjustMatrix:
326             new_hsp->comp_adjustment_method = eNoCompositionBasedStats;
327             break;
328         case eCompoScaleOldMatrix:
329             new_hsp->comp_adjustment_method = eCompositionBasedStats;
330             break;
331         default:
332             new_hsp->comp_adjustment_method = eCompositionMatrixAdjust;
333             break;
334         }
335         if (status != 0)
336             break;
337         /* At this point, the subject and possibly the query sequence have
338          * been filtered; since it is not clear that num_ident of the
339          * filtered sequences, rather than the original, is desired,
340          * explicitly leave num_ident blank. */
341         new_hsp->num_ident = 0;
342 
343         status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
344         if (status != 0)
345             break;
346     }
347     if (status == 0) {
348         BlastCompo_AlignmentsFree(alignments, s_FreeEditScript);
349         Blast_HSPListSortByScore(hsp_list);
350     } else {
351         hsp_list = Blast_HSPListFree(hsp_list);
352     }
353     return 0;
354 }
355 
s_GetSubjectLength(Int4 total_subj_length,EBlastProgramType program_number)356 Int4 s_GetSubjectLength(Int4 total_subj_length, EBlastProgramType program_number)
357 {
358 	return ((program_number == eBlastTypeRpsTblastn) ?
359 				(GET_NUCL_LENGTH(total_subj_length) - 1 ) /3 : total_subj_length);
360 }
361 
362 
363 /**
364  * Adding evalues to a list of HSPs and remove those that do not have
365  * sufficiently good (low) evalue.
366  *
367  * @param *pbestScore      best (highest) score in the list
368  * @param *pbestEvalue     best (lowest) evalue in the list
369  * @param hsp_list         the list
370  * @param seqSrc            a source of sequence data
371  * @param subject_length   length of the subject sequence
372  * @param program_number   the type of BLAST search being performed
373  * @param queryInfo        information about the queries
374  * @param context_index      the index of the query corresponding to
375  *                         the HSPs in hsp_list
376  * @param sbp              the score block for this search
377  * @param hitParams        parameters used to assign evalues and
378  *                         decide whether to save hits.
379  * @param pvalueForThisPair  composition p-value
380  * @param LambdaRatio        lambda ratio, if available
381  * @param subject_id         index of subject
382  *
383  * @return 0 on success; -1 on failure (can fail because some methods
384  *         of generating evalues use auxiliary structures)
385  */
386 static int
s_HitlistEvaluateAndPurge(int * pbestScore,double * pbestEvalue,BlastHSPList * hsp_list,const BlastSeqSrc * seqSrc,int subject_length,EBlastProgramType program_number,const BlastQueryInfo * queryInfo,int context_index,BlastScoreBlk * sbp,const BlastHitSavingParameters * hitParams,double pvalueForThisPair,double LambdaRatio,int subject_id)387 s_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
388                           BlastHSPList * hsp_list,
389                           const BlastSeqSrc* seqSrc,
390                           int subject_length,
391                           EBlastProgramType program_number,
392                           const BlastQueryInfo* queryInfo,
393                           int context_index,
394                           BlastScoreBlk* sbp,
395                           const BlastHitSavingParameters* hitParams,
396                           double pvalueForThisPair,
397                           double LambdaRatio,
398                           int subject_id)
399 {
400     int status = 0;
401     *pbestEvalue = DBL_MAX;
402     *pbestScore  = 0;
403     if (hitParams->do_sum_stats) {
404         status = BLAST_LinkHsps(program_number, hsp_list, queryInfo,
405                                 subject_length, sbp,
406                                 hitParams->link_hsp_params, TRUE);
407     } else {
408 
409 
410         status =
411             Blast_HSPListGetEvalues(program_number, queryInfo,
412             		                s_GetSubjectLength(subject_length, program_number),
413                                     hsp_list, TRUE, FALSE, sbp,
414                                     0.0, /* use a non-zero gap decay
415                                             only when linking HSPs */
416                                     1.0); /* Use scaling factor equal to
417                                              1, because both scores and
418                                              Lambda are scaled, so they
419                                              will cancel each other. */
420     }
421     if (eBlastTypeBlastp == program_number ||
422         eBlastTypeBlastx == program_number) {
423         if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1)) {
424             s_AdjustEvaluesForComposition(hsp_list, pvalueForThisPair, seqSrc,
425                                           subject_length,
426                                           &queryInfo->contexts[context_index],
427                                           LambdaRatio, subject_id);
428         }
429     }
430     if (status == 0) {
431         Blast_HSPListReapByEvalue(hsp_list, hitParams->options);
432         if (hsp_list->hspcnt > 0) {
433             *pbestEvalue = hsp_list->best_evalue;
434             *pbestScore  = hsp_list->hsp_array[0]->score;
435         }
436     }
437     return status == 0 ? 0 : -1;
438 }
439 
440 /** Compute the number of identities for the HSPs in the hsp_list
441  * @note Should work for blastp and tblastn now.
442  *
443  * @param query_blk the query sequence data [in]
444  * @param query_info structure describing the query_blk structure [in]
445  * @param seq_src source of subject sequence data [in]
446  * @param hsp_list list of HSPs to be processed [in|out]
447  * @param scoring_options scoring options [in]
448  * @gen_code_string Genetic code for tblastn [in]
449  */
450 static void
s_ComputeNumIdentities(const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,BLAST_SequenceBlk * subject_blk,const BlastSeqSrc * seq_src,BlastHSPList * hsp_list,const BlastScoringOptions * scoring_options,const Uint1 * gen_code_string,const BlastScoreBlk * sbp)451 s_ComputeNumIdentities(const BLAST_SequenceBlk* query_blk,
452                        const BlastQueryInfo* query_info,
453                        BLAST_SequenceBlk* subject_blk,
454                        const BlastSeqSrc* seq_src,
455                        BlastHSPList* hsp_list,
456                        const BlastScoringOptions* scoring_options,
457                        const Uint1* gen_code_string,
458                        const BlastScoreBlk* sbp)
459 {
460     Uint1* query = NULL;
461     Uint1* query_nomask = NULL;
462     Uint1* subject = NULL;
463     const EBlastProgramType program_number = scoring_options->program_number;
464     const Boolean kIsOutOfFrame = scoring_options->is_ooframe;
465     const EBlastEncoding encoding = Blast_TracebackGetEncoding(program_number);
466     BlastSeqSrcGetSeqArg seq_arg;
467     Int2 status = 0;
468     int i;
469     SBlastTargetTranslation* target_t = NULL;
470 
471     if ( !hsp_list) return;
472 
473     /* Initialize the subject */
474     if (seq_src){
475         memset((void*) &seq_arg, 0, sizeof(seq_arg));
476         seq_arg.oid = hsp_list->oid;
477         seq_arg.encoding = encoding;
478         seq_arg.check_oid_exclusion = TRUE;
479         status = BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg);
480         ASSERT(status == 0);
481         (void)status; /* to pacify compiler warning */
482 
483         if (program_number == eBlastTypeTblastn) {
484             subject_blk = seq_arg.seq;
485             BlastTargetTranslationNew(
486                     subject_blk,
487                     gen_code_string,
488                     eBlastTypeTblastn,
489                     kIsOutOfFrame,
490                     &target_t
491             );
492         } else {
493             subject = seq_arg.seq->sequence;
494         }
495     } else {
496         subject = subject_blk->sequence;
497     }
498 
499     for (i = 0; i < hsp_list->hspcnt; i++) {
500         BlastHSP* hsp = hsp_list->hsp_array[i];
501 
502         /* Initialize the query */
503         if (program_number == eBlastTypeBlastx && kIsOutOfFrame) {
504             Int4 context = hsp->context - hsp->context % CODON_LENGTH;
505             Int4 context_offset = query_info->contexts[context].query_offset;
506             query = query_blk->oof_sequence + CODON_LENGTH + context_offset;
507             query_nomask = query_blk->oof_sequence + CODON_LENGTH + context_offset;
508         } else {
509             query = query_blk->sequence +
510                 query_info->contexts[hsp->context].query_offset;
511             query_nomask = query_blk->sequence_nomask +
512                 query_info->contexts[hsp->context].query_offset;
513         }
514 
515         /* Translate subject if needed. */
516         if (program_number == eBlastTypeTblastn) {
517             const Uint1* target_sequence = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
518             status = Blast_HSPGetNumIdentitiesAndPositives(query, target_sequence, hsp, scoring_options, 0, sbp);
519         }
520         else
521             status = Blast_HSPGetNumIdentitiesAndPositives(query_nomask, subject, hsp, scoring_options, 0, sbp);
522 
523         ASSERT(status == 0);
524     }
525     target_t = BlastTargetTranslationFree(target_t);
526     if (seq_src) {
527         BlastSeqSrcReleaseSequence(seq_src, (void*) &seq_arg);
528         BlastSequenceBlkFree(seq_arg.seq);
529     }
530 }
531 
532 
533 /**
534  * A callback routine: compute lambda for the given score
535  * probabilities.
536  * (@sa calc_lambda_type).
537  */
538 static double
s_CalcLambda(double probs[],int min_score,int max_score,double lambda0)539 s_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
540 {
541 
542     int i;                 /* loop index */
543     int score_range;       /* range of possible scores */
544     double avg;            /* expected score of aligning two characters */
545     Blast_ScoreFreq freq;  /* score frequency data */
546 
547     score_range = max_score - min_score + 1;
548     avg = 0.0;
549     for (i = 0;  i < score_range;  i++) {
550         avg += (min_score + i) * probs[i];
551     }
552     freq.score_min = min_score;
553     freq.score_max = max_score;
554     freq.obs_min = min_score;
555     freq.obs_max = max_score;
556     freq.sprob0 = probs;
557     freq.sprob = &probs[-min_score];
558     freq.score_avg = avg;
559 
560     return Blast_KarlinLambdaNR(&freq, lambda0);
561 }
562 
563 
564 /** Fill a two-dimensional array with the frequency ratios that
565  * underlie a position specific score matrix (PSSM).
566  *
567  * @param returnRatios     a two-dimensional array with BLASTAA_SIZE
568  *                         columns
569  * @param numPositions     the number of rows in returnRatios
570  * @param query            query sequence data, of length numPositions
571  * @param matrixName       the name of the position independent matrix
572  *                         corresponding to this PSSM
573  * @param startNumerator   position-specific data used to generate the
574  *                         PSSM
575  * @return   0 on success; -1 if the named matrix isn't known, or if
576  *           there was a memory error
577  * @todo find out what start numerator is.
578  */
579 static int
s_GetPosBasedStartFreqRatios(double ** returnRatios,Int4 numPositions,Uint1 * query,const char * matrixName,double ** startNumerator)580 s_GetPosBasedStartFreqRatios(double ** returnRatios,
581                              Int4 numPositions,
582                              Uint1 * query,
583                              const char *matrixName,
584                              double **startNumerator)
585 {
586     Int4 i,j;                            /* loop indices */
587     SFreqRatios * stdFreqRatios = NULL;  /* frequency ratios for the
588                                             named matrix. */
589     double *standardProb;                /* probabilities of each
590                                             letter*/
591     const double kPosEpsilon = 0.0001;   /* values below this cutoff
592                                             are treated specially */
593 
594     stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
595     if (stdFreqRatios == NULL) {
596         return -1;
597     }
598     for (i = 0;  i < numPositions;  i++) {
599         for (j = 0;  j < BLASTAA_SIZE;  j++) {
600             returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
601         }
602     }
603     stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
604 
605     standardProb = BLAST_GetStandardAaProbabilities();
606     if(standardProb == NULL) {
607         return -1;
608     }
609     /*reverse multiplication done in posit.c*/
610     for (i = 0;  i < numPositions;  i++) {
611         for (j = 0;  j < BLASTAA_SIZE;  j++) {
612             if ((standardProb[query[i]] > kPosEpsilon) &&
613                 (standardProb[j] > kPosEpsilon) &&
614                 (j != eStopChar) && (j != eXchar) &&
615                 (startNumerator[i][j] > kPosEpsilon)) {
616                 returnRatios[i][j] = startNumerator[i][j] / standardProb[j];
617             }
618         }
619     }
620     sfree(standardProb);
621 
622     return 0;
623 }
624 
625 
626 /**
627  * Fill a two-dimensional array with the frequency ratios that underlie the
628  * named score matrix.
629  *
630  * @param returnRatios  a two-dimensional array of size
631  *                      BLASTAA_SIZE x  BLASTAA_SIZE
632  * @param matrixName    the name of a matrix
633  * @return   0 on success; -1 if the named matrix isn't known, or if
634  *           there was a memory error
635  */
636 static int
s_GetStartFreqRatios(double ** returnRatios,const char * matrixName)637 s_GetStartFreqRatios(double ** returnRatios,
638                      const char *matrixName)
639 {
640     /* Loop indices */
641     int i,j;
642     /* Frequency ratios for the matrix */
643     SFreqRatios * stdFreqRatios = NULL;
644 
645     stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
646     if (stdFreqRatios == NULL) {
647         return -1;
648     }
649     for (i = 0;  i < BLASTAA_SIZE;  i++) {
650         for (j = 0;  j < BLASTAA_SIZE;  j++) {
651             returnRatios[i][j] = stdFreqRatios->data[i][j];
652         }
653     }
654     stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
655 
656     return 0;
657 }
658 
659 
660 /** SCALING_FACTOR is a multiplicative factor used to get more bits of
661  * precision in the integer matrix scores. It cannot be arbitrarily
662  * large because we do not want total alignment scores to exceed
663  * -(BLAST_SCORE_MIN) */
664 #define SCALING_FACTOR 32
665 
666 
667 /**
668  * Produce a scaled-up version of the position-specific matrix
669  * with a given set of position-specific residue frequencies.
670  *
671  * @param fillPosMatrix     is the matrix to be filled
672  * @param matrixName        name of the standard substitution matrix [in]
673  * @param posFreqs          PSSM's frequency ratios [in]
674  * @param query             Query sequence data [in]
675  * @param queryLength       Length of the query sequence above [in]
676  * @param sbp               stores various parameters of the search
677  * @param scale_factor      amount by which ungapped parameters should be
678  *                          scaled.
679  * @return 0 on success; -1 on failure
680  */
681 static int
s_ScalePosMatrix(int ** fillPosMatrix,const char * matrixName,double ** posFreqs,Uint1 * query,int queryLength,BlastScoreBlk * sbp,double scale_factor)682 s_ScalePosMatrix(int ** fillPosMatrix,
683                  const char * matrixName,
684                  double ** posFreqs,
685                  Uint1 * query,
686                  int queryLength,
687                  BlastScoreBlk* sbp,
688                  double scale_factor)
689 {
690     /* Data used by scaling routines */
691     Kappa_posSearchItems *posSearch = NULL;
692     /* A reduced collection of search parameters used by PSI-blast */
693     Kappa_compactSearchItems *compactSearch = NULL;
694     /* Representation of a PSSM internal to PSI-blast */
695     _PSIInternalPssmData* internal_pssm = NULL;
696     /* return code */
697     int status = 0;
698 
699     posSearch = Kappa_posSearchItemsNew(queryLength, matrixName,
700                                         fillPosMatrix, posFreqs);
701     compactSearch = Kappa_compactSearchItemsNew(query, queryLength, sbp);
702     /* Copy data into new structures */
703     internal_pssm = _PSIInternalPssmDataNew(queryLength, BLASTAA_SIZE);
704     if (posSearch == NULL || compactSearch == NULL || internal_pssm == NULL) {
705         status = -1;
706         goto cleanup;
707     }
708     _PSICopyMatrix_int(internal_pssm->pssm, posSearch->posMatrix,
709                        internal_pssm->ncols, internal_pssm->nrows);
710     _PSICopyMatrix_int(internal_pssm->scaled_pssm,
711                        posSearch->posPrivateMatrix,
712                        internal_pssm->ncols, internal_pssm->nrows);
713     _PSICopyMatrix_double(internal_pssm->freq_ratios,
714                           posSearch->posFreqs, internal_pssm->ncols,
715                           internal_pssm->nrows);
716     status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp,
717                                          compactSearch->standardProb);
718     if (status != 0) {
719         goto cleanup;
720     }
721     /* Copy data from new structures to posSearchItems */
722     _PSICopyMatrix_int(posSearch->posMatrix, internal_pssm->pssm,
723                        internal_pssm->ncols, internal_pssm->nrows);
724     _PSICopyMatrix_int(posSearch->posPrivateMatrix,
725                        internal_pssm->scaled_pssm,
726                        internal_pssm->ncols, internal_pssm->nrows);
727     _PSICopyMatrix_double(posSearch->posFreqs,
728                           internal_pssm->freq_ratios,
729                           internal_pssm->ncols, internal_pssm->nrows);
730     status = Kappa_impalaScaling(posSearch, compactSearch, (double)
731                                  scale_factor, FALSE, sbp);
732 cleanup:
733     internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
734     posSearch = Kappa_posSearchItemsFree(posSearch);
735     compactSearch = Kappa_compactSearchItemsFree(compactSearch);
736 
737     return status;
738 }
739 
740 
741 /**
742  * Convert an array of HSPs to a list of BlastCompo_Alignment objects.
743  * The context field of each BlastCompo_Alignment is set to point to the
744  * corresponding HSP.
745  *
746  * @param self                 the array of alignment to be filled
747  * @param numAligns            number of alignments
748  * @param hsp_array             an array of HSPs
749  * @param hspcnt                the length of hsp_array
750  * @param init_context          the initial context to process
751  * @param queryInfo            information about the concatenated query
752  * @param localScalingFactor    the amount by which this search is scaled
753  *
754  * @return the new list of alignments; or NULL if there is an out-of-memory
755  *         error (or if the original array is empty)
756  */
757 static int
s_ResultHspToDistinctAlign(BlastCompo_Alignment ** self,int * numAligns,BlastHSP * hsp_array[],Int4 hspcnt,int init_context,const BlastQueryInfo * queryInfo,double localScalingFactor)758 s_ResultHspToDistinctAlign(BlastCompo_Alignment **self,
759                            int *numAligns,
760                            BlastHSP * hsp_array[], Int4 hspcnt,
761                            int init_context,
762                            const BlastQueryInfo* queryInfo,
763                            double localScalingFactor)
764 {
765     BlastCompo_Alignment * tail[6];        /* last element in aligns */
766     int hsp_index;                             /* loop index */
767     int frame_index;
768 
769     for (frame_index = 0; frame_index < 6; frame_index++) {
770         tail[frame_index] = NULL;
771         numAligns[frame_index] = 0;
772     }
773 
774     for (hsp_index = 0;  hsp_index < hspcnt;  hsp_index++) {
775         BlastHSP * hsp = hsp_array[hsp_index]; /* current HSP */
776         BlastCompo_Alignment * new_align;      /* newly-created alignment */
777         frame_index = hsp->context - init_context;
778         ASSERT(frame_index < 6 && frame_index >= 0);
779         /* Incoming alignments will have coordinates of the query
780            portion relative to a particular query context; they must
781            be shifted for used in the composition_adjustment library.
782         */
783         new_align =
784             BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
785                                     eDontAdjustMatrix,
786                                     hsp->query.offset, hsp->query.end, hsp->context,
787                                     hsp->subject.offset, hsp->subject.end,
788                                     hsp->subject.frame, hsp);
789         if (new_align == NULL) /* out of memory */
790             return -1;
791         if (tail[frame_index] == NULL) { /* if the list aligns is empty; */
792             /* make new_align the first element in the list */
793             self[frame_index] = new_align;
794         } else {
795             /* otherwise add new_align to the end of the list */
796             tail[frame_index]->next = new_align;
797         }
798         tail[frame_index] = new_align;
799         numAligns[frame_index]++;
800     }
801     return 0;
802 }
803 
804 
805 /**
806  * Redo a S-W alignment using an x-drop alignment.  The result will
807  * usually be the same as the S-W alignment. The call to ALIGN_EX
808  * attempts to force the endpoints of the alignment to match the
809  * optimal endpoints determined by the Smith-Waterman algorithm.
810  * ALIGN_EX is used, so that if the data structures for storing BLAST
811  * alignments are changed, the code will not break
812  *
813  * @param query         the query data
814  * @param queryStart    start of the alignment in the query sequence
815  * @param queryEnd      end of the alignment in the query sequence,
816  *                      as computed by the Smith-Waterman algorithm
817  * @param subject       the subject (database) sequence
818  * @param matchStart    start of the alignment in the subject sequence
819  * @param matchEnd      end of the alignment in the query sequence,
820  *                      as computed by the Smith-Waterman algorithm
821  * @param gap_align     parameters for a gapped alignment
822  * @param scoringParams Settings for gapped alignment.[in]
823  * @param score         score computed by the Smith-Waterman algorithm
824  * @param queryAlignmentExtent  length of the alignment in the query sequence,
825  *                              as computed by the x-drop algorithm
826  * @param matchAlignmentExtent  length of the alignment in the subject
827  *                              sequence, as computed by the x-drop algorithm
828  * @param newScore              alignment score computed by the x-drop
829  *                              algorithm
830  */
831 static void
s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,Int4 queryStart,Int4 queryEnd,BlastCompo_SequenceData * subject,Int4 matchStart,Int4 matchEnd,BlastGapAlignStruct * gap_align,const BlastScoringParameters * scoringParams,Int4 score,Int4 * queryAlignmentExtent,Int4 * matchAlignmentExtent,Int4 * newScore)832 s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,
833                             Int4 queryStart,
834                             Int4 queryEnd,
835                             BlastCompo_SequenceData * subject,
836                             Int4 matchStart,
837                             Int4 matchEnd,
838                             BlastGapAlignStruct* gap_align,
839                             const BlastScoringParameters* scoringParams,
840                             Int4 score,
841                             Int4 * queryAlignmentExtent,
842                             Int4 * matchAlignmentExtent,
843                             Int4 * newScore)
844 {
845     Int4 XdropAlignScore;         /* alignment score obtained using X-dropoff
846                                    * method rather than Smith-Waterman */
847     Int4 doublingCount = 0;       /* number of times X-dropoff had to be
848                                    * doubled */
849     Int4 gap_x_dropoff_orig = gap_align->gap_x_dropoff;
850 
851     GapPrelimEditBlockReset(gap_align->rev_prelim_tback);
852     GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
853     do {
854         XdropAlignScore =
855             ALIGN_EX(&(query->data[queryStart]) - 1,
856                      &(subject->data[matchStart]) - 1,
857                      queryEnd - queryStart + 1, matchEnd - matchStart + 1,
858                      queryAlignmentExtent,
859                      matchAlignmentExtent, gap_align->fwd_prelim_tback,
860                      gap_align, scoringParams, queryStart - 1, FALSE, FALSE,
861                      NULL);
862 
863         gap_align->gap_x_dropoff *= 2;
864         doublingCount++;
865         if((XdropAlignScore < score) && (doublingCount < 3)) {
866             GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
867         }
868     } while((XdropAlignScore < score) && (doublingCount < 3));
869 
870     gap_align->gap_x_dropoff = gap_x_dropoff_orig;
871     *newScore = XdropAlignScore;
872 }
873 
874 
875 /**
876  * BLAST-specific information that is associated with a
877  * BlastCompo_MatchingSequence.
878  */
879 typedef struct
880 BlastKappa_SequenceInfo {
881     EBlastProgramType prog_number; /**< identifies the type of blast
882                                         search being performed. The type
883                                         of search determines how sequence
884                                         data should be obtained. */
885     const BlastSeqSrc* seq_src;    /**< BLAST sequence data source */
886     BlastSeqSrcGetSeqArg seq_arg;  /**< argument to GetSequence method
887                                      of the BlastSeqSrc (@todo this
888                                      structure was designed to be
889                                      allocated on the stack, i.e.: in
890                                      Kappa_MatchingSequenceInitialize) */
891 } BlastKappa_SequenceInfo;
892 
893 
894 /** Release the resources associated with a matching sequence. */
895 static void
s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)896 s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
897 {
898     if (self != NULL) {
899         if (self->index >=0) {
900             BlastKappa_SequenceInfo * local_data = self->local_data;
901             if (self->length > 0) {
902                 BlastSeqSrcReleaseSequence(local_data->seq_src,
903                                    &local_data->seq_arg);
904                 BlastSequenceBlkFree(local_data->seq_arg.seq);
905             }
906             free(self->local_data);
907         }
908         self->local_data = NULL;
909     }
910 }
911 
912 /**
913  * Do a simple gapped extension to the right from the beginning of query and
914  * subject ranges examining only matches and mismatches. The extension stops
915  * when there are more than max_shift mismatches or mismatches or gaps are not
916  * followed by two identical matches. This is a simplified version of the
917  * Danielle and Jean Thierry-Miegs' jumper
918  * alignment implemented in NCBI Magic
919  * http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/Download/Downloads.html
920  *
921  * @param query_seq Query sequence [in]
922  * @param query_len Query length [in]
923  * @param subject_seq Subject sequence [in]
924  * @param subject_len Subject length [in]
925  * @param max_shift Maximum number of mismatches or gaps, extension stops if
926  *        this number is reached [in]
927  * @param query_ext_len Extension length on the query [out]
928  * @param subject_ext_len Extension length on the subject [out]
929  * @param align_len Alignment length [out]
930  * @return Number of identical residues
931  */
s_ExtendRight(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)932 static int s_ExtendRight(Uint1* query_seq, int query_len,
933                          Uint1* subject_seq, int subject_len,
934                          int max_shift,
935                          int* query_ext_len, int* subject_ext_len,
936                          int* align_len)
937 {
938     int num_identical = 0;
939     int q_pos, s_pos;
940     int gaps_in_query = 0;
941     int gaps_in_subject = 0;
942     q_pos = 0;
943     s_pos = 0;
944     while (q_pos < query_len && s_pos < subject_len) {
945         int n;
946         int match = 0;
947 
948         while (q_pos < query_len && s_pos < subject_len
949                && query_seq[q_pos] == subject_seq[s_pos]) {
950 
951             num_identical++;
952             q_pos++;
953             s_pos++;
954         }
955 
956         /* try to skip mismatches or gaps */
957         for (n=1; n < max_shift && q_pos + n + 1 < query_len
958                  && s_pos + n + 1 < subject_len && !match; n++) {
959 
960             /* mismatches */
961             if (query_seq[q_pos + n] == subject_seq[s_pos + n]
962                 && query_seq[q_pos + n + 1] == subject_seq[s_pos + n + 1]) {
963 
964                 /* we have already checked that two positions behind mismatches
965                    match so we can advance further */
966                 q_pos += n + 2;
967                 s_pos += n + 2;
968                 num_identical += 2;
969                 match = 1;
970             }
971 
972             /* gap in subject */
973             if (!match && query_seq[q_pos + n] == subject_seq[s_pos]
974                 && query_seq[q_pos + n + 1] == subject_seq[s_pos + 1]) {
975 
976                 q_pos += n + 2;
977                 s_pos += 2;
978                 num_identical += 2;
979                 gaps_in_subject += n;
980                 match = 1;
981             }
982 
983             /* gap in query */
984             if (!match && query_seq[q_pos] == subject_seq[s_pos + n]
985                 && query_seq[q_pos + 1] == subject_seq[s_pos + n + 1]) {
986 
987                 q_pos += 2;
988                 s_pos += n + 2;
989                 num_identical += 2;
990                 gaps_in_query += n;
991                 match = 1;
992             }
993         }
994 
995         if (match) {
996             continue;
997         }
998 
999         /* exit the loop */
1000         break;
1001     }
1002     *query_ext_len = q_pos;
1003     *subject_ext_len = s_pos;
1004     *align_len = q_pos > s_pos ? q_pos + gaps_in_query : s_pos + gaps_in_subject;
1005 
1006     return num_identical;
1007 }
1008 
1009 
1010 /**
1011  * Extend left from the end of the sequence and subject ranges and count
1012  * identities. The extension stops when there are more than max_shift
1013  * mismatches or mismatches or gaps are not followed by two identical matches.
1014  * See description for s_ExtendRight for more details.
1015  *
1016  * @param query_seq Query sequence [in]
1017  * @param query_len Query length [in]
1018  * @param subject_seq Subject sequence [in]
1019  * @param subject_len Subject length [in]
1020  * @param max_shift Maximum number of mismatches or gaps, extension stops if
1021  *        this number is reached [in]
1022  * @param query_ext_len Extension length on the query [out]
1023  * @param subject_ext_len Extension length on the subject [out]
1024  * @param align_len Alignment length [out]
1025  * @return Number of identical residues
1026  */
s_ExtendLeft(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)1027 static int s_ExtendLeft(Uint1* query_seq, int query_len,
1028                         Uint1* subject_seq, int subject_len,
1029                         int max_shift,
1030                         int* query_ext_len, int* subject_ext_len,
1031                         int* align_len)
1032 {
1033     int q_pos = query_len - 1;
1034     int s_pos = subject_len - 1;
1035     int num_identical = 0;
1036     int gaps_in_query = 0;
1037     int gaps_in_subject = 0;
1038     while (q_pos >= 0 && s_pos >= 0) {
1039         int n;
1040         int match = 0;
1041 
1042         /* process identies */
1043         while (q_pos > 0 && s_pos > 0 && query_seq[q_pos] == subject_seq[s_pos]) {
1044             num_identical++;
1045             q_pos--;
1046             s_pos--;
1047         }
1048 
1049         /* try to skip mismatches or gaps */
1050         for (n=1;n < max_shift && q_pos - n - 1 > 0 && s_pos - n - 1 > 0
1051                  && !match; n++) {
1052 
1053             /* mismatch */
1054             if (query_seq[q_pos - n] == subject_seq[s_pos - n]
1055                 && query_seq[q_pos - n - 1] == subject_seq[s_pos - n - 1]) {
1056                 q_pos -= n + 2;
1057                 s_pos -= n + 2;
1058                 num_identical += 2;
1059                 match = 1;
1060             }
1061 
1062             /* gap in subject */
1063             if (!match && query_seq[q_pos - n] == subject_seq[s_pos]
1064                 && query_seq[q_pos - n - 1] == subject_seq[s_pos - 1]) {
1065                 q_pos -= n + 2;
1066                 s_pos -= 2;
1067                 num_identical += 2;
1068                 gaps_in_subject += n;
1069                 match = 1;
1070             }
1071 
1072             /* gap in query */
1073             if (!match && query_seq[q_pos] == subject_seq[s_pos - n]
1074                 && query_seq[q_pos - 1] == subject_seq[s_pos - n - 1]) {
1075                 q_pos -= 2;
1076                 s_pos -= n + 2;
1077                 num_identical += 2;
1078                 gaps_in_query += n;
1079                 match = 1;
1080             }
1081         }
1082 
1083         if (match) {
1084             continue;
1085         }
1086 
1087         break;
1088     }
1089     *query_ext_len = query_len - q_pos - 1;
1090     *subject_ext_len = subject_len - s_pos - 1;
1091     *align_len += *query_ext_len > *subject_ext_len ?
1092         *query_ext_len + gaps_in_query : *subject_ext_len + gaps_in_subject;
1093 
1094     return num_identical;
1095 }
1096 
1097 
1098 /**
1099  * Get hash for a word of word_size residues assuming 28-letter alphabet
1100  *
1101  * @param data Sequence [in]
1102  * @param word_size Word size [in]
1103  * @return Hash value
1104  */
s_GetHash(const Uint1 * data,int word_size)1105 static Uint8 s_GetHash(const Uint1* data, int word_size)
1106 {
1107     Uint8 hash = 0;
1108     int k;
1109     for (k=0;k < word_size;k++) {
1110         hash <<= 5;
1111         hash += (Int8)data[k];
1112     }
1113     return hash;
1114 }
1115 
1116 
1117 /**
1118  * Find a local number of identical residues in two aligned sequences by
1119  * finding word matches and doing a simple gapped extensions from the word hits
1120  *
1121  * @param query_seq Query sequence [in]
1122  * @param query_hashes Array of query words with index of each word
1123  *        corresponding to word position in the query [in]
1124  * @param query_len Query length [in]
1125  * @param subject_seq Subject sequence [in]
1126  * @param subject_len Subject length [in]
1127  * @param max_shift Maximum number of local mismatches or gaps for extensions
1128  *        [in]
1129  * @return Number of identical residues
1130  */
s_FindNumIdentical(Uint1 * query_seq,const Uint8 * query_hashes,int query_len,Uint1 * subject_seq,int subject_len,int max_shift)1131 static int s_FindNumIdentical(Uint1* query_seq,
1132                               const Uint8* query_hashes,
1133                               int query_len,
1134                               Uint1* subject_seq,
1135                               int subject_len,
1136                               int max_shift)
1137 {
1138     int word_size = 8;         /* word size for k-mer matching */
1139     Uint8 hash = 0;
1140     Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
1141                                                     values */
1142     int query_from = 0;
1143     int subject_from = 0;
1144 
1145     int s_pos;                 /* position in the subject sequence */
1146     int num_identical = 0;     /* number of identical residues found */
1147     Boolean match = FALSE;
1148 
1149     /* if query or subject length is smaller than word size, exit */
1150     if (!query_seq || !query_hashes || !subject_seq
1151         || query_len < word_size || subject_len < word_size) {
1152 
1153         return 0;
1154     }
1155 
1156     /* for each subject position */
1157     for (s_pos = 0; s_pos < subject_len - word_size; s_pos++) {
1158         int q_pos;
1159 
1160         /* find word hash */
1161         if (s_pos == 0 || match) {
1162             hash = s_GetHash(&subject_seq[s_pos], word_size);
1163         }
1164         else {
1165             hash <<= 5;
1166             hash &= mask;
1167             hash += subject_seq[s_pos + word_size - 1];
1168         }
1169 
1170         /* find matching query word; index of hash is position of the word
1171            the query */
1172         for (q_pos = query_from;q_pos < query_len - word_size; q_pos++) {
1173             if (query_hashes[q_pos] == hash) {
1174                 break;
1175             }
1176         }
1177 
1178         /* if match */
1179         if (q_pos < query_len - word_size) {
1180             int query_start = q_pos;
1181             int subject_start = s_pos;
1182 
1183             int query_left_len, query_right_len;
1184             int subject_left_len, subject_right_len;
1185             int align_len_left=0, align_len_right=0;
1186 
1187             match = TRUE;
1188             num_identical += word_size;
1189 
1190             /* extend left from word match */
1191             num_identical += s_ExtendLeft(query_seq + query_start - 1,
1192                                           query_start - query_from,
1193                                           subject_seq + subject_start - 1,
1194                                           subject_start - subject_from,
1195                                           max_shift,
1196                                           &query_left_len, &subject_left_len,
1197                                           &align_len_left);
1198 
1199             /* extend right from word match */
1200             num_identical += s_ExtendRight(query_seq + query_start + word_size,
1201                                        query_len - query_start - word_size,
1202                                        subject_seq + subject_start + word_size,
1203                                        subject_len - subject_start - word_size,
1204                                        max_shift,
1205                                        &query_right_len, &subject_right_len,
1206                                        &align_len_right);
1207 
1208 
1209             /* disregard already matched and extended words when matching
1210                further positions */
1211 
1212             query_from = query_start + word_size + query_right_len;
1213             subject_from = subject_start + word_size + subject_right_len;
1214             /* s_pos will be incremented in the loop */
1215             s_pos = subject_from - 1;
1216         }
1217         else {
1218             match = FALSE;
1219         }
1220     }
1221 
1222     return num_identical;
1223 }
1224 
1225 /**
1226  * Test whether the aligned parts of two sequences that
1227  * have a high-scoring gapless alignment are nearly identical.
1228  *
1229  * First extend from the left end of the query and subject ranges and stop if
1230  * there are too manu mismatches. Then extend from the right end. Then for the
1231  * remaining protion of ths sequences find matching words and extend left and
1232  * right from the word hit. Repeat the last steo until the whole alignment
1233  * ranges are processed.
1234  *
1235  * @params seqData Subject sequence [in]
1236  * @params seqOffse Starting offset of the subject sequence in alignment data
1237  *        [in]
1238  * @params queryData Query sequence [in]
1239  * @params queryOffset Starting offset of the query sequence in alignment data
1240  *         [in]
1241  * @param query_words Array of query words with word index corresponding to
1242  *        word's position in the query [in]
1243  * @param align Alignment data [in]
1244  * @return True if sequence parts are nearly identical, false otherwise
1245  */
1246 static Boolean
s_TestNearIdentical(const BlastCompo_SequenceData * seqData,const int seqOffset,const BlastCompo_SequenceData * queryData,const int queryOffset,const Uint8 * query_words,const BlastCompo_Alignment * align)1247 s_TestNearIdentical(const BlastCompo_SequenceData* seqData,
1248                     const int seqOffset,
1249                     const BlastCompo_SequenceData* queryData,
1250                     const int queryOffset,
1251                     const Uint8* query_words,
1252                     const BlastCompo_Alignment* align)
1253 {
1254     int qStart = align->queryStart - queryOffset;
1255     /* align->queryEnd points to one position past alignment end */
1256     int qEnd = align->queryEnd - queryOffset - 1;
1257     int sStart = align->matchStart - seqOffset;
1258     int sEnd = align->matchEnd - seqOffset - 1;
1259     const double kMinFractionNearIdentical = 0.96;
1260     int max_shift = 8;
1261 
1262     int query_len = qEnd - qStart + 1;
1263     int subject_len = sEnd - sStart + 1;
1264     int align_len = MIN(query_len, subject_len);
1265 
1266     int query_left_len = 0;
1267     int subject_left_len = 0;
1268     int query_right_len = 0;
1269     int subject_right_len = 0;
1270     int align_left_len = 0;
1271     int align_right_len = 0;
1272 
1273     double fraction_identical;
1274 
1275     /* first find number of identies going from the beginning of the query
1276        and subject ranges */
1277     int num_identical = s_ExtendRight(queryData->data + qStart, query_len,
1278                                       seqData->data + sStart, subject_len,
1279                                       max_shift,
1280                                       &query_right_len, &subject_right_len,
1281                                       &align_right_len);
1282 
1283     /* if the whole query range was processed return near identical status */
1284     if (query_right_len >= query_len || subject_right_len >= subject_len) {
1285         fraction_identical = (double)num_identical / (double)align_len;
1286         ASSERT(fraction_identical - 1.0 < 1e-10);
1287         return fraction_identical > kMinFractionNearIdentical;
1288     }
1289 
1290     /* find the number of identies going from the end of the query and subject
1291        ranges */
1292     num_identical += s_ExtendLeft(queryData->data + qStart + query_right_len,
1293                                   query_len - query_right_len,
1294                                   seqData->data + sStart + subject_right_len,
1295                                   subject_len - subject_right_len,
1296                                   max_shift,
1297                                   &query_left_len, &subject_left_len,
1298                                   &align_left_len);
1299 
1300     /* if the whole alignment ranges where covered, return the near identical
1301        status */
1302     if (query_left_len + query_right_len >= query_len
1303         || subject_left_len + subject_right_len >= subject_len) {
1304 
1305         fraction_identical = (double)num_identical / (double)(align_len);
1306         ASSERT(fraction_identical - 1.0 < 1e-10);
1307         return fraction_identical > kMinFractionNearIdentical;
1308     }
1309 
1310     /* find the number of identical matches in the middle portion of the
1311        alignment ranges */
1312     num_identical += s_FindNumIdentical(queryData->data + qStart + query_right_len,
1313                             query_words + qStart + query_right_len,
1314                             query_len - query_left_len - query_right_len,
1315                             seqData->data + sStart + subject_right_len,
1316                             subject_len - subject_left_len - subject_right_len,
1317                             max_shift);
1318 
1319     fraction_identical = (double)num_identical / (double)align_len;
1320     ASSERT(fraction_identical - 1.0 < 1e-10);
1321     if (fraction_identical > kMinFractionNearIdentical) {
1322         return TRUE;
1323     }
1324     else {
1325         return FALSE;
1326     }
1327 }
1328 
1329 
1330 /**
1331  * Initialize a new matching sequence, obtaining information about the
1332  * sequence from the search.
1333  *
1334  * @param self              object to be initialized
1335  * @param seqSrc            A pointer to a source from which sequence data
1336  *                          may be obtained
1337  * @param program_number    identifies the type of blast search being
1338  *                          performed.
1339  * @param default_db_genetic_code   default genetic code to use when
1340  *                          subject sequences are translated and there is
1341  *                          no other guidance on what code to use
1342  * @param subject_index     index of the matching sequence in the database
1343  */
1344 static int
s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,EBlastProgramType program_number,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,Int4 subject_index)1345 s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,
1346                              EBlastProgramType program_number,
1347                              const BlastSeqSrc* seqSrc,
1348                              Int4 default_db_genetic_code,
1349                              Int4 subject_index)
1350 {
1351     BlastKappa_SequenceInfo * seq_info;  /* BLAST-specific sequence
1352                                             information */
1353     self->length = 0;
1354     self->local_data = NULL;
1355 
1356     seq_info = malloc(sizeof(BlastKappa_SequenceInfo));
1357     if (seq_info != NULL) {
1358         self->local_data = seq_info;
1359 
1360         seq_info->seq_src      = seqSrc;
1361         seq_info->prog_number  = program_number;
1362 
1363         memset((void*) &seq_info->seq_arg, 0, sizeof(seq_info->seq_arg));
1364         seq_info->seq_arg.oid = self->index = subject_index;
1365         seq_info->seq_arg.check_oid_exclusion = TRUE;
1366 
1367         if( program_number == eBlastTypeTblastn ) {
1368             seq_info->seq_arg.encoding = eBlastEncodingNcbi4na;
1369         } else {
1370             seq_info->seq_arg.encoding = eBlastEncodingProtein;
1371         }
1372         if (BlastSeqSrcGetSequence(seqSrc, &seq_info->seq_arg) >= 0) {
1373             self->length =
1374                 BlastSeqSrcGetSeqLen(seqSrc, (void*) &seq_info->seq_arg);
1375 
1376             /* If the subject is translated and the BlastSeqSrc implementation
1377              * doesn't provide a genetic code string, use the default genetic
1378              * code for all subjects (as in the C toolkit) */
1379             if (Blast_SubjectIsTranslated(program_number) &&
1380                 seq_info->seq_arg.seq->gen_code_string == NULL) {
1381                 seq_info->seq_arg.seq->gen_code_string =
1382                              GenCodeSingletonFind(default_db_genetic_code);
1383                 ASSERT(seq_info->seq_arg.seq->gen_code_string);
1384             }
1385         } else {
1386             self->length = 0;
1387         }
1388     }
1389     if (self->length == 0) {
1390         /* Could not obtain the required data */
1391         s_MatchingSequenceRelease(self);
1392         return -1;
1393     } else {
1394         return 0;
1395     }
1396 }
1397 
1398 
1399 /** NCBIstdaa encoding for 'X' character */
1400 #define BLASTP_MASK_RESIDUE 21
1401 /** Default instructions and mask residue for SEG filtering */
1402 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
1403 
1404 
1405 /**
1406  * Filter low complexity regions from the sequence data; uses the SEG
1407  * algorithm.
1408  *
1409  * @param seqData            data to be filtered
1410  * @param program_name       type of search being performed
1411  * @return   0 for success; -1 for out-of-memory
1412  */
1413 static int
s_DoSegSequenceData(BlastCompo_SequenceData * seqData,EBlastProgramType program_name,Boolean * is_seq_biased)1414 s_DoSegSequenceData(BlastCompo_SequenceData * seqData,
1415                     EBlastProgramType program_name,
1416                     Boolean* is_seq_biased)
1417 {
1418     int status = 0;
1419     BlastSeqLoc* mask_seqloc = NULL;
1420     SBlastFilterOptions* filter_options = NULL;
1421 
1422     status = BlastFilteringOptionsFromString(program_name,
1423                                              BLASTP_MASK_INSTRUCTIONS,
1424                                              &filter_options, NULL);
1425     if (status == 0) {
1426         status = BlastSetUp_Filter(program_name, seqData->data,
1427                                    seqData->length, 0, filter_options,
1428                                    &mask_seqloc, NULL);
1429         filter_options = SBlastFilterOptionsFree(filter_options);
1430     }
1431     if (is_seq_biased) {
1432         *is_seq_biased = (mask_seqloc != NULL);
1433     }
1434     if (status == 0) {
1435         Blast_MaskTheResidues(seqData->data, seqData->length,
1436                               FALSE, mask_seqloc, FALSE, 0);
1437     }
1438     if (mask_seqloc != NULL) {
1439         mask_seqloc = BlastSeqLocFree(mask_seqloc);
1440     }
1441     return status;
1442 }
1443 
1444 
1445 /**
1446  * Obtain a string of translated data
1447  *
1448  * @param self          the sequence from which to obtain the data [in]
1449  * @param range         the range and translation frame to get [in]
1450  * @param seqData       the resulting data [out]
1451  * @param queryData     the query sequence [in]
1452  * @param queryOffset   offset for align if there are multiple queries
1453  * @param align          information about the alignment between query and subject
1454  * @param shouldTestIdentical did alignment pass a preliminary test in
1455  *                       redo_alignment.c that indicates the sequence
1456  *                        pieces may be near identical
1457  *
1458  * @return 0 on success; -1 on failure
1459  */
1460 static int
s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1461 s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
1462                              const BlastCompo_SequenceRange * range,
1463                              BlastCompo_SequenceData * seqData,
1464                              const BlastCompo_SequenceRange * q_range,
1465                              BlastCompo_SequenceData * queryData,
1466                              const Uint8* query_words,
1467                              const BlastCompo_Alignment *align,
1468                              const Boolean shouldTestIdentical,
1469                              const ECompoAdjustModes compo_adjust_mode,
1470                              const Boolean isSmithWaterman,
1471                              Boolean* subject_maybe_biased)
1472 {
1473     int status = 0;
1474     BlastKappa_SequenceInfo * local_data; /* BLAST-specific
1475                                              information associated
1476                                              with the sequence */
1477     Uint1 * translation_buffer;           /* a buffer for the translated,
1478                                              amino-acid sequence */
1479     Int4 translated_length;  /* length of the translated sequence */
1480     int translation_frame;   /* frame in which to translate */
1481     Uint1 * na_sequence;     /* the nucleotide sequence */
1482     int translation_start;   /* location in na_sequence to start
1483                                 translating */
1484     int num_nucleotides;     /* the number of nucleotides to be translated */
1485 
1486     local_data = self->local_data;
1487     na_sequence = local_data->seq_arg.seq->sequence_start;
1488 
1489     /* Initialize seqData to nil, in case this routine fails */
1490     seqData->buffer = NULL;
1491     seqData->data = NULL;
1492     seqData->length = 0;
1493 
1494     translation_frame = range->context;
1495     if (translation_frame > 0) {
1496         translation_start = 3 * range->begin;
1497     } else {
1498         translation_start =
1499             self->length - 3 * range->end + translation_frame + 1;
1500     }
1501     num_nucleotides =
1502         3 * (range->end - range->begin) + ABS(translation_frame) - 1;
1503 
1504     status = Blast_GetPartialTranslation(na_sequence + translation_start,
1505                                          num_nucleotides,
1506                                          (Int2) translation_frame,
1507                                          local_data->seq_arg.seq->gen_code_string,
1508                                          &translation_buffer,
1509                                          &translated_length,
1510                                          NULL);
1511     if (status == 0) {
1512         seqData->buffer = translation_buffer;
1513         seqData->data   = translation_buffer + 1;
1514         seqData->length = translated_length;
1515 
1516         if ( !(KAPPA_TBLASTN_NO_SEG_SEQUENCE) ) {
1517             if (compo_adjust_mode
1518                 && (!subject_maybe_biased || *subject_maybe_biased)) {
1519 
1520                 if ( (!shouldTestIdentical)
1521                      || (shouldTestIdentical
1522                          && (!s_TestNearIdentical(seqData, range->begin,
1523                                                   queryData, q_range->begin,
1524                                                   query_words, align)))) {
1525 
1526                     status = s_DoSegSequenceData(seqData, eBlastTypeTblastn,
1527                                                  subject_maybe_biased);
1528                     if (status != 0) {
1529                         free(seqData->buffer);
1530                         seqData->buffer = NULL;
1531                         seqData->data = NULL;
1532                         seqData->length = 0;
1533                     }
1534                 }
1535             }
1536         }
1537     }
1538     return status;
1539 }
1540 
1541 
1542 /**
1543  * Get a string of protein data from a protein sequence.
1544  *
1545  * @param self          a protein sequence [in]
1546  * @param range         the range to get [in]
1547  * @param seqData       the resulting data [out]
1548  * @param queryData     the query sequence [in]
1549  * @param queryOffset   offset for align if there are multiple queries
1550  * @param align          information about the alignment
1551  *                         between query and subject [in]
1552  * @param shouldTestIdentical did alignment pass a preliminary test in
1553  *                       redo_alignment.c that indicates the sequence
1554  *                        pieces may be near identical [in]
1555  *
1556  * @return 0 on success; -1 on failure
1557  */
1558 static int
s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1559 s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,
1560                           const BlastCompo_SequenceRange * range,
1561                           BlastCompo_SequenceData * seqData,
1562                           const BlastCompo_SequenceRange * q_range,
1563                           BlastCompo_SequenceData * queryData,
1564                           const Uint8* query_words,
1565                           const BlastCompo_Alignment *align,
1566                           const Boolean shouldTestIdentical,
1567                           const ECompoAdjustModes compo_adjust_mode,
1568                           const Boolean isSmithWaterman,
1569                           Boolean* subject_maybe_biased)
1570 
1571 {
1572     int status = 0;       /* return status */
1573     Int4       idx;       /* loop index */
1574     Uint1     *origData;  /* the unfiltered data for the sequence */
1575     /* BLAST-specific sequence information */
1576     BlastKappa_SequenceInfo * local_data = self->local_data;
1577     BLAST_SequenceBlk * seq = self->local_data;
1578 
1579     if (self->local_data == NULL)
1580 	return -1;
1581 
1582     seqData->data = NULL;
1583     seqData->length = 0;
1584     /* Copy the entire sequence (necessary for SEG filtering.) */
1585     seqData->buffer  = calloc((self->length + 2), sizeof(Uint1));
1586     if (seqData->buffer == NULL) {
1587         return -1;
1588     }
1589     /* First and last characters of the buffer MUST be '\0', which is
1590      * true here because the buffer was allocated using calloc. */
1591     seqData->data    = seqData->buffer + 1;
1592     seqData->length  = self->length;
1593 
1594     origData = (self->index >= 0) ? local_data->seq_arg.seq->sequence
1595                             : seq->sequence;
1596     if((self->index < 0) && (align->frame != 0)) {
1597     	int i=0, offsets =0;
1598     	int f = GET_SEQ_FRAME(align->frame);
1599     	int nucl_length = GET_NUCL_LENGTH(self->length);
1600     	seqData->length = GET_TRANSLATED_LENGTH(nucl_length, f);
1601     	for(; i < f; i++) {
1602     		offsets = GET_TRANSLATED_LENGTH(nucl_length, i) +1;
1603     		origData += offsets;
1604     	}
1605     }
1606     /* Copy the sequence data */
1607     for (idx = 0;  idx < seqData->length;  idx++) {
1608         seqData->data[idx] = origData[idx];
1609     }
1610 
1611     if ( !(KAPPA_BLASTP_NO_SEG_SEQUENCE) ) {
1612         if (compo_adjust_mode
1613             && (!subject_maybe_biased || *subject_maybe_biased)) {
1614 
1615             if ( (!shouldTestIdentical)
1616                  || (shouldTestIdentical
1617                      && (!s_TestNearIdentical(seqData, 0, queryData,
1618                                               q_range->begin, query_words,
1619                                               align)))) {
1620 
1621                 status = s_DoSegSequenceData(seqData, eBlastTypeBlastp,
1622                                              subject_maybe_biased);
1623             }
1624         }
1625     }
1626     /* Fit the data to the range. */
1627     seqData ->data    = &seqData->data[range->begin - 1];
1628     *seqData->data++  = '\0';
1629     seqData ->length  = range->end - range->begin;
1630 
1631     if (status != 0) {
1632         free(seqData->buffer);
1633         seqData->buffer = NULL;
1634         seqData->data = NULL;
1635     }
1636     return status;
1637 }
1638 
1639 
1640 /**
1641  * Obtain the sequence data that lies within the given range.
1642  *
1643  * @param self          sequence information [in]
1644  * @param range        range specifying the range of data [in]
1645  * @param seqData       the sequence data obtained [out]
1646  * @param seqData       the resulting data [out]
1647  * @param queryData     the query sequence [in]
1648  * @param queryOffset   offset for align if there are multiple queries
1649  * @param align          information about the alignment between query and subject
1650  * @param shouldTestIdentical did alignment pass a preliminary test in
1651  *                       redo_alignment.c that indicates the sequence
1652  *                        pieces may be near identical
1653  *
1654  * @return 0 on success; -1 on failure
1655  */
1656 static int
s_SequenceGetRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * s_range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * query,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1657 s_SequenceGetRange(const BlastCompo_MatchingSequence * self,
1658                    const BlastCompo_SequenceRange * s_range,
1659                    BlastCompo_SequenceData * seqData,
1660                    const BlastCompo_SequenceData * query,
1661                    const BlastCompo_SequenceRange * q_range,
1662                    BlastCompo_SequenceData * queryData,
1663                    const Uint8* query_words,
1664                    const BlastCompo_Alignment *align,
1665                    const Boolean shouldTestIdentical,
1666                    const ECompoAdjustModes compo_adjust_mode,
1667                    const Boolean isSmithWaterman,
1668                    Boolean* subject_maybe_biased)
1669 {
1670     Int4 idx;
1671     BlastKappa_SequenceInfo * seq_info = self->local_data;
1672     Uint1 *origData = query->data + q_range->begin;
1673     /* Copy the query sequence (necessary for SEG filtering.) */
1674     queryData->length = q_range->end - q_range->begin;
1675     queryData->buffer = calloc((queryData->length + 2), sizeof(Uint1));
1676     queryData->data   = queryData->buffer + 1;
1677 
1678     for (idx = 0;  idx < queryData->length;  idx++) {
1679         /* Copy the sequence data, replacing occurrences of amino acid
1680          * number 24 (Selenocysteine) with number 3 (Cysteine). */
1681         queryData->data[idx] = (origData[idx] != 24) ? origData[idx] : 3;
1682     }
1683     if (seq_info && seq_info->prog_number ==  eBlastTypeTblastn) {
1684         /* The sequence must be translated. */
1685         return s_SequenceGetTranslatedRange(self, s_range, seqData,
1686                                             q_range, queryData, query_words,
1687                                             align, shouldTestIdentical,
1688                                             compo_adjust_mode, isSmithWaterman,
1689                                             subject_maybe_biased);
1690     } else {
1691         return s_SequenceGetProteinRange(self, s_range, seqData,
1692                                          q_range, queryData, query_words,
1693                                          align, shouldTestIdentical,
1694                                          compo_adjust_mode, isSmithWaterman,
1695                                          subject_maybe_biased);
1696     }
1697 }
1698 
1699 
1700 /** Data and data-structures needed to perform a gapped alignment */
1701 typedef struct BlastKappa_GappingParamsContext {
1702     const BlastScoringParameters*
1703         scoringParams;                /**< scoring parameters for a
1704                                            gapped alignment */
1705     BlastGapAlignStruct * gap_align;  /**< additional parameters for a
1706                                            gapped alignment */
1707     BlastScoreBlk* sbp;               /**< the score block for this search */
1708     double localScalingFactor;        /**< the amount by which this
1709                                            search has been scaled */
1710     EBlastProgramType prog_number;    /**< the type of search being
1711                                            performed */
1712 } BlastKappa_GappingParamsContext;
1713 
1714 
1715 /**
1716  * Reads a BlastGapAlignStruct that has been used to compute a
1717  * traceback, and return a BlastCompo_Alignment representing the
1718  * alignment.  The BlastGapAlignStruct is in coordinates local to the
1719  * ranges being aligned; the resulting alignment is in coordinates w.r.t.
1720  * the whole query and subject.
1721  *
1722  * @param gap_align         the BlastGapAlignStruct
1723  * @param *edit_script      the edit script from the alignment; on exit
1724  *                          NULL.  The edit_script is usually
1725  *                          gap_align->edit_script, but we don't want
1726  *                          an implicit side effect on the gap_align.
1727  * @param query_range       the range of the query used in this alignment
1728  * @param subject_range     the range of the subject used in this alignment
1729  * @param matrix_adjust_rule   the rule used to compute the scoring matrix
1730  *
1731  * @return the new alignment on success or NULL on error
1732  */
1733 static BlastCompo_Alignment *
s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,GapEditScript ** edit_script,BlastCompo_SequenceRange * query_range,BlastCompo_SequenceRange * subject_range,EMatrixAdjustRule matrix_adjust_rule)1734 s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,
1735                            GapEditScript ** edit_script,
1736                            BlastCompo_SequenceRange * query_range,
1737                            BlastCompo_SequenceRange * subject_range,
1738                            EMatrixAdjustRule matrix_adjust_rule)
1739 {
1740     /* parameters to BlastCompo_AlignmentNew */
1741     int queryStart, queryEnd, queryIndex, matchStart, matchEnd, frame;
1742     BlastCompo_Alignment * obj; /* the new alignment */
1743 
1744     /* In the composition_adjustment library, the query start/end are
1745        indices into the concatenated query, and so must be shifted.  */
1746     queryStart = gap_align->query_start + query_range->begin;
1747     queryEnd   = gap_align->query_stop + query_range->begin;
1748     queryIndex = query_range->context;
1749     matchStart = gap_align->subject_start + subject_range->begin;
1750     matchEnd   = gap_align->subject_stop  + subject_range->begin;
1751     frame      = subject_range->context;
1752 
1753     obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
1754                                   queryStart, queryEnd, queryIndex,
1755                                   matchStart, matchEnd, frame,
1756                                   *edit_script);
1757     if (obj != NULL) {
1758         *edit_script = NULL;
1759     }
1760     return obj;
1761 }
1762 
1763 
1764 /** A callback used when performing SmithWaterman alignments:
1765  * Calculate the traceback for one alignment by performing an x-drop
1766  * alignment in the forward direction, possibly increasing the x-drop
1767  * parameter until the desired score is attained.
1768  *
1769  * The start, end and score of the alignment should be obtained
1770  * using the Smith-Waterman algorithm before this routine is called.
1771  *
1772  * @param *pnewAlign       the new alignment
1773  * @param *pqueryEnd       on entry, the end of the alignment in the
1774  *                         query, as computed by the Smith-Waterman
1775  *                         algorithm.  On exit, the end as computed by
1776  *                         the x-drop algorithm
1777  * @param *pmatchEnd       like as *pqueryEnd, but for the subject
1778  *                         sequence
1779  * @param queryStart       the starting point in the query
1780  * @param matchStart       the starting point in the subject
1781  * @param score            the score of the alignment, as computed by
1782  *                         the Smith-Waterman algorithm
1783  * @param query            query sequence data
1784  * @param query_range      range of this query in the concatenated
1785  *                         query
1786  * @param ccat_query_length   total length of the concatenated query
1787  * @param subject          subject sequence data
1788  * @param subject_range    range of subject_data in the translated
1789  *                         query, in amino acid coordinates
1790  * @param full_subject_length   length of the full subject sequence
1791  * @param gapping_params        parameters used to compute gapped
1792  *                              alignments
1793  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1794  *
1795  * @returns 0   (posts a fatal error if it fails)
1796  * @sa new_xdrop_align_type
1797  */
1798 static int
s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,Int4 * pqueryEnd,Int4 * pmatchEnd,Int4 queryStart,Int4 matchStart,Int4 score,BlastCompo_SequenceData * query,BlastCompo_SequenceRange * query_range,Int4 ccat_query_length,BlastCompo_SequenceData * subject,BlastCompo_SequenceRange * subject_range,Int4 full_subject_length,BlastCompo_GappingParams * gapping_params,EMatrixAdjustRule matrix_adjust_rule)1799 s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
1800                          Int4 * pqueryEnd, Int4 *pmatchEnd,
1801                          Int4 queryStart, Int4 matchStart, Int4 score,
1802                          BlastCompo_SequenceData * query,
1803                          BlastCompo_SequenceRange * query_range,
1804                          Int4 ccat_query_length,
1805                          BlastCompo_SequenceData * subject,
1806                          BlastCompo_SequenceRange * subject_range,
1807                          Int4 full_subject_length,
1808                          BlastCompo_GappingParams * gapping_params,
1809                          EMatrixAdjustRule matrix_adjust_rule)
1810 {
1811     Int4 newScore;
1812     /* Extent of the alignment as computed by an x-drop alignment
1813      * (usually the same as (queryEnd - queryStart) and (matchEnd -
1814      * matchStart)) */
1815     Int4 queryExtent, matchExtent;
1816     BlastCompo_Alignment * obj = NULL;  /* the new object */
1817     /* BLAST-specific parameters needed compute an X-drop alignment */
1818     BlastKappa_GappingParamsContext * context = gapping_params->context;
1819     /* Auxiliarly structure for computing gapped alignments */
1820     BlastGapAlignStruct * gap_align = context->gap_align;
1821     /* Scoring parameters for gapped alignments */
1822     const BlastScoringParameters* scoringParams = context->scoringParams;
1823     /* A structure containing the traceback of a gapped alignment */
1824     GapEditScript* editScript = NULL;
1825 
1826     /* suppress unused parameter warnings; this is a callback
1827        function, so these parameter cannot be deleted */
1828     (void) ccat_query_length;
1829     (void) full_subject_length;
1830 
1831     gap_align->gap_x_dropoff = gapping_params->x_dropoff;
1832 
1833     s_SWFindFinalEndsUsingXdrop(query,   queryStart, *pqueryEnd,
1834                                 subject, matchStart, *pmatchEnd,
1835                                 gap_align, scoringParams,
1836                                 score, &queryExtent, &matchExtent,
1837                                 &newScore);
1838     *pqueryEnd = queryStart + queryExtent;
1839     *pmatchEnd = matchStart + matchExtent;
1840 
1841     editScript =
1842         Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback,
1843                                              gap_align->fwd_prelim_tback);
1844     if (editScript != NULL) {
1845         /* Shifted values of the endpoints */
1846         Int4 aqueryStart =  queryStart + query_range->begin;
1847         Int4 aqueryEnd   = *pqueryEnd  + query_range->begin;
1848         Int4 amatchStart =  matchStart + subject_range->begin;
1849         Int4 amatchEnd   = *pmatchEnd  + subject_range->begin;
1850 
1851         obj = BlastCompo_AlignmentNew(newScore, matrix_adjust_rule,
1852                                       aqueryStart, aqueryEnd,
1853                                       query_range->context,
1854                                       amatchStart, amatchEnd,
1855                                       subject_range->context, editScript);
1856         if (obj == NULL) {
1857             GapEditScriptDelete(editScript);
1858         }
1859     }
1860     *pnewAlign = obj;
1861 
1862     return obj != NULL ? 0 : -1;
1863 }
1864 
1865 
1866 /**
1867  * A callback: calculate the traceback for one alignment by
1868  * performing an x-drop alignment in both directions
1869  *
1870  * @param in_align         the existing alignment, without traceback
1871  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1872  * @param query_data       query sequence data
1873  * @param query_range      range of this query in the concatenated
1874  *                         query
1875  * @param ccat_query_length   total length of the concatenated query
1876  * @param subject_data     subject sequence data
1877  * @param subject_range    range of subject_data in the translated
1878  *                         query, in amino acid coordinates
1879  * @param full_subject_length   length of the full subject sequence
1880  * @param gapping_params        parameters used to compute gapped
1881  *                              alignments
1882  * @sa redo_one_alignment_type
1883  */
1884 static BlastCompo_Alignment *
s_RedoOneAlignment(BlastCompo_Alignment * in_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceData * query_data,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceData * subject_data,BlastCompo_SequenceRange * subject_range,int full_subject_length,BlastCompo_GappingParams * gapping_params)1885 s_RedoOneAlignment(BlastCompo_Alignment * in_align,
1886                    EMatrixAdjustRule matrix_adjust_rule,
1887                    BlastCompo_SequenceData * query_data,
1888                    BlastCompo_SequenceRange * query_range,
1889                    int ccat_query_length,
1890                    BlastCompo_SequenceData * subject_data,
1891                    BlastCompo_SequenceRange * subject_range,
1892                    int full_subject_length,
1893                    BlastCompo_GappingParams * gapping_params)
1894 {
1895     int status;                /* return code */
1896     Int4 q_start, s_start;     /* starting point in query and subject */
1897     /* BLAST-specific parameters needed to compute a gapped alignment */
1898     BlastKappa_GappingParamsContext * context = gapping_params->context;
1899     /* Auxiliary structure for computing gapped alignments */
1900     BlastGapAlignStruct* gapAlign = context->gap_align;
1901     /* The preliminary gapped HSP that were are recomputing */
1902     BlastHSP * hsp = in_align->context;
1903     Boolean fence_hit = FALSE;
1904 
1905     /* suppress unused parameter warnings; this is a callback
1906        function, so these parameter cannot be deleted */
1907     (void) ccat_query_length;
1908     (void) full_subject_length;
1909 
1910     /* Use the starting point supplied by the HSP. */
1911     q_start = hsp->query.gapped_start - query_range->begin;
1912     s_start = hsp->subject.gapped_start - subject_range->begin;
1913 
1914     gapAlign->gap_x_dropoff = gapping_params->x_dropoff;
1915 
1916     /*
1917      * Previously, last argument was NULL which could cause problems for
1918      * tblastn.
1919      */
1920     status =
1921         BLAST_GappedAlignmentWithTraceback(context->prog_number,
1922                                            query_data->data,
1923                                            subject_data->data, gapAlign,
1924                                            context->scoringParams,
1925                                            q_start, s_start,
1926                                            query_data->length,
1927                                            subject_data->length,
1928                                            &fence_hit);
1929     if (status == 0) {
1930         return s_NewAlignmentFromGapAlign(gapAlign, &gapAlign->edit_script,
1931                                           query_range, subject_range,
1932                                           matrix_adjust_rule);
1933     } else {
1934         return NULL;
1935     }
1936 }
1937 
1938 
1939 /**
1940  * A BlastKappa_SavedParameters holds the value of certain search
1941  * parameters on entry to RedoAlignmentCore.  These values are
1942  * restored on exit.
1943  */
1944 typedef struct BlastKappa_SavedParameters {
1945     Int4          gap_open;    /**< a penalty for the existence of a gap */
1946     Int4          gapExtend;   /**< a penalty for each residue in the
1947                                     gap */
1948     double        scale_factor;     /**< the original scale factor */
1949     Int4 **origMatrix;              /**< The original matrix values */
1950     double original_expect_value;   /**< expect value on entry */
1951     /** copy of the original gapped Karlin-Altschul block
1952      * corresponding to the first context */
1953     Blast_KarlinBlk** kbp_gap_orig;
1954     Int4             num_queries;   /**< Number of queries in this search */
1955 } BlastKappa_SavedParameters;
1956 
1957 
1958 /**
1959  * Release the data associated with a BlastKappa_SavedParameters and
1960  * delete the object
1961  * @param searchParams the object to be deleted [in][out]
1962  */
1963 static void
s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)1964 s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)
1965 {
1966     /* for convenience, remove one level of indirection from searchParams */
1967     BlastKappa_SavedParameters *sp = *searchParams;
1968 
1969     if (sp != NULL) {
1970         if (sp->kbp_gap_orig != NULL) {
1971             int i;
1972             for (i = 0;  i < sp->num_queries;  i++) {
1973                 if (sp->kbp_gap_orig[i] != NULL)
1974                     Blast_KarlinBlkFree(sp->kbp_gap_orig[i]);
1975             }
1976             free(sp->kbp_gap_orig);
1977         }
1978         if (sp->origMatrix != NULL)
1979             Nlm_Int4MatrixFree(&sp->origMatrix);
1980     }
1981     sfree(*searchParams);
1982     *searchParams = NULL;
1983 }
1984 
1985 
1986 /**
1987  * Create a new instance of BlastKappa_SavedParameters
1988  *
1989  * @param rows               number of rows in the scoring matrix
1990  * @param numQueries         number of queries in this search
1991  * @param compo_adjust_mode  if >0, use composition-based statistics
1992  * @param positionBased      if true, the search is position-based
1993  */
1994 static BlastKappa_SavedParameters *
s_SavedParametersNew(Int4 rows,Int4 numQueries,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)1995 s_SavedParametersNew(Int4 rows,
1996                      Int4 numQueries,
1997                      ECompoAdjustModes compo_adjust_mode,
1998                      Boolean positionBased)
1999 {
2000     int i;
2001     BlastKappa_SavedParameters *sp;   /* the new object */
2002     sp = malloc(sizeof(BlastKappa_SavedParameters));
2003 
2004     if (sp == NULL) {
2005         goto error_return;
2006     }
2007     sp->kbp_gap_orig       = NULL;
2008     sp->origMatrix         = NULL;
2009 
2010     sp->kbp_gap_orig = calloc(numQueries, sizeof(Blast_KarlinBlk*));
2011     if (sp->kbp_gap_orig == NULL) {
2012         goto error_return;
2013     }
2014     sp->num_queries = numQueries;
2015     for (i = 0;  i < numQueries;  i++) {
2016         sp->kbp_gap_orig[i] = NULL;
2017     }
2018     if (compo_adjust_mode != eNoCompositionBasedStats) {
2019         if (positionBased) {
2020             sp->origMatrix = Nlm_Int4MatrixNew(rows, BLASTAA_SIZE);
2021         } else {
2022             sp->origMatrix = Nlm_Int4MatrixNew(BLASTAA_SIZE, BLASTAA_SIZE);
2023         }
2024         if (sp->origMatrix == NULL)
2025             goto error_return;
2026     }
2027     return sp;
2028 error_return:
2029     s_SavedParametersFree(&sp);
2030     return NULL;
2031 }
2032 
2033 
2034 /**
2035  * Record the initial value of the search parameters that are to be
2036  * adjusted.
2037  *
2038  * @param searchParams       holds the recorded values [out]
2039  * @param sbp                a score block [in]
2040  * @param scoring            gapped alignment parameters [in]
2041  * @param query_length       length of the concatenated query [in]
2042  * @param compo_adjust_mode  composition adjustment mode [in]
2043  * @param positionBased     is this search position-based [in]
2044  */
2045 static int
s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,BlastScoreBlk * sbp,const BlastScoringParameters * scoring,int query_length,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)2046 s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,
2047                       BlastScoreBlk* sbp,
2048                       const BlastScoringParameters* scoring,
2049                       int query_length,
2050                       ECompoAdjustModes compo_adjust_mode,
2051                       Boolean positionBased)
2052 {
2053     int i;
2054 
2055     searchParams->gap_open     = scoring->gap_open;
2056     searchParams->gapExtend    = scoring->gap_extend;
2057     searchParams->scale_factor = scoring->scale_factor;
2058 
2059     for (i = 0;  i < searchParams->num_queries;  i++) {
2060         if (sbp->kbp_gap[i] != NULL) {
2061             /* There is a kbp_gap for query i and it must be copied */
2062             searchParams->kbp_gap_orig[i] = Blast_KarlinBlkNew();
2063             if (searchParams->kbp_gap_orig[i] == NULL) {
2064                 return -1;
2065             }
2066             Blast_KarlinBlkCopy(searchParams->kbp_gap_orig[i],
2067                                 sbp->kbp_gap[i]);
2068         }
2069     }
2070 
2071     if (compo_adjust_mode != eNoCompositionBasedStats) {
2072         Int4 **matrix;              /* scoring matrix */
2073         int j;                      /* iteration index */
2074         int rows;                   /* number of rows in matrix */
2075         if (positionBased) {
2076             matrix = sbp->psi_matrix->pssm->data;
2077             rows = query_length;
2078         } else {
2079             matrix = sbp->matrix->data;
2080             rows = BLASTAA_SIZE;
2081         }
2082 
2083         for (i = 0;  i < rows;  i++) {
2084             for (j = 0;  j < BLASTAA_SIZE;  j++) {
2085                 searchParams->origMatrix[i][j] = matrix[i][j];
2086             }
2087         }
2088     }
2089     return 0;
2090 }
2091 
2092 
2093 /**
2094  * Rescale the search parameters in the search object and options
2095  * object to obtain more precision.
2096  *
2097  * @param sbp               score block to be rescaled
2098  * @param sp                scoring parameters to be rescaled
2099  * @param num_queries       number of queries in this search
2100  * @param scale_factor      amount by which to scale this search
2101  */
2102 static void
s_RescaleSearch(BlastScoreBlk * sbp,BlastScoringParameters * sp,int num_queries,double scale_factor)2103 s_RescaleSearch(BlastScoreBlk* sbp,
2104                 BlastScoringParameters* sp,
2105                 int num_queries,
2106                 double scale_factor)
2107 {
2108     int i;
2109     for (i = 0;  i < num_queries;  i++) {
2110         if (sbp->kbp_gap[i] != NULL) {
2111             Blast_KarlinBlk * kbp = sbp->kbp_gap[i];
2112             kbp->Lambda /= scale_factor;
2113             kbp->logK = log(kbp->K);
2114         }
2115     }
2116 
2117     sp->gap_open = BLAST_Nint(sp->gap_open  * scale_factor);
2118     sp->gap_extend = BLAST_Nint(sp->gap_extend * scale_factor);
2119     sp->scale_factor = scale_factor;
2120 }
2121 
2122 
2123 /**
2124  * Restore the parameters that were adjusted to their original values.
2125  *
2126  * @param sbp                the score block to be restored
2127  * @param scoring            the scoring parameters to be restored
2128  * @param searchParams       the initial recorded values of the parameters
2129  * @param query_length       the concatenated query length
2130  * @param positionBased      is this search position-based
2131  * @param compo_adjust_mode  mode of composition adjustment
2132  */
2133 static void
s_RestoreSearch(BlastScoreBlk * sbp,BlastScoringParameters * scoring,const BlastKappa_SavedParameters * searchParams,int query_length,Boolean positionBased,ECompoAdjustModes compo_adjust_mode)2134 s_RestoreSearch(BlastScoreBlk* sbp,
2135                 BlastScoringParameters* scoring,
2136                 const BlastKappa_SavedParameters * searchParams,
2137                 int query_length,
2138                 Boolean positionBased,
2139                 ECompoAdjustModes compo_adjust_mode)
2140 {
2141     int i;
2142 
2143     scoring->gap_open = searchParams->gap_open;
2144     scoring->gap_extend = searchParams->gapExtend;
2145     scoring->scale_factor = searchParams->scale_factor;
2146 
2147     for (i = 0;  i < searchParams->num_queries;  i++) {
2148         if (sbp->kbp_gap[i] != NULL) {
2149             Blast_KarlinBlkCopy(sbp->kbp_gap[i],
2150                                 searchParams->kbp_gap_orig[i]);
2151         }
2152     }
2153     if(compo_adjust_mode != eNoCompositionBasedStats) {
2154         int  j;             /* iteration index */
2155         Int4 ** matrix;     /* matrix to be restored */
2156         int rows;           /* number of rows in the matrix */
2157 
2158         if (positionBased) {
2159             matrix = sbp->psi_matrix->pssm->data;
2160             rows = query_length;
2161         } else {
2162             matrix = sbp->matrix->data;
2163             rows = BLASTAA_SIZE;
2164         }
2165         for (i = 0;  i < rows;  i++) {
2166             for (j = 0;  j < BLASTAA_SIZE;  j++) {
2167                 matrix[i][j] = searchParams->origMatrix[i][j];
2168             }
2169         }
2170     }
2171 }
2172 
2173 
2174 /**
2175  * Initialize an object of type Blast_MatrixInfo.
2176  *
2177  * @param self            object being initialized
2178  * @param queryBlk        the query sequence data
2179  * @param sbp             score block for this search
2180  * @param scale_factor    amount by which ungapped parameters should be
2181  *                        scaled
2182  * @param matrixName      name of the matrix
2183  */
2184 static int
s_MatrixInfoInit(Blast_MatrixInfo * self,BLAST_SequenceBlk * queryBlk,BlastScoreBlk * sbp,double scale_factor,const char * matrixName)2185 s_MatrixInfoInit(Blast_MatrixInfo * self,
2186                  BLAST_SequenceBlk* queryBlk,
2187                  BlastScoreBlk* sbp,
2188                  double scale_factor,
2189                  const char * matrixName)
2190 {
2191     int status = 0;    /* return status */
2192     int lenName;       /* length of matrixName as a string */
2193 
2194     /* copy the matrix name (strdup is not standard C) */
2195     lenName = strlen(matrixName);
2196     if (NULL == (self->matrixName = malloc(lenName + 1))) {
2197         return -1;
2198     }
2199     memcpy(self->matrixName, matrixName, lenName + 1);
2200 
2201     if (self->positionBased) {
2202         status = s_GetPosBasedStartFreqRatios(self->startFreqRatios,
2203                                               queryBlk->length,
2204                                               queryBlk->sequence,
2205                                               matrixName,
2206                                               sbp->psi_matrix->freq_ratios);
2207         if (status == 0) {
2208             status = s_ScalePosMatrix(self->startMatrix, matrixName,
2209                                       sbp->psi_matrix->freq_ratios,
2210                                       queryBlk->sequence,
2211                                       queryBlk->length, sbp, scale_factor);
2212             self->ungappedLambda = sbp->kbp_psi[0]->Lambda / scale_factor;
2213         }
2214     } else {
2215         self->ungappedLambda = sbp->kbp_ideal->Lambda / scale_factor;
2216         status = s_GetStartFreqRatios(self->startFreqRatios, matrixName);
2217         if (status == 0) {
2218             Blast_Int4MatrixFromFreq(self->startMatrix, self->cols,
2219                                      self->startFreqRatios,
2220                                      self->ungappedLambda);
2221         }
2222     }
2223     return status;
2224 }
2225 
2226 
2227 /* Create an array of 8-mers for a sequence, such that index of each 8-mer
2228    is the same as its position in the query */
2229 static int
s_CreateWordArray(const Uint1 * seq_data,Int4 seq_len,Uint8 ** words)2230 s_CreateWordArray(const Uint1* seq_data, Int4 seq_len, Uint8** words)
2231 {
2232     int word_size = 8;         /* word size for k-mer matching */
2233     Uint8* query_hashes;       /* list of hashes for query words */
2234     Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
2235                                                     values */
2236     int i;
2237 
2238     /* if query or subject length is smaller than word size, exit */
2239     if (!seq_data || !words || seq_len < word_size) {
2240         return -1;
2241     }
2242 
2243     query_hashes = (Uint8*)calloc((seq_len - word_size + 1),
2244                                   sizeof(Uint8));
2245     *words = query_hashes;
2246 
2247     if (!query_hashes) {
2248         return -1;
2249     }
2250 
2251 
2252     /* find query word hashes */
2253     query_hashes[0] = s_GetHash(&seq_data[0], word_size);
2254     for (i = 1; i < seq_len - word_size; i++) {
2255         query_hashes[i] = query_hashes[i - 1];
2256         query_hashes[i] <<= 5;
2257         query_hashes[i] &= mask;
2258         query_hashes[i] += (Uint8)seq_data[i + word_size - 1];
2259     }
2260 
2261     return 0;
2262 }
2263 
2264 
s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo ** query_info,int num_queries)2265 static void s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo** query_info,
2266                                             int num_queries)
2267 {
2268     int i;
2269 
2270     if (!query_info) {
2271         return;
2272     }
2273 
2274     for (i = 0;i < num_queries;i++) {
2275         if ((*query_info)[i].words) {
2276             free((*query_info)[i].words);
2277         }
2278     }
2279 
2280     free(*query_info);
2281     *query_info = NULL;
2282 }
2283 
2284 /**
2285  * Save information about all queries in an array of objects of type
2286  * BlastCompo_QueryInfo.
2287  *
2288  * @param query_data        query sequence data
2289  * @param blast_query_info  information about all queries, as an
2290  *                          internal blast data structure
2291  *
2292  * @return the new array on success, or NULL on error
2293  */
2294 static BlastCompo_QueryInfo *
s_GetQueryInfo(Uint1 * query_data,const BlastQueryInfo * blast_query_info,Boolean skip)2295 s_GetQueryInfo(Uint1 * query_data, const BlastQueryInfo * blast_query_info, Boolean skip)
2296 {
2297     int i;                   /* loop index */
2298     BlastCompo_QueryInfo *
2299         compo_query_info;    /* the new array */
2300     int num_queries;         /* the number of queries/elements in
2301                                 compo_query_info */
2302 
2303     num_queries = blast_query_info->last_context + 1;
2304     compo_query_info = calloc(num_queries, sizeof(BlastCompo_QueryInfo));
2305     if (compo_query_info != NULL) {
2306         for (i = 0;  i < num_queries;  i++) {
2307             BlastCompo_QueryInfo * query_info = &compo_query_info[i];
2308             const BlastContextInfo * query_context = &blast_query_info->contexts[i];
2309 
2310             query_info->eff_search_space =
2311                 (double) query_context->eff_searchsp;
2312             query_info->origin = query_context->query_offset;
2313             query_info->seq.data = &query_data[query_info->origin];
2314             query_info->seq.length = query_context->query_length;
2315             query_info->words = NULL;
2316 
2317             s_CreateWordArray(query_info->seq.data, query_info->seq.length,
2318                               &query_info->words);
2319             if (! skip) {
2320                 Blast_ReadAaComposition(&query_info->composition, BLASTAA_SIZE,
2321                                         query_info->seq.data,
2322                                         query_info->seq.length);
2323             }
2324         }
2325     }
2326     return compo_query_info;
2327 }
2328 
2329 
2330 /**
2331  * Create a new object of type BlastCompo_GappingParams.  The new
2332  * object contains the parameters needed by the composition adjustment
2333  * library to compute a gapped alignment.
2334  *
2335  * @param context     the data structures needed by callback functions
2336  *                    that perform the gapped alignments.
2337  * @param extendParams parameters used for a gapped extension
2338  * @param num_queries  the number of queries in the concatenated query
2339  */
2340 static BlastCompo_GappingParams *
s_GappingParamsNew(BlastKappa_GappingParamsContext * context,const BlastExtensionParameters * extendParams,int num_queries)2341 s_GappingParamsNew(BlastKappa_GappingParamsContext * context,
2342                    const BlastExtensionParameters* extendParams,
2343                    int num_queries)
2344 {
2345     int i;
2346     double min_lambda = DBL_MAX;   /* smallest gapped Lambda */
2347     const BlastScoringParameters * scoring = context->scoringParams;
2348     const BlastExtensionOptions * options = extendParams->options;
2349     /* The new object */
2350     BlastCompo_GappingParams * gapping_params = NULL;
2351 
2352     gapping_params = malloc(sizeof(BlastCompo_GappingParams));
2353     if (gapping_params == NULL)
2354 	return NULL;
2355 
2356     gapping_params->gap_open = scoring->gap_open;
2357     gapping_params->gap_extend = scoring->gap_extend;
2358     gapping_params->context = context;
2359 
2360     for (i = 0;  i < num_queries;  i++) {
2361         if (context->sbp->kbp_gap[i] != NULL &&
2362             context->sbp->kbp_gap[i]->Lambda < min_lambda) {
2363             min_lambda = context->sbp->kbp_gap[i]->Lambda;
2364         }
2365     }
2366     gapping_params->x_dropoff = (Int4)
2367         MAX(options->gap_x_dropoff_final*NCBIMATH_LN2 / min_lambda,
2368             extendParams->gap_x_dropoff_final);
2369     context->gap_align->gap_x_dropoff = gapping_params->x_dropoff;
2370 
2371     return gapping_params;
2372 }
2373 
2374 
2375 /** Callbacks used by the Blast_RedoOneMatch* routines */
2376 static const Blast_RedoAlignCallbacks
2377 redo_align_callbacks = {
2378     s_CalcLambda, s_SequenceGetRange, s_RedoOneAlignment,
2379     s_NewAlignmentUsingXdrop, s_FreeEditScript
2380 };
2381 
2382 
2383 /* Bit score per alignment position threshold for preliminaru near identical
2384    test */
2385 #define NEAR_IDENTICAL_BITS_PER_POSITION (1.74)
2386 
2387 /**
2388  * Read the parameters required for the Blast_RedoOneMatch* functions from
2389  * the corresponding parameters in standard BLAST datatypes.  Return a new
2390  * object representing these parameters.
2391  */
2392 static Blast_RedoAlignParams *
s_GetAlignParams(BlastKappa_GappingParamsContext * context,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,const BlastHitSavingParameters * hitParams,const BlastExtensionParameters * extendParams)2393 s_GetAlignParams(BlastKappa_GappingParamsContext * context,
2394                  BLAST_SequenceBlk * queryBlk,
2395                  const BlastQueryInfo* queryInfo,
2396                  const BlastHitSavingParameters* hitParams,
2397                  const BlastExtensionParameters* extendParams)
2398 {
2399     int status = 0;    /* status code */
2400     int rows;          /* number of rows in the scoring matrix */
2401     int cutoff_s;      /* cutoff score for saving an alignment */
2402     double cutoff_e;   /* cutoff evalue for saving an alignment */
2403     BlastCompo_GappingParams *
2404         gapping_params = NULL;    /* parameters needed to compute a gapped
2405                                      alignment */
2406     Blast_MatrixInfo *
2407         scaledMatrixInfo;         /* information about the scoring matrix */
2408     /* does this kind of search translate the database sequence */
2409     int subject_is_translated = (context->prog_number == eBlastTypeTblastn) || (context->prog_number == eBlastTypeRpsTblastn);
2410     int query_is_translated   = context->prog_number == eBlastTypeBlastx;
2411     /* is this a positiion-based search */
2412     Boolean positionBased = (Boolean) (context->sbp->psi_matrix != NULL);
2413     /* will BLAST_LinkHsps be called to assign e-values */
2414     Boolean do_link_hsps = (hitParams->do_sum_stats);
2415     ECompoAdjustModes compo_adjust_mode =
2416         (ECompoAdjustModes) extendParams->options->compositionBasedStats;
2417 
2418     /* per position bit score cutoff for testing whether sequences are
2419        near identical */
2420     double near_identical_cutoff_bits = NEAR_IDENTICAL_BITS_PER_POSITION;
2421 
2422     /* score block is already scaled by context->localScalingFactor */
2423     double near_identical_cutoff=0;
2424     Int4 index;
2425     for (index = queryInfo->first_context;
2426                      index <= queryInfo->last_context; ++index) {
2427 
2428         if ((queryInfo->contexts[index].is_valid)) {
2429     		near_identical_cutoff =
2430        		 (near_identical_cutoff_bits * NCBIMATH_LN2)
2431         		/ context->sbp->kbp_gap[index]->Lambda;
2432 		break;
2433 	}
2434     }
2435 
2436     if (do_link_hsps) {
2437         ASSERT(hitParams->link_hsp_params != NULL);
2438         cutoff_s =
2439             (int) (hitParams->cutoff_score_min * context->localScalingFactor);
2440     } else {
2441         /* There is no cutoff score; we consider e-values instead */
2442         cutoff_s = 1;
2443     }
2444     cutoff_e = hitParams->options->expect_value;
2445     rows = positionBased ? queryInfo->max_length : BLASTAA_SIZE;
2446     scaledMatrixInfo = Blast_MatrixInfoNew(rows, BLASTAA_SIZE, positionBased);
2447     status = s_MatrixInfoInit(scaledMatrixInfo, queryBlk, context->sbp,
2448                               context->localScalingFactor,
2449                               context->scoringParams->options->matrix);
2450     if (status != 0) {
2451         return NULL;
2452     }
2453     gapping_params = s_GappingParamsNew(context, extendParams,
2454                                         queryInfo->last_context + 1);
2455     if (gapping_params == NULL) {
2456         return NULL;
2457     } else {
2458         return
2459             Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
2460                                      compo_adjust_mode, positionBased,
2461                                      query_is_translated,
2462                                      subject_is_translated,
2463                                      queryInfo->max_length, cutoff_s, cutoff_e,
2464                                      do_link_hsps, &redo_align_callbacks,
2465                                      near_identical_cutoff);
2466     }
2467 }
2468 
2469 
2470 /**
2471  * Convert an array of BlastCompo_Heap objects to a BlastHSPResults structure.
2472  *
2473  * @param results        BLAST core external results structure (pre-SeqAlign)
2474  *                       [out]
2475  * @param heaps          an array of BlastCompo_Heap objects
2476  * @param hitlist_size   size of each list in the results structure above [in]
2477  */
2478 static void
s_FillResultsFromCompoHeaps(BlastHSPResults * results,BlastCompo_Heap heaps[],Int4 hitlist_size)2479 s_FillResultsFromCompoHeaps(BlastHSPResults * results,
2480                             BlastCompo_Heap heaps[],
2481                             Int4 hitlist_size)
2482 {
2483     int query_index;   /* loop index */
2484     int num_queries;   /* Number of queries in this search */
2485 
2486     num_queries = results->num_queries;
2487     for (query_index = 0;  query_index < num_queries;  query_index++) {
2488         BlastHSPList* hsp_list;
2489         BlastHitList* hitlist;
2490         BlastCompo_Heap * heap = &heaps[query_index];
2491 
2492         results->hitlist_array[query_index] = Blast_HitListNew(hitlist_size);
2493         hitlist = results->hitlist_array[query_index];
2494 
2495         while (NULL != (hsp_list = BlastCompo_HeapPop(heap))) {
2496             Blast_HitListUpdate(hitlist, hsp_list);
2497         }
2498     }
2499     Blast_HSPResultsReverseOrder(results);
2500 }
2501 
2502 
2503 /** Remove all matches from a BlastCompo_Heap. */
s_ClearHeap(BlastCompo_Heap * self)2504 static void s_ClearHeap(BlastCompo_Heap * self)
2505 {
2506     BlastHSPList* hsp_list = NULL;   /* an element of the heap */
2507 
2508     while (NULL != (hsp_list = BlastCompo_HeapPop(self))) {
2509         hsp_list = Blast_HSPListFree(hsp_list);
2510     }
2511 }
2512 
2513 extern void
2514 BLAST_SetupPartialFetching(EBlastProgramType program_number,
2515                            BlastSeqSrc* seq_src,
2516                            const BlastHSPList** hsp_list,
2517                            Int4 num_hsplists);
2518 
2519 /**
2520  * Free a BlastGapAlignStruct copy created by s_BlastGapAlignStruct_Copy
2521  *
2522  * @param copy Pointer to BlastGapAlignStruct to be freed
2523  */
s_BlastGapAlignStruct_Free(BlastGapAlignStruct * copy)2524 static void s_BlastGapAlignStruct_Free(BlastGapAlignStruct* copy)
2525 {
2526     {
2527         while (copy->state_struct != NULL) {
2528             GapStateArrayStruct* cur = copy->state_struct;
2529             copy->state_struct = copy->state_struct->next;
2530             if (cur->state_array) {
2531                 sfree(cur->state_array);
2532             }
2533             if (cur) {
2534                 sfree(cur);
2535             }
2536         }
2537     }
2538     {
2539         if (copy->edit_script != NULL) {
2540             if (copy->edit_script->op_type) {
2541                 sfree(copy->edit_script->op_type);
2542             }
2543             if (copy->edit_script->num) {
2544                 sfree(copy->edit_script->num);
2545             }
2546             sfree(copy->edit_script);
2547         }
2548     }
2549     {
2550         if (copy->fwd_prelim_tback != NULL) {
2551             if (copy->fwd_prelim_tback->edit_ops) {
2552                 sfree(copy->fwd_prelim_tback->edit_ops);
2553             }
2554             sfree(copy->fwd_prelim_tback);
2555         }
2556     }
2557     {
2558         if (copy->rev_prelim_tback != NULL) {
2559             if (copy->rev_prelim_tback->edit_ops) {
2560                 sfree(copy->rev_prelim_tback->edit_ops);
2561             }
2562             sfree(copy->rev_prelim_tback);
2563         }
2564     }
2565     {
2566         if (copy->greedy_align_mem != NULL) {
2567             sfree(copy->greedy_align_mem);
2568         }
2569     }
2570     {
2571         if (copy->dp_mem != NULL) {
2572             sfree(copy->dp_mem);
2573         }
2574     }
2575     {
2576         if (copy->sbp != NULL) {
2577             sfree(copy->sbp);
2578         }
2579     }
2580     sfree(copy);
2581 }
2582 
2583 /**
2584  * Create a "deep" copy of a BlastGapAlignStruct structure.
2585  *
2586  * Non-pointer structure members are copied.  Pointers to data which will
2587  * only be read are copied.  For data which will be changing, memory for copies
2588  * will be allocated and new pointers will be assigned to them.  The process
2589  * repeats down the structure hierarchy until all pointers are dealt with.
2590  *
2591  * @param orig Pointer to BlastGapAlignStruct structure to be copied
2592  * @param sbp  Pointer to BlastScoreBlk structure, required to set copy->sbp
2593  *
2594  * @return Pointer to copy of original BlastGapAlignStruct structure
2595  */
s_BlastGapAlignStruct_Copy(BlastGapAlignStruct * orig,BlastScoreBlk * sbp)2596 static BlastGapAlignStruct* s_BlastGapAlignStruct_Copy(
2597         BlastGapAlignStruct* orig,
2598         BlastScoreBlk* sbp
2599 )
2600 {
2601     BlastGapAlignStruct* copy =
2602             (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
2603 
2604     // Copy plain old data (ints, doubles, booleans, ...).
2605     // Any pointer members will be processed separately.
2606     memcpy(copy, orig, sizeof(BlastGapAlignStruct));
2607 
2608     {
2609         GapStateArrayStruct* o = orig->state_struct;
2610         if (o != NULL) {
2611             GapStateArrayStruct* c = (GapStateArrayStruct*) calloc(
2612                     1,
2613                     sizeof(GapStateArrayStruct)
2614             );
2615             copy->state_struct = c;
2616             memcpy(c, o, sizeof(GapStateArrayStruct));
2617             c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2618             int i;
2619             for (i = 0; i < c->length; ++i) {
2620                 c->state_array[i] = o->state_array[i];
2621             }
2622             while (o->next != NULL) {
2623                 c->next = (GapStateArrayStruct*)
2624                         calloc(1, sizeof(GapStateArrayStruct));
2625                 c = c->next;
2626                 o = o->next;
2627                 memcpy(c, o, sizeof(GapStateArrayStruct));
2628                 c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2629                 int i;
2630                 for (i = 0; i < c->length; ++i) {
2631                     c->state_array[i] = o->state_array[i];
2632                 }
2633             }
2634         }
2635     }
2636     {
2637         GapEditScript* o = orig->edit_script;
2638         if (o != NULL) {
2639             GapEditScript* c = (GapEditScript*) calloc(
2640                     1,
2641                     sizeof(GapEditScript)
2642             );
2643             copy->edit_script = c;
2644             memcpy(c, o, sizeof(GapEditScript));
2645             c->op_type = (EGapAlignOpType*) calloc(
2646                     o->size,
2647                     sizeof(EGapAlignOpType)
2648             );
2649             c->num = (Int4*) calloc(o->size, sizeof(Int4));
2650             int i;
2651             for (i = 0; i < o->size; ++i) {
2652                 c->op_type[i] = o->op_type[i];
2653                 c->num[i] = o->num[i];
2654             }
2655         }
2656     }
2657     {
2658         GapPrelimEditBlock* o = orig->fwd_prelim_tback;
2659         if (o != NULL) {
2660             GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2661                     1,
2662                     sizeof(GapPrelimEditBlock)
2663             );
2664             copy->fwd_prelim_tback = c;
2665             memcpy(c, o, sizeof(GapPrelimEditBlock));
2666             c->edit_ops = calloc(
2667                     o->num_ops_allocated,
2668                     sizeof(GapPrelimEditScript)
2669             );
2670             int i;
2671             for (i = 0; i < o->num_ops_allocated; ++i) {
2672                 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2673                 c->edit_ops[i].num = o->edit_ops[i].num;
2674             }
2675         }
2676     }
2677     {
2678         GapPrelimEditBlock* o = orig->rev_prelim_tback;
2679         if (o != NULL) {
2680             GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2681                     1,
2682                     sizeof(GapPrelimEditBlock)
2683             );
2684             copy->rev_prelim_tback = c;
2685             memcpy(c, o, sizeof(GapPrelimEditBlock));
2686             c->edit_ops = calloc(
2687                     o->num_ops_allocated,
2688                     sizeof(GapPrelimEditScript)
2689             );
2690             int i;
2691             for (i = 0; i < o->num_ops_allocated; ++i) {
2692                 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2693                 c->edit_ops[i].num = o->edit_ops[i].num;
2694             }
2695         }
2696     }
2697     {
2698         SGreedyAlignMem* o = orig->greedy_align_mem;
2699         if (o != NULL) {
2700             SGreedyAlignMem* c = (SGreedyAlignMem*) calloc(
2701                     1,
2702                     sizeof(SGreedyAlignMem)
2703             );
2704             copy->greedy_align_mem = c;
2705             memcpy(c, o, sizeof(SGreedyAlignMem));
2706         }
2707     }
2708     {
2709         BlastGapDP* o = orig->dp_mem;
2710         if (o != NULL) {
2711             BlastGapDP* c = (BlastGapDP*) calloc(
2712                     orig->dp_mem_alloc,
2713                     sizeof(BlastGapDP)
2714             );
2715             copy->dp_mem = c;
2716             memcpy(c, o, orig->dp_mem_alloc * sizeof(BlastGapDP));
2717         }
2718     }
2719     {
2720         copy->sbp = sbp;
2721     }
2722 
2723     return copy;
2724 }
2725 
2726 /**
2727  * Free a BlastScoreBlk copy created by s_BlastScoreBlk_Copy
2728  *
2729  * BlastScoreBlk* pointer "bsb_ptr" should be passed as (&bsb_ptr);
2730  * this function will set bsb_ptr to NULL before returning.
2731  *
2732  * @param copy Pointer to (pointer to BlastScoreBlk to be freed)
2733  */
2734 static
s_BlastScoreBlk_Free(BlastScoreBlk ** copy)2735 void s_BlastScoreBlk_Free(BlastScoreBlk** copy)
2736 {
2737     BlastScoreBlkFree(*copy);
2738     *copy = NULL;
2739 }
2740 
2741 /**
2742  * Create a "deep" copy of a BlastScoreBlk structure.
2743  *
2744  * Non-pointer structure members are copied.  Pointers to data which will
2745  * only be read are copied.  For data which will be changing, memory for copies
2746  * will be allocated and new pointers will be assigned to them.  The process
2747  * repeats down the structure hierarchy until all pointers are dealt with.
2748  *
2749  * @param program            The program type
2750  * @param orig               Pointer to BlastScoreBlk structure to be copied
2751  * @param alphabet_code      Alphabet code
2752  * @param number_of_contexts Number of contexts
2753  *
2754  * @return Pointer to copy of original BlastScoreBlk structure
2755  */
2756 static
s_BlastScoreBlk_Copy(EBlastProgramType program,BlastScoreBlk * orig,Uint1 alphabet_code,Int4 number_of_contexts)2757 BlastScoreBlk* s_BlastScoreBlk_Copy(
2758         EBlastProgramType program,
2759         BlastScoreBlk* orig,
2760         Uint1 alphabet_code,
2761         Int4 number_of_contexts
2762 )
2763 {
2764     BlastScoreBlk* copy = BlastScoreBlkNew(
2765             orig->alphabet_code,
2766             orig->number_of_contexts
2767     );
2768     if (copy == NULL) {
2769         return NULL;
2770     }
2771 
2772     copy->alphabet_start = orig->alphabet_start;
2773     copy->name = strdup(orig->name);
2774     copy->comments = orig->comments;
2775     /* Deep-copy orig->matrix */
2776     if (orig->matrix != NULL) {
2777         if (copy->matrix == NULL) {
2778             return BlastScoreBlkFree(copy);
2779         }
2780         SBlastScoreMatrix* m = copy->matrix;
2781         if (m->data != NULL  &&  orig->matrix->data != NULL) {
2782             int i;
2783             for (i = 0; i < orig->matrix->ncols; ++i) {
2784                 memcpy(
2785                         m->data[i],
2786                         orig->matrix->data[i],
2787                         m->nrows * sizeof(int)
2788                 );
2789             }
2790         }
2791         if (m->freqs != NULL  &&  orig->matrix->freqs != NULL) {
2792             memcpy(
2793                     m->freqs,
2794                     orig->matrix->freqs,
2795                     m->ncols * sizeof(double)
2796             );
2797         }
2798         m->lambda = orig->matrix->lambda;
2799     }
2800     /* Deep-copy orig->psi_matrix */
2801     if (orig->psi_matrix != NULL
2802             &&  orig->psi_matrix->pssm != NULL) {
2803         copy->psi_matrix = SPsiBlastScoreMatrixNew(orig->psi_matrix->pssm->ncols);
2804         if (copy->psi_matrix == NULL) {
2805             return BlastScoreBlkFree(copy);
2806         }
2807         SPsiBlastScoreMatrix* pm = copy->psi_matrix;
2808         SBlastScoreMatrix* m = pm->pssm;
2809         if (m->data != NULL  &&  orig->psi_matrix->pssm->data != NULL) {
2810             int i;
2811             for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2812                 memcpy(
2813                         m->data[i],
2814                         orig->psi_matrix->pssm->data[i],
2815                         m->nrows * sizeof(int)
2816                 );
2817             }
2818         }
2819         if (m->freqs != NULL
2820                 &&  orig->psi_matrix->pssm->freqs != NULL) {
2821             memcpy(
2822                     m->freqs,
2823                     orig->psi_matrix->pssm->freqs,
2824                     m->ncols * sizeof(double)
2825             );
2826         }
2827         m->lambda = orig->psi_matrix->pssm->lambda;
2828         if (pm->freq_ratios != NULL
2829                 &&  orig->psi_matrix->freq_ratios != NULL) {
2830             int i;
2831             for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2832                 memcpy(
2833                         pm->freq_ratios[i],
2834                         orig->psi_matrix->freq_ratios[i],
2835                         orig->psi_matrix->pssm->nrows * sizeof(double)
2836                 );
2837             }
2838         }
2839         if (orig->psi_matrix->kbp != NULL) {
2840             memcpy(pm->kbp, orig->psi_matrix->kbp, sizeof(Blast_KarlinBlk));
2841         }
2842     }
2843     copy->matrix_only_scoring = orig->matrix_only_scoring;
2844     copy->complexity_adjusted_scoring = orig->complexity_adjusted_scoring;
2845     copy->loscore = orig->loscore;
2846     copy->hiscore = orig->hiscore;
2847     copy->penalty = orig->penalty;
2848     copy->reward = orig->reward;
2849     copy->read_in_matrix = orig->read_in_matrix;
2850     if (Blast_QueryIsPssm(program)) {
2851         copy->kbp     = copy->kbp_psi;
2852         copy->kbp_gap = copy->kbp_gap_psi;
2853     } else {
2854         copy->kbp     = copy->kbp_std;
2855         copy->kbp_gap = copy->kbp_gap_std;
2856     }
2857     if (orig->gbp != NULL) {
2858         memcpy(copy->gbp, orig->gbp, sizeof(Blast_GumbelBlk));
2859     }
2860     int ctx;
2861     for (ctx = 0; ctx < orig->number_of_contexts; ++ctx) {
2862         if (orig->sfp != NULL  &&  orig->sfp[ctx] != NULL) {
2863             copy->sfp[ctx] = Blast_ScoreFreqNew(
2864                     orig->sfp[ctx]->score_min,
2865                     orig->sfp[ctx]->score_max
2866             );
2867             if (copy->sfp[ctx] == NULL) {
2868                 return BlastScoreBlkFree(copy);
2869             }
2870             copy->sfp[ctx]->obs_min = orig->sfp[ctx]->obs_min;
2871             copy->sfp[ctx]->obs_max = orig->sfp[ctx]->obs_max;
2872             copy->sfp[ctx]->score_avg = orig->sfp[ctx]->score_avg;
2873             int r = orig->sfp[ctx]->score_max - orig->sfp[ctx]->score_min + 1;
2874             memcpy(
2875                     copy->sfp[ctx]->sprob0,
2876                     orig->sfp[ctx]->sprob0,
2877                     r * sizeof(double)
2878             );
2879         }
2880         if (orig->kbp_std != NULL  &&  orig->kbp_std[ctx] != NULL) {
2881             copy->kbp_std[ctx] = Blast_KarlinBlkNew();
2882             if (Blast_KarlinBlkCopy(copy->kbp_std[ctx], orig->kbp_std[ctx]) != 0) {
2883                 return BlastScoreBlkFree(copy);
2884             }
2885         }
2886         if (orig->kbp_gap_std != NULL  &&  orig->kbp_gap_std[ctx] != NULL) {
2887             copy->kbp_gap_std[ctx] = Blast_KarlinBlkNew();
2888             if (Blast_KarlinBlkCopy(copy->kbp_gap_std[ctx], orig->kbp_gap_std[ctx]) != 0) {
2889                 return BlastScoreBlkFree(copy);
2890             }
2891         }
2892         if (orig->kbp_psi != NULL  &&  orig->kbp_psi[ctx] != NULL) {
2893             copy->kbp_psi[ctx] = Blast_KarlinBlkNew();
2894             if (Blast_KarlinBlkCopy(copy->kbp_psi[ctx], orig->kbp_psi[ctx]) != 0) {
2895                 return BlastScoreBlkFree(copy);
2896             }
2897         }
2898         if (orig->kbp_gap_psi != NULL  &&  orig->kbp_gap_psi[ctx] != NULL) {
2899             copy->kbp_gap_psi[ctx] = Blast_KarlinBlkNew();
2900             if (Blast_KarlinBlkCopy(copy->kbp_gap_psi[ctx], orig->kbp_gap_psi[ctx]) != 0) {
2901                 return BlastScoreBlkFree(copy);
2902             }
2903         }
2904         if (Blast_QueryIsPssm(program)) {
2905             copy->kbp[ctx]     = copy->kbp_psi[ctx];
2906             copy->kbp_gap[ctx] = copy->kbp_gap_psi[ctx];
2907         } else {
2908             copy->kbp[ctx]     = copy->kbp_std[ctx];
2909             copy->kbp_gap[ctx] = copy->kbp_gap_std[ctx];
2910         }
2911     }
2912     if (orig->kbp_ideal != NULL) {
2913         copy->kbp_ideal = Blast_KarlinBlkNew();
2914         if (Blast_KarlinBlkCopy(copy->kbp_ideal, orig->kbp_ideal) != 0) {
2915             return BlastScoreBlkFree(copy);
2916         }
2917     }
2918     copy->ambiguous_res = (Uint1*) calloc(orig->ambig_size, sizeof(Uint1));
2919     if (orig->ambiguous_res != NULL) {
2920         memcpy(copy->ambiguous_res, orig->ambiguous_res, orig->ambig_size);
2921     }
2922     copy->ambig_size = orig->ambig_size;
2923     copy->ambig_occupy = orig->ambig_occupy;
2924     copy->round_down = orig->round_down;
2925 
2926     return copy;
2927 }
2928 
2929 /**
2930  *  Recompute alignments for each match found by the gapped BLAST
2931  *  algorithm.  Single-thread adapter to Blast_RedoAlignmentCore_MT.
2932  */
2933 Int2
Blast_RedoAlignmentCore(EBlastProgramType program_number,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2934 Blast_RedoAlignmentCore(EBlastProgramType program_number,
2935                         BLAST_SequenceBlk * queryBlk,
2936                         const BlastQueryInfo* queryInfo,
2937                         BlastScoreBlk* sbp,
2938                         BLAST_SequenceBlk * subjectBlk,
2939                         const BlastSeqSrc* seqSrc,
2940                         Int4 default_db_genetic_code,
2941                         BlastHSPList * thisMatch,
2942                         BlastHSPStream* hsp_stream,
2943                         BlastScoringParameters* scoringParams,
2944                         const BlastExtensionParameters* extendParams,
2945                         const BlastHitSavingParameters* hitParams,
2946                         const PSIBlastOptions* psiOptions,
2947                         BlastHSPResults* results)
2948 {
2949     return Blast_RedoAlignmentCore_MT(
2950             program_number,
2951             1,                  /* number of threads */
2952             queryBlk,
2953             queryInfo,
2954             sbp,
2955             subjectBlk,
2956             seqSrc,
2957             default_db_genetic_code,
2958             thisMatch,
2959             hsp_stream,
2960             scoringParams,
2961             extendParams,
2962             hitParams,
2963             psiOptions,
2964             results
2965     );
2966 }
2967 
2968 /**
2969  *  Recompute alignments for each match found by the gapped BLAST
2970  *  algorithm.
2971  */
2972 Int2
Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,Uint4 num_threads,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2973 Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,
2974                            Uint4 num_threads,
2975                            BLAST_SequenceBlk * queryBlk,
2976                            const BlastQueryInfo* queryInfo,
2977                            BlastScoreBlk* sbp,
2978                            BLAST_SequenceBlk * subjectBlk,
2979                            const BlastSeqSrc* seqSrc,
2980                            Int4 default_db_genetic_code,
2981                            BlastHSPList * thisMatch,
2982                            BlastHSPStream* hsp_stream,
2983                            BlastScoringParameters* scoringParams,
2984                            const BlastExtensionParameters* extendParams,
2985                            const BlastHitSavingParameters* hitParams,
2986                            const PSIBlastOptions* psiOptions,
2987                            BlastHSPResults* results)
2988 {
2989     int status_code = 0;                    /* return value code */
2990     /* the factor by which to scale the scoring system in order to
2991      * obtain greater precision */
2992     double localScalingFactor;
2993     /* forbidden ranges for each database position (used in
2994      * Smith-Waterman alignments) */
2995     Blast_ForbiddenRanges forbidden = {0,};
2996     /* a collection of alignments for each query sequence with
2997      * sequences from the database */
2998     BlastCompo_Heap* redoneMatches = NULL;
2999     /* stores all fields needed for computing a compositionally
3000      * adjusted score matrix using Newton's method */
3001     Blast_CompositionWorkspace** NRrecord_tld = NULL;
3002     /* loop index */
3003     int query_index;
3004     /* number of queries in the concatenated query */
3005     int numQueries = queryInfo->num_queries;
3006     /* number of contexts in the concatenated query */
3007     int numContexts = queryInfo->last_context + 1;
3008     /* number of contexts within a query */
3009     int numFrames = (program_number == eBlastTypeBlastx) ? 6:1;
3010     /* keeps track of gapped alignment params */
3011     BlastGapAlignStruct* gapAlign = NULL;
3012     /* the values of the search parameters that will be recorded, altered
3013      * in the search structure in this routine, and then restored before
3014      * the routine exits. */
3015     BlastKappa_SavedParameters *savedParams = NULL;
3016     /* All alignments above this value will be reported, no matter how many. */
3017     double inclusion_ethresh;
3018 
3019     BlastHSPResults* local_results = NULL;
3020 
3021     BlastCompo_QueryInfo** query_info_tld = NULL;
3022     int* numContexts_tld = NULL;
3023     int* compositionTestIndex_tld = NULL;
3024     Blast_RedoAlignParams** redo_align_params_tld = NULL;
3025     BLAST_SequenceBlk** subjectBlk_tld = NULL;
3026     Boolean positionBased = (Boolean) (sbp->psi_matrix != NULL);
3027     ECompoAdjustModes compo_adjust_mode =
3028         (ECompoAdjustModes) extendParams->options->compositionBasedStats;
3029     Boolean smithWaterman =
3030         (Boolean) (extendParams->options->eTbackExt == eSmithWatermanTbck);
3031 
3032     /* which test function do we use to see if a composition-adjusted
3033        p-value is desired; value needs to be passed in eventually*/
3034     int compositionTestIndex = extendParams->options->unifiedP;
3035     Uint1* genetic_code_string = GenCodeSingletonFind(default_db_genetic_code);
3036 
3037     ASSERT(program_number == eBlastTypeBlastp   ||
3038            program_number == eBlastTypeTblastn  ||
3039            program_number == eBlastTypeBlastx   ||
3040            program_number == eBlastTypePsiBlast ||
3041            program_number == eBlastTypeRpsBlast ||
3042            program_number == eBlastTypeRpsTblastn);
3043 
3044     if (0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") &&
3045         compo_adjust_mode == eNoCompositionBasedStats) {
3046         return -1;                   /* BLOSUM62_20 only makes sense if
3047                                       * compo_adjust_mode is on */
3048     }
3049     if (positionBased) {
3050         /* Position based searches can only use traditional
3051          * composition based stats */
3052         if ((int) compo_adjust_mode > 1) {
3053             compo_adjust_mode = eCompositionBasedStats;
3054         }
3055         /* A position-based search can only have one query */
3056         ASSERT(queryInfo->num_queries == 1);
3057         ASSERT(queryBlk->length == (Int4)sbp->psi_matrix->pssm->ncols);
3058     }
3059 
3060     if ((int) compo_adjust_mode > 1 &&
3061         !Blast_FrequencyDataIsAvailable(scoringParams->options->matrix)) {
3062         return -1;   /* Unsupported matrix */
3063     }
3064     /*****************/
3065     inclusion_ethresh = (psiOptions /* this can be NULL for CBl2Seq */
3066                          ? psiOptions->inclusion_ethresh
3067                          : PSI_INCLUSION_ETHRESH);
3068     ASSERT(inclusion_ethresh != 0.0);
3069 
3070     int actual_num_threads = 1;
3071 #ifdef _OPENMP
3072     actual_num_threads = num_threads;
3073 #endif
3074 
3075     /* Initialize savedParams */
3076     savedParams =
3077         s_SavedParametersNew(queryInfo->max_length, numContexts,
3078                              compo_adjust_mode, positionBased);
3079     if (savedParams == NULL) {
3080         status_code = -1;
3081         goto function_cleanup;
3082     }
3083     status_code =
3084         s_RecordInitialSearch(savedParams, sbp, scoringParams,
3085                               queryInfo->max_length, compo_adjust_mode,
3086                               positionBased);
3087     if (status_code != 0) {
3088         goto function_cleanup;
3089     }
3090 
3091     if (compo_adjust_mode != eNoCompositionBasedStats) {
3092         if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) {
3093             localScalingFactor = SCALING_FACTOR / 10;
3094         } else {
3095             localScalingFactor = SCALING_FACTOR;
3096         }
3097     } else {
3098         localScalingFactor = 1.0;
3099     }
3100     s_RescaleSearch(sbp, scoringParams, numContexts, localScalingFactor);
3101     status_code =
3102         BLAST_GapAlignStructNew(scoringParams, extendParams,
3103                                 (seqSrc) ?  BlastSeqSrcGetMaxSeqLen(seqSrc)
3104                                          :  subjectBlk->length,
3105                                 sbp, &gapAlign);
3106     if (status_code != 0) {
3107         return (Int2) status_code;
3108     }
3109 
3110     if(smithWaterman) {
3111         status_code =
3112             Blast_ForbiddenRangesInitialize(&forbidden, queryInfo->max_length);
3113         if (status_code != 0) {
3114             goto function_cleanup;
3115         }
3116     }
3117     redoneMatches = calloc(numQueries, sizeof(BlastCompo_Heap));
3118     if (redoneMatches == NULL) {
3119         status_code = -1;
3120         goto function_cleanup;
3121     }
3122     for (query_index = 0;  query_index < numQueries;  query_index++) {
3123         status_code =
3124             BlastCompo_HeapInitialize(&redoneMatches[query_index],
3125                                       hitParams->options->hitlist_size,
3126                                       inclusion_ethresh);
3127         if (status_code != 0) {
3128             goto function_cleanup;
3129         }
3130     }
3131 
3132     BlastCompo_Heap** redoneMatches_tld =
3133             (BlastCompo_Heap**) calloc(
3134                     actual_num_threads,
3135                     sizeof(BlastCompo_Heap*)
3136             );
3137     BlastCompo_Alignment*** alignments_tld =
3138             (BlastCompo_Alignment***) calloc(
3139                     actual_num_threads,
3140                     sizeof(BlastCompo_Alignment**)
3141             );
3142     BlastCompo_Alignment*** incoming_align_set_tld =
3143             (BlastCompo_Alignment***) calloc(
3144                     actual_num_threads,
3145                     sizeof(BlastCompo_Alignment**)
3146             );
3147     BlastKappa_SavedParameters** savedParams_tld =
3148             (BlastKappa_SavedParameters**) calloc(
3149                     actual_num_threads,
3150                     sizeof(BlastKappa_SavedParameters*)
3151             );
3152     BlastScoreBlk** sbp_tld =
3153             (BlastScoreBlk**) calloc(
3154                     actual_num_threads,
3155                     sizeof(BlastScoreBlk*)
3156             );
3157     BlastKappa_GappingParamsContext* gapping_params_context_tld =
3158             (BlastKappa_GappingParamsContext*) calloc(
3159                     actual_num_threads,
3160                     sizeof(BlastKappa_GappingParamsContext)
3161             );
3162     Int4*** matrix_tld =
3163             (Int4***) calloc(
3164                     actual_num_threads,
3165                     sizeof(Int4**)
3166             );
3167 
3168     NRrecord_tld =
3169             (Blast_CompositionWorkspace**) calloc(
3170                     actual_num_threads,
3171                     sizeof(Blast_CompositionWorkspace*)
3172             );
3173 
3174     subjectBlk_tld =
3175             (BLAST_SequenceBlk**) calloc(
3176                     actual_num_threads,
3177                     sizeof(BLAST_SequenceBlk*)
3178             );
3179     redo_align_params_tld =
3180             (Blast_RedoAlignParams**) calloc(
3181                     actual_num_threads,
3182                     sizeof(Blast_RedoAlignParams*)
3183             );
3184     int* status_code_tld =
3185             (int*) calloc(
3186                     actual_num_threads,
3187                     sizeof(int)
3188             );
3189     BlastSeqSrc** seqsrc_tld =
3190             (BlastSeqSrc**) calloc(
3191                     actual_num_threads,
3192                     sizeof(BlastSeqSrc*)
3193             );
3194     BlastGapAlignStruct** gap_align_tld =
3195             (BlastGapAlignStruct**) calloc(
3196                     actual_num_threads,
3197                     sizeof(BlastGapAlignStruct*)
3198             );
3199     BlastScoringParameters** score_params_tld =
3200             (BlastScoringParameters**) calloc(
3201                     actual_num_threads,
3202                     sizeof(BlastScoringParameters*)
3203             );
3204     BlastHitSavingParameters** hit_params_tld =
3205             (BlastHitSavingParameters**) calloc(
3206                     actual_num_threads,
3207                     sizeof(BlastHitSavingParameters*)
3208             );
3209     BlastHSPResults** results_tld =
3210             (BlastHSPResults**) calloc(
3211                     actual_num_threads,
3212                     sizeof(BlastHSPResults*)
3213             );
3214     query_info_tld =
3215             (BlastCompo_QueryInfo**) calloc(
3216                     actual_num_threads,
3217                     sizeof(BlastCompo_QueryInfo*)
3218             );
3219     numContexts_tld =
3220             (int*) calloc(
3221                     actual_num_threads,
3222                     sizeof(int)
3223             );
3224     compositionTestIndex_tld =
3225             (int*) calloc(
3226                     actual_num_threads,
3227                     sizeof(int)
3228             );
3229 
3230     int i;
3231     for (i = 0; i < actual_num_threads; ++i) {
3232         query_info_tld[i] = s_GetQueryInfo(
3233                 queryBlk->sequence,
3234                 queryInfo,
3235                 (program_number == eBlastTypeBlastx)
3236         );
3237         if (query_info_tld[i] == NULL) {
3238             status_code = -1;
3239             goto function_cleanup;
3240         }
3241 
3242         sbp_tld[i] = s_BlastScoreBlk_Copy(
3243                 program_number,
3244                 sbp,
3245                 sbp->alphabet_code,
3246                 sbp->number_of_contexts
3247         );
3248 
3249         numContexts_tld[i]          = numContexts;
3250         compositionTestIndex_tld[i] = compositionTestIndex;
3251         seqsrc_tld[i]               = BlastSeqSrcCopy(seqSrc);
3252         gap_align_tld[i]            =
3253                 s_BlastGapAlignStruct_Copy(gapAlign, sbp_tld[i]);
3254         score_params_tld[i]         = scoringParams;
3255         hit_params_tld[i]           = (BlastHitSavingParameters*) hitParams;
3256         results_tld[i]              =
3257                 Blast_HSPResultsNew(queryInfo->num_queries);
3258         subjectBlk_tld[i]           = subjectBlk;
3259 
3260         redoneMatches_tld[i] =
3261                 (BlastCompo_Heap*) calloc(numQueries, sizeof(BlastCompo_Heap));
3262         if (redoneMatches_tld[i] == NULL) {
3263             status_code = -1;
3264             goto function_cleanup;
3265         }
3266         for (query_index = 0; query_index < numQueries; query_index++) {
3267             status_code =
3268                 BlastCompo_HeapInitialize(&redoneMatches_tld[i][query_index],
3269                                           hitParams->options->hitlist_size,
3270                                           inclusion_ethresh);
3271             if (status_code != 0) {
3272                 goto function_cleanup;
3273             }
3274         }
3275 
3276         alignments_tld[i] = (BlastCompo_Alignment**) calloc(
3277                 numContexts,
3278                 sizeof(BlastCompo_Alignment*)
3279         );
3280         incoming_align_set_tld[i] = (BlastCompo_Alignment**) calloc(
3281                 numFrames,
3282                 sizeof(BlastCompo_Alignment*)
3283         );
3284 
3285         savedParams_tld[i] = s_SavedParametersNew(
3286                 queryInfo->max_length,
3287                 numContexts,
3288                 compo_adjust_mode,
3289                 positionBased
3290         );
3291         if (savedParams_tld[i] == NULL) {
3292             status_code = -1;
3293             goto function_cleanup;
3294         }
3295         status_code = s_RecordInitialSearch(
3296                 savedParams_tld[i],
3297                 sbp,
3298                 scoringParams,
3299                 queryInfo->max_length,
3300                 compo_adjust_mode,
3301                 positionBased
3302         );
3303         if (status_code != 0) {
3304             goto function_cleanup;
3305         }
3306 
3307         if ((int) compo_adjust_mode > 1 && !positionBased) {
3308             NRrecord_tld[i] = Blast_CompositionWorkspaceNew();
3309             status_code = Blast_CompositionWorkspaceInit(
3310                     NRrecord_tld[i],
3311                     scoringParams->options->matrix
3312             );
3313             if (status_code != 0) {
3314                 goto function_cleanup;
3315             }
3316         }
3317 
3318         gapping_params_context_tld[i].gap_align = gap_align_tld[i];
3319         gapping_params_context_tld[i].scoringParams = score_params_tld[i];
3320         gapping_params_context_tld[i].sbp = sbp_tld[i];
3321         gapping_params_context_tld[i].localScalingFactor = localScalingFactor;
3322         gapping_params_context_tld[i].prog_number = program_number;
3323 
3324         redo_align_params_tld[i] =
3325             s_GetAlignParams(
3326                     &gapping_params_context_tld[i],
3327                     queryBlk,
3328                     queryInfo,
3329                     hitParams,
3330                     extendParams
3331             );
3332         if (redo_align_params_tld[i] == NULL) {
3333             status_code = -1;
3334             goto function_cleanup;
3335         }
3336 
3337         if (positionBased) {
3338             matrix_tld[i] = sbp_tld[i]->psi_matrix->pssm->data;
3339         } else {
3340             matrix_tld[i] = sbp_tld[i]->matrix->data;
3341         }
3342         /**** Validate parameters *************/
3343         if (matrix_tld[i] == NULL) {
3344             goto function_cleanup;
3345         }
3346     }
3347 
3348     /*
3349      * There are two use cases here.
3350      * (1) hsp_stream == NULL, so single match is passed in thisMatch.
3351      *     Also, seqSrc == NULL and subjectBlk are != NULL.
3352      * (2) hsp_stream != NULL, so one or more matches are taken from
3353      *     hsp_stream, and thisMatch is (probably) NULL.
3354      *     Also, seqSrc != NULL, subjectBlk and thisMatch are == NULL.
3355      */
3356     struct BlastHSPListLinkedList {
3357         BlastHSPList* match;
3358         struct BlastHSPListLinkedList* next;
3359     };
3360     typedef struct BlastHSPListLinkedList BlastHSPListLinkedList;
3361 
3362     BlastHSPList** theseMatches = NULL;
3363     int numMatches = 0;
3364     if (hsp_stream == NULL) {
3365         theseMatches = (BlastHSPList**) calloc(1, sizeof(BlastHSPList*));
3366         *theseMatches = thisMatch;
3367         numMatches = 1;
3368     } else {
3369         BlastHSPList* localMatch = NULL;
3370         BlastHSPListLinkedList* head = NULL;
3371         BlastHSPListLinkedList* tail = NULL;
3372         /*
3373          * Collect matches from stream into linked list, counting them
3374          * along the way.
3375          */
3376         while (BlastHSPStreamRead(hsp_stream, &localMatch)
3377                 != kBlastHSPStream_Eof) {
3378             BlastHSPListLinkedList* entry =
3379                     (BlastHSPListLinkedList*) calloc(
3380                             1,
3381                             sizeof(BlastHSPListLinkedList)
3382                     );
3383             entry->match = localMatch;
3384             if (head == NULL) {
3385                 head = entry;
3386             } else {
3387                 tail->next = entry;
3388             }
3389             tail = entry;
3390             ++numMatches;
3391         }
3392         /*
3393          * Convert linked list of matches into array.
3394          */
3395         theseMatches =
3396                 (BlastHSPList**) calloc(numMatches, sizeof(BlastHSPList*));
3397         int i;
3398         for (i = 0; i < numMatches; ++i) {
3399             theseMatches[i] = head->match;
3400             BlastHSPListLinkedList* here = head;
3401             head = head->next;
3402             sfree(here);
3403         }
3404     }
3405 
3406     Boolean interrupt = FALSE;
3407 #pragma omp parallel \
3408     default(none) num_threads(actual_num_threads) \
3409     if(actual_num_threads>1) \
3410     shared(interrupt, seqsrc_tld, score_params_tld, hit_params_tld, \
3411     gap_align_tld, results_tld, \
3412     redoneMatches_tld, \
3413     stderr, \
3414     numQueries, numMatches, theseMatches, \
3415     numFrames, program_number, subjectBlk_tld, positionBased, \
3416     default_db_genetic_code, localScalingFactor, queryInfo, \
3417     sbp, smithWaterman, compositionTestIndex_tld, forbidden, \
3418     NRrecord_tld, actual_num_threads, sbp_tld, \
3419     matrix_tld, query_info_tld, numContexts_tld, \
3420     genetic_code_string, queryBlk, compo_adjust_mode, \
3421     alignments_tld, incoming_align_set_tld, savedParams_tld, \
3422     scoringParams, redo_align_params_tld, \
3423     status_code_tld)
3424     {
3425         int b;
3426 #pragma omp for schedule(static)
3427         for (b = 0; b < numMatches; ++b) {
3428 #pragma omp flush(interrupt)
3429             if (!interrupt) {
3430                 BlastCompo_Alignment** alignments = NULL;
3431                 BlastCompo_Alignment** incoming_align_set = NULL;
3432                 Blast_CompositionWorkspace* NRrecord = NULL;
3433                 BlastCompo_QueryInfo* query_info = NULL;
3434 
3435                 int numAligns[6];
3436                 Blast_KarlinBlk* kbp = NULL;
3437                 BlastCompo_MatchingSequence matchingSeq = {0,};
3438                 BlastHSPList* hsp_list = NULL;
3439                 BlastCompo_Alignment* incoming_aligns = NULL;
3440                 Blast_RedoAlignParams* redo_align_params;
3441                 double best_evalue;
3442                 Int4 best_score;
3443                 int query_index;
3444                 int context_index;
3445                 int frame_index;
3446                 void* discarded_aligns = NULL;
3447                 BlastSeqSrc* seqSrc;
3448                 BlastScoringParameters* scoringParams;
3449                 BlastHitSavingParameters* hitParams;
3450                 BlastCompo_Heap* redoneMatches;
3451                 BlastScoreBlk* sbp;
3452                 BLAST_SequenceBlk* subjectBlk;
3453                 int numContexts;
3454                 int compositionTestIndex;
3455                 /* existing alignments for a match */
3456                 Int4** matrix;                   /* score matrix */
3457                 int* pStatusCode;
3458 
3459                 double pvalueForThisPair = (-1); /* p-value for this match
3460                                                     for composition; -1 == no adjustment*/
3461                 double LambdaRatio; /*lambda ratio*/
3462 
3463                 int tid = 0;
3464 #ifdef _OPENMP
3465                 tid = omp_get_thread_num();
3466 #endif
3467                 seqSrc               = seqsrc_tld[tid];
3468                 scoringParams        = score_params_tld[tid];
3469                 hitParams            = hit_params_tld[tid];
3470                 redoneMatches        = redoneMatches_tld[tid];
3471                 alignments           = alignments_tld[tid];
3472                 incoming_align_set   = incoming_align_set_tld[tid];
3473                 NRrecord             = NRrecord_tld[tid];
3474                 sbp                  = sbp_tld[tid];
3475                 redo_align_params    = redo_align_params_tld[tid];
3476                 matrix               = matrix_tld[tid];
3477                 pStatusCode          = &status_code_tld[tid];
3478                 query_info           = query_info_tld[tid];
3479                 numContexts          = numContexts_tld[tid];
3480                 compositionTestIndex = compositionTestIndex_tld[tid];
3481                 subjectBlk           = subjectBlk_tld[tid];
3482 
3483                 BlastHSPList* localMatch = theseMatches[b];
3484 
3485                 if (localMatch->hsp_array == NULL) {
3486                     if (seqSrc) {
3487                         continue;
3488                     }
3489 #pragma omp critical(intrpt)
3490                     interrupt = TRUE;
3491 #pragma omp flush(interrupt)
3492                     continue;
3493                 }
3494 
3495                 if (BlastCompo_EarlyTermination(
3496                         localMatch->best_evalue,
3497                         redoneMatches,
3498                         numQueries
3499                 )) {
3500                     Blast_HSPListFree(localMatch);
3501                     if (seqSrc) {
3502                         continue;
3503                     }
3504 #pragma omp critical(intrpt)
3505                     interrupt = TRUE;
3506 #pragma omp flush(interrupt)
3507                     continue;
3508                 }
3509 
3510                 query_index = localMatch->query_index;
3511                 context_index = query_index * numFrames;
3512                 /* Get the sequence for this match */
3513                 if (seqSrc && BlastSeqSrcGetSupportsPartialFetching(seqSrc)) {
3514                     BLAST_SetupPartialFetching(
3515                             program_number,
3516                             (BlastSeqSrc*) seqSrc,
3517                             (const BlastHSPList**)&localMatch,
3518                             1
3519                     );
3520                 }
3521 
3522                 if (subjectBlk) {
3523                     matchingSeq.length = subjectBlk->length;
3524                     matchingSeq.index = -1;
3525                     matchingSeq.local_data = subjectBlk;
3526                 } else {
3527                     *pStatusCode = s_MatchingSequenceInitialize(
3528                             &matchingSeq,
3529                             program_number,
3530                             seqSrc,
3531                             default_db_genetic_code,
3532                             localMatch->oid
3533                     );
3534                     if (*pStatusCode != 0) {
3535                         /*
3536                          * some sequences may have been excluded by membit filtering
3537                          * so this is not really an exception
3538                          */
3539                         *pStatusCode = 0;
3540                         goto match_loop_cleanup;
3541                     }
3542                 }
3543                 *pStatusCode = s_ResultHspToDistinctAlign(
3544                         incoming_align_set,     /* o */
3545                         numAligns,              /* o */
3546                         localMatch->hsp_array,  /* i */
3547                         localMatch->hspcnt,     /* i */
3548                         context_index,          /* i */
3549                         queryInfo,              /* i */
3550                         localScalingFactor      /* i */
3551                 );
3552                 if (*pStatusCode != 0) {
3553                     goto match_loop_cleanup;
3554                 }
3555 
3556                 hsp_list = Blast_HSPListNew(0);
3557                 for (frame_index = 0;
3558                         frame_index < numFrames;
3559                         frame_index++, context_index++) {
3560                     incoming_aligns = incoming_align_set[frame_index];
3561                     if (!incoming_aligns) {
3562                         continue;
3563                     }
3564                     /*
3565                      * All alignments in thisMatch should be to the same query
3566                      */
3567                     kbp = sbp->kbp_gap[context_index];
3568                     if (smithWaterman) {
3569                         *pStatusCode =
3570                                 Blast_RedoOneMatchSmithWaterman(
3571                                         alignments,
3572                                         redo_align_params,
3573                                         incoming_aligns,
3574                                         numAligns[frame_index],
3575                                         kbp->Lambda,
3576                                         kbp->logK,
3577                                         &matchingSeq,
3578                                         query_info,
3579                                         numQueries,
3580                                         matrix,
3581                                         BLASTAA_SIZE,
3582                                         NRrecord,
3583                                         &forbidden,
3584                                         redoneMatches,
3585                                         &pvalueForThisPair,
3586                                         compositionTestIndex,
3587                                         &LambdaRatio
3588                                 );
3589                     } else {
3590                         *pStatusCode =
3591                                 Blast_RedoOneMatch(
3592                                         alignments,             // thread-local
3593                                         redo_align_params,      // thread-local
3594                                         incoming_aligns,        // thread-local
3595                                         numAligns[frame_index], // local
3596                                         kbp->Lambda,            // thread-local
3597                                         &matchingSeq,           // thread-local
3598                                         -1,                     // const
3599                                         query_info,             // thread-local
3600                                         numContexts,            // thread-local
3601                                         matrix,                 // thread-local
3602                                         BLASTAA_SIZE,           // const
3603                                         NRrecord,               // thread-local
3604                                         &pvalueForThisPair,     // local
3605                                         compositionTestIndex,   // thread-local
3606                                         &LambdaRatio            // local
3607                                 );
3608                     }
3609 
3610                     if (*pStatusCode != 0) {
3611                         goto match_loop_cleanup;
3612                     }
3613 
3614                     if (alignments[context_index] != NULL) {
3615                         Int2 qframe = frame_index;
3616                         if (program_number == eBlastTypeBlastx) {
3617                             if (qframe < 3) {
3618                                 qframe++;
3619                             } else {
3620                                 qframe = 2 - qframe;
3621                             }
3622                         }
3623                         *pStatusCode =
3624                                 s_HSPListFromDistinctAlignments(hsp_list,
3625                                         &alignments[context_index],
3626                                         matchingSeq.index,
3627                                         queryInfo, qframe);
3628                         if (*pStatusCode) {
3629                             goto match_loop_cleanup;
3630                         }
3631                     }
3632                     BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3633                     incoming_align_set[frame_index] = NULL;
3634                 }
3635 
3636                 if (hsp_list->hspcnt > 1) {
3637                     s_HitlistReapContained(hsp_list->hsp_array,
3638                             &hsp_list->hspcnt);
3639                 }
3640                 *pStatusCode =
3641                         s_HitlistEvaluateAndPurge(&best_score, &best_evalue,
3642                                 hsp_list,
3643                                 seqSrc,
3644                                 matchingSeq.length,
3645                                 program_number,
3646                                 queryInfo, context_index,
3647                                 sbp, hitParams,
3648                                 pvalueForThisPair, LambdaRatio,
3649                                 matchingSeq.index);
3650                 if (*pStatusCode != 0) {
3651                     goto query_loop_cleanup;
3652                 }
3653                 if (best_evalue <= hitParams->options->expect_value) {
3654                     /* The best alignment is significant */
3655                     s_HSPListNormalizeScores(hsp_list, kbp->Lambda, kbp->logK,
3656                             localScalingFactor);
3657                     s_ComputeNumIdentities(
3658                             queryBlk,
3659                             queryInfo,
3660                             subjectBlk,
3661                             seqSrc,
3662                             hsp_list,
3663                             scoringParams->options,
3664                             genetic_code_string,
3665                             sbp
3666                     );
3667                     if (!seqSrc) {
3668                         goto query_loop_cleanup;
3669                     }
3670                     if (BlastCompo_HeapWouldInsert(
3671                             &redoneMatches[query_index],
3672                             best_evalue,
3673                             best_score,
3674                             localMatch->oid
3675                     )) {
3676                         *pStatusCode =
3677                                 BlastCompo_HeapInsert(
3678                                         &redoneMatches[query_index],
3679                                         hsp_list,
3680                                         best_evalue,
3681                                         best_score,
3682                                         localMatch->oid,
3683                                         &discarded_aligns
3684                                 );
3685                         if (*pStatusCode == 0) {
3686                             hsp_list = NULL;
3687                         }
3688                     } else {
3689                         hsp_list = Blast_HSPListFree(hsp_list);
3690                     }
3691 
3692                     if (*pStatusCode) {
3693                         goto query_loop_cleanup;
3694                     }
3695                     if (discarded_aligns != NULL) {
3696                         Blast_HSPListFree(discarded_aligns);
3697                     }
3698                 }
3699 query_loop_cleanup:
3700 match_loop_cleanup:
3701                 if (seqSrc) {
3702                     localMatch = Blast_HSPListFree(localMatch);
3703                 } else {
3704                     Blast_HSPListSwap(localMatch, hsp_list);
3705                     localMatch->oid = hsp_list->oid;
3706                 }
3707                 hsp_list = Blast_HSPListFree(hsp_list);
3708 
3709                 if (*pStatusCode != 0) {
3710                     for (context_index = 0;
3711                             context_index < numContexts;
3712                             context_index++) {
3713                         BlastCompo_AlignmentsFree(
3714                                 &alignments[context_index],
3715                                 s_FreeEditScript
3716                         );
3717                     }
3718                 }
3719                 s_MatchingSequenceRelease(&matchingSeq);
3720                 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3721                 if (*pStatusCode != 0 || !seqSrc) {
3722 #pragma omp critical(intrpt)
3723                     interrupt = TRUE;
3724 #pragma omp flush(interrupt)
3725                     continue;
3726                 }
3727 
3728             } /* end of if(!interrupt) */
3729         }
3730 #pragma omp barrier
3731 
3732         /*
3733          * end of omp parallel section
3734          */
3735     }
3736 
3737 function_cleanup:
3738 
3739     for (i = 0; i < actual_num_threads; ++i) {
3740         if (status_code_tld[i] != 0) {
3741             status_code = status_code_tld[i];
3742         }
3743     }
3744     for (i = 0; i < actual_num_threads; ++i) {
3745         if (seqSrc  &&  status_code == 0) {
3746             s_FillResultsFromCompoHeaps(
3747                     results_tld[i],
3748                     redoneMatches_tld[i],
3749                     hitParams->options->hitlist_size
3750             );
3751             if (redoneMatches_tld[i] != NULL) {
3752                 int qi;
3753                 for (qi = 0; qi < numQueries; ++qi) {
3754                     sfree(redoneMatches_tld[i][qi].array);
3755                     sfree(redoneMatches_tld[i][qi].heapArray);
3756                 }
3757                 s_ClearHeap(redoneMatches_tld[i]);
3758             }
3759         } else {
3760             if (redoneMatches_tld[i] != NULL) {
3761                 int qi;
3762                 for (qi = 0; qi < numQueries; ++qi) {
3763                     sfree(redoneMatches_tld[i][qi].array);
3764                     sfree(redoneMatches_tld[i][qi].heapArray);
3765                 }
3766                 s_ClearHeap(redoneMatches_tld[i]);
3767             }
3768         }
3769         sfree(redoneMatches_tld[i]);
3770     }
3771     if (redoneMatches != NULL) {
3772         int qi;
3773         for (qi = 0; qi < numQueries; ++qi) {
3774             sfree(redoneMatches[qi].array);
3775             sfree(redoneMatches[qi].heapArray);
3776         }
3777         s_ClearHeap(redoneMatches);
3778     }
3779 
3780     if (hsp_stream != NULL) {
3781         /* Reduce results from all threads and continue with business as usual */
3782         SThreadLocalDataArray* thread_data =
3783                 SThreadLocalDataArrayNew(actual_num_threads);
3784         int i;
3785         for (i = 0; i < actual_num_threads; ++i) {
3786             SThreadLocalData* tdi = thread_data->tld[i];
3787             BlastHSPResults* rdi = results_tld[i];
3788             tdi->hit_params = hit_params_tld[i];
3789             hit_params_tld[i] = NULL;
3790             tdi->results =
3791                     (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3792             tdi->results->num_queries = rdi->num_queries;
3793             tdi->results->hitlist_array =
3794                     (BlastHitList**) calloc(
3795                             tdi->results->num_queries,
3796                             sizeof(BlastHitList*)
3797                     );
3798             int j;
3799             for (j = 0; j < tdi->results->num_queries; ++j) {
3800                 tdi->results->hitlist_array[j] = rdi->hitlist_array[j];
3801                 rdi->hitlist_array[j] = NULL;
3802             }
3803         }
3804         local_results = SThreadLocalDataArrayConsolidateResults(thread_data);
3805         ASSERT(local_results);
3806 
3807         /* post-traceback pipes */
3808         BlastHSPStreamTBackClose(hsp_stream, local_results);
3809 
3810         for (i = 0; i < local_results->num_queries; ++i) {
3811             results->hitlist_array[i] = local_results->hitlist_array[i];
3812             local_results->hitlist_array[i] = NULL;
3813         }
3814         for (i = 0; i < actual_num_threads; ++i) {
3815             thread_data->tld[i]->hit_params = NULL;
3816             int j;
3817             for (j = 0; j < local_results->num_queries; ++j) {
3818                 thread_data->tld[i]->results->hitlist_array[j] =
3819                         Blast_HitListFree(
3820                                 thread_data->tld[i]->results->hitlist_array[j]
3821                         );
3822             }
3823             sfree(thread_data->tld[i]->results->hitlist_array);
3824             sfree(thread_data->tld[i]->results);
3825             thread_data->tld[i] = SThreadLocalDataFree(thread_data->tld[i]);
3826             results_tld[i] = Blast_HSPResultsFree(results_tld[i]);
3827             s_FreeBlastCompo_QueryInfoArray(&query_info_tld[i], numContexts);
3828         }
3829         sfree(thread_data->tld);
3830         sfree(thread_data);
3831         Blast_HSPResultsFree(local_results);
3832     }
3833 
3834     if (redoneMatches != NULL) {
3835         for (query_index = 0;  query_index < numQueries;  query_index++) {
3836             BlastCompo_HeapRelease(&redoneMatches[query_index]);
3837         }
3838         sfree(redoneMatches);
3839         redoneMatches = NULL;
3840     }
3841     if (smithWaterman) {
3842         Blast_ForbiddenRangesRelease(&forbidden);
3843     }
3844     if (gapAlign != NULL) {
3845         gapAlign = BLAST_GapAlignStructFree(gapAlign);
3846     }
3847     s_RestoreSearch(sbp, scoringParams, savedParams, queryBlk->length,
3848                     positionBased, compo_adjust_mode);
3849     s_SavedParametersFree(&savedParams);
3850 
3851     for (i = 0; i < actual_num_threads; ++i) {
3852         s_BlastScoreBlk_Free(&sbp_tld[i]);
3853         gap_align_tld[i]->sbp = NULL;
3854         s_BlastGapAlignStruct_Free(gap_align_tld[i]);
3855         Blast_RedoAlignParamsFree(&redo_align_params_tld[i]);
3856         sfree(alignments_tld[i]);
3857         sfree(incoming_align_set_tld[i]);
3858         Blast_CompositionWorkspaceFree(&NRrecord_tld[i]);
3859         s_SavedParametersFree(&savedParams_tld[i]);
3860         BlastSeqSrcFree(seqsrc_tld[i]);
3861     }
3862     sfree(alignments_tld);
3863     sfree(compositionTestIndex_tld);
3864     sfree(gap_align_tld);
3865     sfree(gapping_params_context_tld);
3866     sfree(hit_params_tld);
3867     sfree(incoming_align_set_tld);
3868     sfree(matrix_tld);
3869     sfree(NRrecord_tld);
3870     sfree(numContexts_tld);
3871     sfree(query_info_tld);
3872     sfree(redo_align_params_tld);
3873     sfree(redoneMatches_tld);
3874     sfree(results_tld);
3875     sfree(savedParams_tld);
3876     sfree(sbp_tld);
3877     sfree(score_params_tld);
3878     sfree(seqsrc_tld);
3879     sfree(status_code_tld);
3880     sfree(subjectBlk_tld);
3881     sfree(theseMatches);
3882 
3883     return (Int2) status_code;
3884 }
3885 
3886 
3887 
3888