1 /* $Id: blast_kappa.c 615057 2020-08-26 15:29:10Z fongah2 $
2  * ==========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alejandro Schaffer, Mike Gertz (ported to algo/blast by Tom Madden)
27  *
28  */
29 
30 /** @file blast_kappa.c
31  * Utilities for doing Smith-Waterman alignments and adjusting the scoring
32  * system for each match in blastpgp
33  */
34 
35 #include <float.h>
36 #include <algo/blast/core/ncbi_math.h>
37 #include <algo/blast/core/blast_hits.h>
38 #include <algo/blast/core/blast_kappa.h>
39 #include <algo/blast/core/blast_util.h>
40 #include <algo/blast/core/blast_gapalign.h>
41 #include <algo/blast/core/blast_filter.h>
42 #include <algo/blast/core/blast_traceback.h>
43 #include <algo/blast/core/link_hsps.h>
44 #include <algo/blast/core/gencode_singleton.h>
45 #include "blast_psi_priv.h"
46 #include "blast_gapalign_priv.h"
47 #include "blast_hits_priv.h"
48 #include "blast_posit.h"
49 #include "blast_hspstream_mt_utils.h"
50 #include "blast_traceback_mt_priv.h"
51 
52 #ifdef _OPENMP
53 #include <omp.h>
54 
55 #  ifdef _WIN32
56 /* stderr expands to (__acrt_iob_func(2)), which won't work in an OpenMP
57  * shared(...) list. */
58 #    define STDERR_COMMA
59 #  else
60 #    define STDERR_COMMA stderr,
61 #  endif
62 #endif
63 
64 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
65 #include <algo/blast/composition_adjustment/compo_heap.h>
66 #include <algo/blast/composition_adjustment/redo_alignment.h>
67 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
68 #include <algo/blast/composition_adjustment/unified_pvalues.h>
69 
70 /* Define KAPPA_PRINT_DIAGNOSTICS to turn on printing of
71  * diagnostic information from some routines. */
72 
73 /** Compile-time option; if set to a true value, then blastp runs
74     that use Blast_RedoAlignmentCore to compute the traceback will not
75     SEG the subject sequence */
76 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
77 #define KAPPA_BLASTP_NO_SEG_SEQUENCE 0
78 #endif
79 
80 
81 /** Compile-time option; if set to a true value, then blastp runs
82     that use Blast_RedoAlignmentCore to compute the traceback will not
83     SEG the subject sequence */
84 #ifndef KAPPA_TBLASTN_NO_SEG_SEQUENCE
85 #define KAPPA_TBLASTN_NO_SEG_SEQUENCE 0
86 #endif
87 
88 
89 /**
90  * Given a list of HSPs with (possibly) high-precision scores, rescale
91  * the scores to have standard precision and set the scale-independent
92  * bit scores.  This routine does *not* resort the list; it is assumed
93  * that the list is already sorted according to e-values that have been
94  * computed using the initial, higher-precision scores.
95  *
96  * @param hsp_list          the HSP list
97  * @param logK              Karlin-Altschul statistical parameter [in]
98  * @param lambda            Karlin-Altschul statistical parameter [in]
99  * @param scoreDivisor      the value by which reported scores are to be
100  */
101 static void
s_HSPListNormalizeScores(BlastHSPList * hsp_list,double lambda,double logK,double scoreDivisor)102 s_HSPListNormalizeScores(BlastHSPList * hsp_list,
103                          double lambda,
104                          double logK,
105                          double scoreDivisor)
106 {
107     int hsp_index;
108     for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
109         BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
110 
111         hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor);
112         /* Compute the bit score using the newly computed scaled score. */
113         hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
114     }
115 }
116 
117 
118 /**
119  * Adjusts the E-values in a BLAST_HitList to be composites of
120  * a composition-based P-value and a score/alignment-based P-value
121  *
122  * @param hsp_list           the hitlist whose E-values need to be adjusted
123  * @param comp_p_value      P-value from sequence composition
124  * @param seqSrc            a source of sequence data
125  * @param subject_length    length of database sequence
126  * @param query_context     info about this query context; needed when
127  *                          multiple queries are being used
128  * @param LambdaRatio       the ratio between the observed value of Lambda
129  *                          and the predicted value of lambda (used to print
130  *                          diagnostics)
131  * @param subject_id        the subject id of this sequence (used to print
132  *                          diagnostics)
133  **/
134 static void
s_AdjustEvaluesForComposition(BlastHSPList * hsp_list,double comp_p_value,const BlastSeqSrc * seqSrc,Int4 subject_length,const BlastContextInfo * query_context,double LambdaRatio,int subject_id)135 s_AdjustEvaluesForComposition(
136     BlastHSPList *hsp_list,
137     double comp_p_value,
138     const BlastSeqSrc* seqSrc,
139     Int4 subject_length,
140     const BlastContextInfo * query_context,
141     double LambdaRatio,
142     int subject_id)
143 {
144     /* Smallest observed evalue after adjustment */
145     double best_evalue = DBL_MAX;
146 
147     /* True length of the query */
148     int query_length =      query_context->query_length;
149     /* Length adjustment to compensate for edge effects */
150     int length_adjustment = query_context->length_adjustment;
151 
152     /* Effective lengths of the query, subject, and database */
153     double query_eff   = MAX((query_length - length_adjustment), 1);
154     double subject_eff = MAX((subject_length - length_adjustment), 1.0);
155     double dblen_eff = (double) query_context->eff_searchsp / query_eff;
156 
157     /* Scale factor to convert the database E-value to the sequence E-value */
158     double db_to_sequence_scale = subject_eff / dblen_eff;
159 
160     int        hsp_index;
161     for (hsp_index = 0;  hsp_index < hsp_list->hspcnt;  hsp_index++) {
162         /* for all HSPs */
163         double align_p_value;     /* P-value for the alignment score */
164         double combined_p_value;  /* combination of two P-values */
165 
166         /* HSP for this iteration */
167         BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
168 #ifdef KAPPA_PRINT_DIAGNOSTICS
169         /* Original E-value, saved if diagnostics are printed. */
170         double old_e_value = hsp->evalue;
171 #endif
172         hsp->evalue *= db_to_sequence_scale;
173 
174         align_p_value = BLAST_KarlinEtoP(hsp->evalue);
175         combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
176         hsp->evalue = BLAST_KarlinPtoE(combined_p_value);
177         hsp->evalue /= db_to_sequence_scale;
178 
179         if (hsp->evalue < best_evalue) {
180             best_evalue = hsp->evalue;
181         }
182 
183 #ifdef KAPPA_PRINT_DIAGNOSTICS
184         if (seqSrc){
185             int    sequence_gi; /*GI of a sequence*/
186             Blast_GiList* gi_list; /*list of GI's for a sequence*/
187             gi_list = BlastSeqSrcGetGis(seqSrc, (void *) (&subject_id));
188             if ((gi_list) && (gi_list->num_used > 0)) {
189                 sequence_gi = gi_list->data[0];
190             } else {
191                 sequence_gi = (-1);
192             }
193             printf("GI %d Lambda ratio %e comp. p-value %e; "
194                    "adjust E-value of query length %d match length "
195                    "%d from %e to %e\n",
196                    sequence_gi, LambdaRatio, comp_p_value,
197                    query_length, subject_length, old_e_value, hsp->evalue);
198             Blast_GiListFree(gi_list);
199         }
200 #endif
201     } /* end for all HSPs */
202 
203     hsp_list->best_evalue = best_evalue;
204 
205     /* suppress unused parameter warnings if diagnostics are not printed */
206     (void) seqSrc;
207     (void) query_length;
208     (void) LambdaRatio;
209     (void) subject_id;
210 }
211 
212 
213 /**
214  * Remove from a hitlist all HSPs that are completely contained in an
215  * HSP that occurs earlier in the list and that:
216  * - is on the same strand; and
217  * - has equal or greater score.  T
218  * The hitlist should be sorted by some measure of significance before
219  * this routine is called.
220  * @param hsp_array         array to be reaped
221  * @param hspcnt            length of hsp_array
222  */
223 static void
s_HitlistReapContained(BlastHSP * hsp_array[],Int4 * hspcnt)224 s_HitlistReapContained(BlastHSP * hsp_array[], Int4 * hspcnt)
225 {
226     Int4 iread;       /* iteration index used to read the hitlist */
227     Int4 iwrite;      /* iteration index used to write to the hitlist */
228     Int4 old_hspcnt;  /* number of HSPs in the hitlist on entry */
229 
230     old_hspcnt = *hspcnt;
231 
232     for (iread = 1;  iread < *hspcnt;  iread++) {
233         /* for all HSPs in the hitlist */
234         Int4      ireadBack;  /* iterator over indices less than iread */
235         BlastHSP *hsp1;       /* an HSP that is a candidate for deletion */
236 
237         hsp1 = hsp_array[iread];
238         for (ireadBack = 0;  ireadBack < iread && hsp1 != NULL;  ireadBack++) {
239             /* for all HSPs before hsp1 in the hitlist and while hsp1
240              * has not been deleted */
241             BlastHSP *hsp2;    /* an HSP that occurs earlier in hsp_array
242                                 * than hsp1 */
243             hsp2 = hsp_array[ireadBack];
244 
245             if( hsp2 == NULL ) {  /* hsp2 was deleted in a prior iteration. */
246                 continue;
247             }
248             if (hsp2->query.frame == hsp1->query.frame &&
249                 hsp2->subject.frame == hsp1->subject.frame) {
250                 /* hsp1 and hsp2 are in the same query/subject frame. */
251                 if (CONTAINED_IN_HSP
252                     (hsp2->query.offset, hsp2->query.end, hsp1->query.offset,
253                      hsp2->subject.offset, hsp2->subject.end,
254                      hsp1->subject.offset) &&
255                     CONTAINED_IN_HSP
256                     (hsp2->query.offset, hsp2->query.end, hsp1->query.end,
257                      hsp2->subject.offset, hsp2->subject.end,
258                      hsp1->subject.end)    &&
259                     hsp1->score <= hsp2->score) {
260                     hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]);
261                 }
262             } /* end if hsp1 and hsp2 are in the same query/subject frame */
263         } /* end for all HSPs before hsp1 in the hitlist */
264     } /* end for all HSPs in the hitlist */
265 
266     /* Condense the hsp_array, removing any NULL items. */
267     iwrite = 0;
268     for (iread = 0;  iread < *hspcnt;  iread++) {
269         if (hsp_array[iread] != NULL) {
270             hsp_array[iwrite++] = hsp_array[iread];
271         }
272     }
273     *hspcnt = iwrite;
274     /* Fill the remaining memory in hsp_array with NULL pointers. */
275     for ( ;  iwrite < old_hspcnt;  iwrite++) {
276         hsp_array[iwrite] = NULL;
277     }
278 }
279 
280 
281 /** A callback used to free an EditScript that has been stored in a
282  * BlastCompo_Alignment. */
s_FreeEditScript(void * edit_script)283 static void s_FreeEditScript(void * edit_script)
284 {
285     if (edit_script != NULL)
286         GapEditScriptDelete(edit_script);
287 }
288 
289 
290 /**
291  * Converts a list of objects of type BlastCompo_Alignment to an
292  * new object of type BlastHSPList and returns the result. Conversion
293  * in this direction is lossless.  The list passed to this routine is
294  * freed to ensure that there is no aliasing of fields between the
295  * list of BlastCompo_Alignments and the new hitlist.
296  *
297  * @param hsp_list   The hsp_list to populate
298  * @param alignments A list of distinct alignments; freed before return [in]
299  * @param oid        Ordinal id of a database sequence [in]
300  * @param queryInfo  information about all queries in this search [in]
301  * @param frame     query frame
302  * @return Allocated and filled BlastHSPList structure.
303  */
304 static int
s_HSPListFromDistinctAlignments(BlastHSPList * hsp_list,BlastCompo_Alignment ** alignments,int oid,const BlastQueryInfo * queryInfo,int frame)305 s_HSPListFromDistinctAlignments(BlastHSPList *hsp_list,
306                                 BlastCompo_Alignment ** alignments,
307                                 int oid,
308                                 const BlastQueryInfo* queryInfo,
309                                 int frame)
310 {
311     int status = 0;                    /* return code for any routine called */
312     static const int unknown_value = 0;   /* dummy constant to use when a
313                                              parameter value is not known */
314     BlastCompo_Alignment * align;  /* an alignment in the list */
315 
316     if (hsp_list == NULL) {
317         return -1;
318     }
319     hsp_list->oid = oid;
320 
321     for (align = *alignments;  NULL != align;  align = align->next) {
322         BlastHSP * new_hsp = NULL;
323         GapEditScript * editScript = align->context;
324         align->context = NULL;
325 
326         status = Blast_HSPInit(align->queryStart, align->queryEnd,
327                                align->matchStart, align->matchEnd,
328                                unknown_value, unknown_value,
329                                align->queryIndex,
330                                frame, (Int2) align->frame, align->score,
331                                &editScript, &new_hsp);
332         switch (align->matrix_adjust_rule) {
333         case eDontAdjustMatrix:
334             new_hsp->comp_adjustment_method = eNoCompositionBasedStats;
335             break;
336         case eCompoScaleOldMatrix:
337             new_hsp->comp_adjustment_method = eCompositionBasedStats;
338             break;
339         default:
340             new_hsp->comp_adjustment_method = eCompositionMatrixAdjust;
341             break;
342         }
343         if (status != 0)
344             break;
345         /* At this point, the subject and possibly the query sequence have
346          * been filtered; since it is not clear that num_ident of the
347          * filtered sequences, rather than the original, is desired,
348          * explicitly leave num_ident blank. */
349         new_hsp->num_ident = 0;
350 
351         status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
352         if (status != 0)
353             break;
354     }
355     if (status == 0) {
356         BlastCompo_AlignmentsFree(alignments, s_FreeEditScript);
357         Blast_HSPListSortByScore(hsp_list);
358     } else {
359         hsp_list = Blast_HSPListFree(hsp_list);
360     }
361     return 0;
362 }
363 
s_GetSubjectLength(Int4 total_subj_length,EBlastProgramType program_number)364 Int4 s_GetSubjectLength(Int4 total_subj_length, EBlastProgramType program_number)
365 {
366 	return ((program_number == eBlastTypeRpsTblastn) ?
367 				(GET_NUCL_LENGTH(total_subj_length) - 1 ) /3 : total_subj_length);
368 }
369 
370 
371 /**
372  * Adding evalues to a list of HSPs and remove those that do not have
373  * sufficiently good (low) evalue.
374  *
375  * @param *pbestScore      best (highest) score in the list
376  * @param *pbestEvalue     best (lowest) evalue in the list
377  * @param hsp_list         the list
378  * @param seqSrc            a source of sequence data
379  * @param subject_length   length of the subject sequence
380  * @param program_number   the type of BLAST search being performed
381  * @param queryInfo        information about the queries
382  * @param context_index      the index of the query corresponding to
383  *                         the HSPs in hsp_list
384  * @param sbp              the score block for this search
385  * @param hitParams        parameters used to assign evalues and
386  *                         decide whether to save hits.
387  * @param pvalueForThisPair  composition p-value
388  * @param LambdaRatio        lambda ratio, if available
389  * @param subject_id         index of subject
390  *
391  * @return 0 on success; -1 on failure (can fail because some methods
392  *         of generating evalues use auxiliary structures)
393  */
394 static int
s_HitlistEvaluateAndPurge(int * pbestScore,double * pbestEvalue,BlastHSPList * hsp_list,const BlastSeqSrc * seqSrc,int subject_length,EBlastProgramType program_number,const BlastQueryInfo * queryInfo,int context_index,BlastScoreBlk * sbp,const BlastHitSavingParameters * hitParams,double pvalueForThisPair,double LambdaRatio,int subject_id)395 s_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
396                           BlastHSPList * hsp_list,
397                           const BlastSeqSrc* seqSrc,
398                           int subject_length,
399                           EBlastProgramType program_number,
400                           const BlastQueryInfo* queryInfo,
401                           int context_index,
402                           BlastScoreBlk* sbp,
403                           const BlastHitSavingParameters* hitParams,
404                           double pvalueForThisPair,
405                           double LambdaRatio,
406                           int subject_id)
407 {
408     int status = 0;
409     *pbestEvalue = DBL_MAX;
410     *pbestScore  = 0;
411     if (hitParams->do_sum_stats) {
412         status = BLAST_LinkHsps(program_number, hsp_list, queryInfo,
413                                 subject_length, sbp,
414                                 hitParams->link_hsp_params, TRUE);
415     } else {
416 
417 
418         status =
419             Blast_HSPListGetEvalues(program_number, queryInfo,
420             		                s_GetSubjectLength(subject_length, program_number),
421                                     hsp_list, TRUE, FALSE, sbp,
422                                     0.0, /* use a non-zero gap decay
423                                             only when linking HSPs */
424                                     1.0); /* Use scaling factor equal to
425                                              1, because both scores and
426                                              Lambda are scaled, so they
427                                              will cancel each other. */
428     }
429     if (eBlastTypeBlastp == program_number ||
430         eBlastTypeBlastx == program_number) {
431         if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1)) {
432             s_AdjustEvaluesForComposition(hsp_list, pvalueForThisPair, seqSrc,
433                                           subject_length,
434                                           &queryInfo->contexts[context_index],
435                                           LambdaRatio, subject_id);
436         }
437     }
438     if (status == 0) {
439         Blast_HSPListReapByEvalue(hsp_list, hitParams->options);
440         if (hsp_list->hspcnt > 0) {
441             *pbestEvalue = hsp_list->best_evalue;
442             *pbestScore  = hsp_list->hsp_array[0]->score;
443         }
444     }
445     return status == 0 ? 0 : -1;
446 }
447 
448 /** Compute the number of identities for the HSPs in the hsp_list
449  * @note Should work for blastp and tblastn now.
450  *
451  * @param query_blk the query sequence data [in]
452  * @param query_info structure describing the query_blk structure [in]
453  * @param seq_src source of subject sequence data [in]
454  * @param hsp_list list of HSPs to be processed [in|out]
455  * @param scoring_options scoring options [in]
456  * @gen_code_string Genetic code for tblastn [in]
457  */
458 static void
s_ComputeNumIdentities(const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,BLAST_SequenceBlk * subject_blk,const BlastSeqSrc * seq_src,BlastHSPList * hsp_list,const BlastScoringOptions * scoring_options,const Uint1 * gen_code_string,const BlastScoreBlk * sbp,BlastSeqSrcSetRangesArg * ranges)459 s_ComputeNumIdentities(const BLAST_SequenceBlk* query_blk,
460                        const BlastQueryInfo* query_info,
461                        BLAST_SequenceBlk* subject_blk,
462                        const BlastSeqSrc* seq_src,
463                        BlastHSPList* hsp_list,
464                        const BlastScoringOptions* scoring_options,
465                        const Uint1* gen_code_string,
466                        const BlastScoreBlk* sbp,
467                        BlastSeqSrcSetRangesArg * ranges)
468 {
469     Uint1* query = NULL;
470     Uint1* query_nomask = NULL;
471     Uint1* subject = NULL;
472     const EBlastProgramType program_number = scoring_options->program_number;
473     const Boolean kIsOutOfFrame = scoring_options->is_ooframe;
474     const EBlastEncoding encoding = Blast_TracebackGetEncoding(program_number);
475     BlastSeqSrcGetSeqArg seq_arg;
476     Int2 status = 0;
477     int i;
478     SBlastTargetTranslation* target_t = NULL;
479 
480     if ( !hsp_list) return;
481 
482     /* Initialize the subject */
483     if (seq_src){
484         memset((void*) &seq_arg, 0, sizeof(seq_arg));
485         seq_arg.oid = hsp_list->oid;
486         seq_arg.encoding = encoding;
487         seq_arg.check_oid_exclusion = TRUE;
488         seq_arg.ranges = ranges;
489         status = BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg);
490         ASSERT(status == 0);
491         (void)status; /* to pacify compiler warning */
492 
493         if (program_number == eBlastTypeTblastn) {
494             subject_blk = seq_arg.seq;
495             BlastTargetTranslationNew(
496                     subject_blk,
497                     gen_code_string,
498                     eBlastTypeTblastn,
499                     kIsOutOfFrame,
500                     &target_t
501             );
502         } else {
503             subject = seq_arg.seq->sequence;
504         }
505     } else {
506         subject = subject_blk->sequence;
507     }
508 
509     for (i = 0; i < hsp_list->hspcnt; i++) {
510         BlastHSP* hsp = hsp_list->hsp_array[i];
511 
512         /* Initialize the query */
513         if (program_number == eBlastTypeBlastx && kIsOutOfFrame) {
514             Int4 context = hsp->context - hsp->context % CODON_LENGTH;
515             Int4 context_offset = query_info->contexts[context].query_offset;
516             query = query_blk->oof_sequence + CODON_LENGTH + context_offset;
517             query_nomask = query_blk->oof_sequence + CODON_LENGTH + context_offset;
518         } else {
519             query = query_blk->sequence +
520                 query_info->contexts[hsp->context].query_offset;
521             query_nomask = query_blk->sequence_nomask +
522                 query_info->contexts[hsp->context].query_offset;
523         }
524 
525         /* Translate subject if needed. */
526         if (program_number == eBlastTypeTblastn) {
527             const Uint1* target_sequence = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
528             status = Blast_HSPGetNumIdentitiesAndPositives(query, target_sequence, hsp, scoring_options, 0, sbp);
529         }
530         else
531             status = Blast_HSPGetNumIdentitiesAndPositives(query_nomask, subject, hsp, scoring_options, 0, sbp);
532 
533         ASSERT(status == 0);
534     }
535     target_t = BlastTargetTranslationFree(target_t);
536     if (seq_src) {
537         BlastSeqSrcReleaseSequence(seq_src, (void*) &seq_arg);
538         BlastSequenceBlkFree(seq_arg.seq);
539     }
540 }
541 
542 
543 /**
544  * A callback routine: compute lambda for the given score
545  * probabilities.
546  * (@sa calc_lambda_type).
547  */
548 static double
s_CalcLambda(double probs[],int min_score,int max_score,double lambda0)549 s_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
550 {
551 
552     int i;                 /* loop index */
553     int score_range;       /* range of possible scores */
554     double avg;            /* expected score of aligning two characters */
555     Blast_ScoreFreq freq;  /* score frequency data */
556 
557     score_range = max_score - min_score + 1;
558     avg = 0.0;
559     for (i = 0;  i < score_range;  i++) {
560         avg += (min_score + i) * probs[i];
561     }
562     freq.score_min = min_score;
563     freq.score_max = max_score;
564     freq.obs_min = min_score;
565     freq.obs_max = max_score;
566     freq.sprob0 = probs;
567     freq.sprob = &probs[-min_score];
568     freq.score_avg = avg;
569 
570     return Blast_KarlinLambdaNR(&freq, lambda0);
571 }
572 
573 
574 /** Fill a two-dimensional array with the frequency ratios that
575  * underlie a position specific score matrix (PSSM).
576  *
577  * @param returnRatios     a two-dimensional array with BLASTAA_SIZE
578  *                         columns
579  * @param numPositions     the number of rows in returnRatios
580  * @param query            query sequence data, of length numPositions
581  * @param matrixName       the name of the position independent matrix
582  *                         corresponding to this PSSM
583  * @param startNumerator   position-specific data used to generate the
584  *                         PSSM
585  * @return   0 on success; -1 if the named matrix isn't known, or if
586  *           there was a memory error
587  * @todo find out what start numerator is.
588  */
589 static int
s_GetPosBasedStartFreqRatios(double ** returnRatios,Int4 numPositions,Uint1 * query,const char * matrixName,double ** startNumerator)590 s_GetPosBasedStartFreqRatios(double ** returnRatios,
591                              Int4 numPositions,
592                              Uint1 * query,
593                              const char *matrixName,
594                              double **startNumerator)
595 {
596     Int4 i,j;                            /* loop indices */
597     SFreqRatios * stdFreqRatios = NULL;  /* frequency ratios for the
598                                             named matrix. */
599     double *standardProb;                /* probabilities of each
600                                             letter*/
601     const double kPosEpsilon = 0.0001;   /* values below this cutoff
602                                             are treated specially */
603 
604     stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
605     if (stdFreqRatios == NULL) {
606         return -1;
607     }
608     for (i = 0;  i < numPositions;  i++) {
609         for (j = 0;  j < BLASTAA_SIZE;  j++) {
610             returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
611         }
612     }
613     stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
614 
615     standardProb = BLAST_GetStandardAaProbabilities();
616     if(standardProb == NULL) {
617         return -1;
618     }
619     /*reverse multiplication done in posit.c*/
620     for (i = 0;  i < numPositions;  i++) {
621         for (j = 0;  j < BLASTAA_SIZE;  j++) {
622             if ((standardProb[query[i]] > kPosEpsilon) &&
623                 (standardProb[j] > kPosEpsilon) &&
624                 (j != eStopChar) && (j != eXchar) &&
625                 (startNumerator[i][j] > kPosEpsilon)) {
626                 returnRatios[i][j] = startNumerator[i][j] / standardProb[j];
627             }
628         }
629     }
630     sfree(standardProb);
631 
632     return 0;
633 }
634 
635 
636 /**
637  * Fill a two-dimensional array with the frequency ratios that underlie the
638  * named score matrix.
639  *
640  * @param returnRatios  a two-dimensional array of size
641  *                      BLASTAA_SIZE x  BLASTAA_SIZE
642  * @param matrixName    the name of a matrix
643  * @return   0 on success; -1 if the named matrix isn't known, or if
644  *           there was a memory error
645  */
646 static int
s_GetStartFreqRatios(double ** returnRatios,const char * matrixName)647 s_GetStartFreqRatios(double ** returnRatios,
648                      const char *matrixName)
649 {
650     /* Loop indices */
651     int i,j;
652     /* Frequency ratios for the matrix */
653     SFreqRatios * stdFreqRatios = NULL;
654 
655     stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
656     if (stdFreqRatios == NULL) {
657         return -1;
658     }
659     for (i = 0;  i < BLASTAA_SIZE;  i++) {
660         for (j = 0;  j < BLASTAA_SIZE;  j++) {
661             returnRatios[i][j] = stdFreqRatios->data[i][j];
662         }
663     }
664     stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
665 
666     return 0;
667 }
668 
669 
670 /** SCALING_FACTOR is a multiplicative factor used to get more bits of
671  * precision in the integer matrix scores. It cannot be arbitrarily
672  * large because we do not want total alignment scores to exceed
673  * -(BLAST_SCORE_MIN) */
674 #define SCALING_FACTOR 32
675 
676 
677 /**
678  * Produce a scaled-up version of the position-specific matrix
679  * with a given set of position-specific residue frequencies.
680  *
681  * @param fillPosMatrix     is the matrix to be filled
682  * @param matrixName        name of the standard substitution matrix [in]
683  * @param posFreqs          PSSM's frequency ratios [in]
684  * @param query             Query sequence data [in]
685  * @param queryLength       Length of the query sequence above [in]
686  * @param sbp               stores various parameters of the search
687  * @param scale_factor      amount by which ungapped parameters should be
688  *                          scaled.
689  * @return 0 on success; -1 on failure
690  */
691 static int
s_ScalePosMatrix(int ** fillPosMatrix,const char * matrixName,double ** posFreqs,Uint1 * query,int queryLength,BlastScoreBlk * sbp,double scale_factor)692 s_ScalePosMatrix(int ** fillPosMatrix,
693                  const char * matrixName,
694                  double ** posFreqs,
695                  Uint1 * query,
696                  int queryLength,
697                  BlastScoreBlk* sbp,
698                  double scale_factor)
699 {
700     /* Data used by scaling routines */
701     Kappa_posSearchItems *posSearch = NULL;
702     /* A reduced collection of search parameters used by PSI-blast */
703     Kappa_compactSearchItems *compactSearch = NULL;
704     /* Representation of a PSSM internal to PSI-blast */
705     _PSIInternalPssmData* internal_pssm = NULL;
706     /* return code */
707     int status = 0;
708 
709     posSearch = Kappa_posSearchItemsNew(queryLength, matrixName,
710                                         fillPosMatrix, posFreqs);
711     compactSearch = Kappa_compactSearchItemsNew(query, queryLength, sbp);
712     /* Copy data into new structures */
713     internal_pssm = _PSIInternalPssmDataNew(queryLength, BLASTAA_SIZE);
714     if (posSearch == NULL || compactSearch == NULL || internal_pssm == NULL) {
715         status = -1;
716         goto cleanup;
717     }
718     _PSICopyMatrix_int(internal_pssm->pssm, posSearch->posMatrix,
719                        internal_pssm->ncols, internal_pssm->nrows);
720     _PSICopyMatrix_int(internal_pssm->scaled_pssm,
721                        posSearch->posPrivateMatrix,
722                        internal_pssm->ncols, internal_pssm->nrows);
723     _PSICopyMatrix_double(internal_pssm->freq_ratios,
724                           posSearch->posFreqs, internal_pssm->ncols,
725                           internal_pssm->nrows);
726     status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp,
727                                          compactSearch->standardProb);
728     if (status != 0) {
729         goto cleanup;
730     }
731     /* Copy data from new structures to posSearchItems */
732     _PSICopyMatrix_int(posSearch->posMatrix, internal_pssm->pssm,
733                        internal_pssm->ncols, internal_pssm->nrows);
734     _PSICopyMatrix_int(posSearch->posPrivateMatrix,
735                        internal_pssm->scaled_pssm,
736                        internal_pssm->ncols, internal_pssm->nrows);
737     _PSICopyMatrix_double(posSearch->posFreqs,
738                           internal_pssm->freq_ratios,
739                           internal_pssm->ncols, internal_pssm->nrows);
740     status = Kappa_impalaScaling(posSearch, compactSearch, (double)
741                                  scale_factor, FALSE, sbp);
742 cleanup:
743     internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
744     posSearch = Kappa_posSearchItemsFree(posSearch);
745     compactSearch = Kappa_compactSearchItemsFree(compactSearch);
746 
747     return status;
748 }
749 
750 
751 /**
752  * Convert an array of HSPs to a list of BlastCompo_Alignment objects.
753  * The context field of each BlastCompo_Alignment is set to point to the
754  * corresponding HSP.
755  *
756  * @param self                 the array of alignment to be filled
757  * @param numAligns            number of alignments
758  * @param hsp_array             an array of HSPs
759  * @param hspcnt                the length of hsp_array
760  * @param init_context          the initial context to process
761  * @param queryInfo            information about the concatenated query
762  * @param localScalingFactor    the amount by which this search is scaled
763  *
764  * @return the new list of alignments; or NULL if there is an out-of-memory
765  *         error (or if the original array is empty)
766  */
767 static int
s_ResultHspToDistinctAlign(BlastCompo_Alignment ** self,int * numAligns,BlastHSP * hsp_array[],Int4 hspcnt,int init_context,const BlastQueryInfo * queryInfo,double localScalingFactor)768 s_ResultHspToDistinctAlign(BlastCompo_Alignment **self,
769                            int *numAligns,
770                            BlastHSP * hsp_array[], Int4 hspcnt,
771                            int init_context,
772                            const BlastQueryInfo* queryInfo,
773                            double localScalingFactor)
774 {
775     BlastCompo_Alignment * tail[6];        /* last element in aligns */
776     int hsp_index;                             /* loop index */
777     int frame_index;
778 
779     for (frame_index = 0; frame_index < 6; frame_index++) {
780         tail[frame_index] = NULL;
781         numAligns[frame_index] = 0;
782     }
783 
784     for (hsp_index = 0;  hsp_index < hspcnt;  hsp_index++) {
785         BlastHSP * hsp = hsp_array[hsp_index]; /* current HSP */
786         BlastCompo_Alignment * new_align;      /* newly-created alignment */
787         frame_index = hsp->context - init_context;
788         ASSERT(frame_index < 6 && frame_index >= 0);
789         /* Incoming alignments will have coordinates of the query
790            portion relative to a particular query context; they must
791            be shifted for used in the composition_adjustment library.
792         */
793         new_align =
794             BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
795                                     eDontAdjustMatrix,
796                                     hsp->query.offset, hsp->query.end, hsp->context,
797                                     hsp->subject.offset, hsp->subject.end,
798                                     hsp->subject.frame, hsp);
799         if (new_align == NULL) /* out of memory */
800             return -1;
801         if (tail[frame_index] == NULL) { /* if the list aligns is empty; */
802             /* make new_align the first element in the list */
803             self[frame_index] = new_align;
804         } else {
805             /* otherwise add new_align to the end of the list */
806             tail[frame_index]->next = new_align;
807         }
808         tail[frame_index] = new_align;
809         numAligns[frame_index]++;
810     }
811     return 0;
812 }
813 
814 
815 /**
816  * Redo a S-W alignment using an x-drop alignment.  The result will
817  * usually be the same as the S-W alignment. The call to ALIGN_EX
818  * attempts to force the endpoints of the alignment to match the
819  * optimal endpoints determined by the Smith-Waterman algorithm.
820  * ALIGN_EX is used, so that if the data structures for storing BLAST
821  * alignments are changed, the code will not break
822  *
823  * @param query         the query data
824  * @param queryStart    start of the alignment in the query sequence
825  * @param queryEnd      end of the alignment in the query sequence,
826  *                      as computed by the Smith-Waterman algorithm
827  * @param subject       the subject (database) sequence
828  * @param matchStart    start of the alignment in the subject sequence
829  * @param matchEnd      end of the alignment in the query sequence,
830  *                      as computed by the Smith-Waterman algorithm
831  * @param gap_align     parameters for a gapped alignment
832  * @param scoringParams Settings for gapped alignment.[in]
833  * @param score         score computed by the Smith-Waterman algorithm
834  * @param queryAlignmentExtent  length of the alignment in the query sequence,
835  *                              as computed by the x-drop algorithm
836  * @param matchAlignmentExtent  length of the alignment in the subject
837  *                              sequence, as computed by the x-drop algorithm
838  * @param newScore              alignment score computed by the x-drop
839  *                              algorithm
840  */
841 static void
s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,Int4 queryStart,Int4 queryEnd,BlastCompo_SequenceData * subject,Int4 matchStart,Int4 matchEnd,BlastGapAlignStruct * gap_align,const BlastScoringParameters * scoringParams,Int4 score,Int4 * queryAlignmentExtent,Int4 * matchAlignmentExtent,Int4 * newScore)842 s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,
843                             Int4 queryStart,
844                             Int4 queryEnd,
845                             BlastCompo_SequenceData * subject,
846                             Int4 matchStart,
847                             Int4 matchEnd,
848                             BlastGapAlignStruct* gap_align,
849                             const BlastScoringParameters* scoringParams,
850                             Int4 score,
851                             Int4 * queryAlignmentExtent,
852                             Int4 * matchAlignmentExtent,
853                             Int4 * newScore)
854 {
855     Int4 XdropAlignScore;         /* alignment score obtained using X-dropoff
856                                    * method rather than Smith-Waterman */
857     Int4 doublingCount = 0;       /* number of times X-dropoff had to be
858                                    * doubled */
859     Int4 gap_x_dropoff_orig = gap_align->gap_x_dropoff;
860 
861     GapPrelimEditBlockReset(gap_align->rev_prelim_tback);
862     GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
863     do {
864         XdropAlignScore =
865             ALIGN_EX(&(query->data[queryStart]) - 1,
866                      &(subject->data[matchStart]) - 1,
867                      queryEnd - queryStart + 1, matchEnd - matchStart + 1,
868                      queryAlignmentExtent,
869                      matchAlignmentExtent, gap_align->fwd_prelim_tback,
870                      gap_align, scoringParams, queryStart - 1, FALSE, FALSE,
871                      NULL);
872 
873         gap_align->gap_x_dropoff *= 2;
874         doublingCount++;
875         if((XdropAlignScore < score) && (doublingCount < 3)) {
876             GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
877         }
878     } while((XdropAlignScore < score) && (doublingCount < 3));
879 
880     gap_align->gap_x_dropoff = gap_x_dropoff_orig;
881     *newScore = XdropAlignScore;
882 }
883 
884 
885 /**
886  * BLAST-specific information that is associated with a
887  * BlastCompo_MatchingSequence.
888  */
889 typedef struct
890 BlastKappa_SequenceInfo {
891     EBlastProgramType prog_number; /**< identifies the type of blast
892                                         search being performed. The type
893                                         of search determines how sequence
894                                         data should be obtained. */
895     const BlastSeqSrc* seq_src;    /**< BLAST sequence data source */
896     BlastSeqSrcGetSeqArg seq_arg;  /**< argument to GetSequence method
897                                      of the BlastSeqSrc (@todo this
898                                      structure was designed to be
899                                      allocated on the stack, i.e.: in
900                                      Kappa_MatchingSequenceInitialize) */
901 } BlastKappa_SequenceInfo;
902 
903 
904 /** Release the resources associated with a matching sequence. */
905 static void
s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)906 s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
907 {
908     if (self != NULL) {
909         if (self->index >=0) {
910             BlastKappa_SequenceInfo * local_data = self->local_data;
911             if (self->length > 0) {
912                 BlastSeqSrcReleaseSequence(local_data->seq_src,
913                                    &local_data->seq_arg);
914                 BlastSequenceBlkFree(local_data->seq_arg.seq);
915             }
916             free(self->local_data);
917         }
918         self->local_data = NULL;
919     }
920 }
921 
922 /**
923  * Do a simple gapped extension to the right from the beginning of query and
924  * subject ranges examining only matches and mismatches. The extension stops
925  * when there are more than max_shift mismatches or mismatches or gaps are not
926  * followed by two identical matches. This is a simplified version of the
927  * Danielle and Jean Thierry-Miegs' jumper
928  * alignment implemented in NCBI Magic
929  * http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/Download/Downloads.html
930  *
931  * @param query_seq Query sequence [in]
932  * @param query_len Query length [in]
933  * @param subject_seq Subject sequence [in]
934  * @param subject_len Subject length [in]
935  * @param max_shift Maximum number of mismatches or gaps, extension stops if
936  *        this number is reached [in]
937  * @param query_ext_len Extension length on the query [out]
938  * @param subject_ext_len Extension length on the subject [out]
939  * @param align_len Alignment length [out]
940  * @return Number of identical residues
941  */
s_ExtendRight(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)942 static int s_ExtendRight(Uint1* query_seq, int query_len,
943                          Uint1* subject_seq, int subject_len,
944                          int max_shift,
945                          int* query_ext_len, int* subject_ext_len,
946                          int* align_len)
947 {
948     int num_identical = 0;
949     int q_pos, s_pos;
950     int gaps_in_query = 0;
951     int gaps_in_subject = 0;
952     q_pos = 0;
953     s_pos = 0;
954     while (q_pos < query_len && s_pos < subject_len) {
955         int n;
956         int match = 0;
957 
958         while (q_pos < query_len && s_pos < subject_len
959                && query_seq[q_pos] == subject_seq[s_pos]) {
960 
961             num_identical++;
962             q_pos++;
963             s_pos++;
964         }
965 
966         /* try to skip mismatches or gaps */
967         for (n=1; n < max_shift && q_pos + n + 1 < query_len
968                  && s_pos + n + 1 < subject_len && !match; n++) {
969 
970             /* mismatches */
971             if (query_seq[q_pos + n] == subject_seq[s_pos + n]
972                 && query_seq[q_pos + n + 1] == subject_seq[s_pos + n + 1]) {
973 
974                 /* we have already checked that two positions behind mismatches
975                    match so we can advance further */
976                 q_pos += n + 2;
977                 s_pos += n + 2;
978                 num_identical += 2;
979                 match = 1;
980             }
981 
982             /* gap in subject */
983             if (!match && query_seq[q_pos + n] == subject_seq[s_pos]
984                 && query_seq[q_pos + n + 1] == subject_seq[s_pos + 1]) {
985 
986                 q_pos += n + 2;
987                 s_pos += 2;
988                 num_identical += 2;
989                 gaps_in_subject += n;
990                 match = 1;
991             }
992 
993             /* gap in query */
994             if (!match && query_seq[q_pos] == subject_seq[s_pos + n]
995                 && query_seq[q_pos + 1] == subject_seq[s_pos + n + 1]) {
996 
997                 q_pos += 2;
998                 s_pos += n + 2;
999                 num_identical += 2;
1000                 gaps_in_query += n;
1001                 match = 1;
1002             }
1003         }
1004 
1005         if (match) {
1006             continue;
1007         }
1008 
1009         /* exit the loop */
1010         break;
1011     }
1012     *query_ext_len = q_pos;
1013     *subject_ext_len = s_pos;
1014     *align_len = q_pos > s_pos ? q_pos + gaps_in_query : s_pos + gaps_in_subject;
1015 
1016     return num_identical;
1017 }
1018 
1019 
1020 /**
1021  * Extend left from the end of the sequence and subject ranges and count
1022  * identities. The extension stops when there are more than max_shift
1023  * mismatches or mismatches or gaps are not followed by two identical matches.
1024  * See description for s_ExtendRight for more details.
1025  *
1026  * @param query_seq Query sequence [in]
1027  * @param query_len Query length [in]
1028  * @param subject_seq Subject sequence [in]
1029  * @param subject_len Subject length [in]
1030  * @param max_shift Maximum number of mismatches or gaps, extension stops if
1031  *        this number is reached [in]
1032  * @param query_ext_len Extension length on the query [out]
1033  * @param subject_ext_len Extension length on the subject [out]
1034  * @param align_len Alignment length [out]
1035  * @return Number of identical residues
1036  */
s_ExtendLeft(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)1037 static int s_ExtendLeft(Uint1* query_seq, int query_len,
1038                         Uint1* subject_seq, int subject_len,
1039                         int max_shift,
1040                         int* query_ext_len, int* subject_ext_len,
1041                         int* align_len)
1042 {
1043     int q_pos = query_len - 1;
1044     int s_pos = subject_len - 1;
1045     int num_identical = 0;
1046     int gaps_in_query = 0;
1047     int gaps_in_subject = 0;
1048     while (q_pos >= 0 && s_pos >= 0) {
1049         int n;
1050         int match = 0;
1051 
1052         /* process identies */
1053         while (q_pos > 0 && s_pos > 0 && query_seq[q_pos] == subject_seq[s_pos]) {
1054             num_identical++;
1055             q_pos--;
1056             s_pos--;
1057         }
1058 
1059         /* try to skip mismatches or gaps */
1060         for (n=1;n < max_shift && q_pos - n - 1 > 0 && s_pos - n - 1 > 0
1061                  && !match; n++) {
1062 
1063             /* mismatch */
1064             if (query_seq[q_pos - n] == subject_seq[s_pos - n]
1065                 && query_seq[q_pos - n - 1] == subject_seq[s_pos - n - 1]) {
1066                 q_pos -= n + 2;
1067                 s_pos -= n + 2;
1068                 num_identical += 2;
1069                 match = 1;
1070             }
1071 
1072             /* gap in subject */
1073             if (!match && query_seq[q_pos - n] == subject_seq[s_pos]
1074                 && query_seq[q_pos - n - 1] == subject_seq[s_pos - 1]) {
1075                 q_pos -= n + 2;
1076                 s_pos -= 2;
1077                 num_identical += 2;
1078                 gaps_in_subject += n;
1079                 match = 1;
1080             }
1081 
1082             /* gap in query */
1083             if (!match && query_seq[q_pos] == subject_seq[s_pos - n]
1084                 && query_seq[q_pos - 1] == subject_seq[s_pos - n - 1]) {
1085                 q_pos -= 2;
1086                 s_pos -= n + 2;
1087                 num_identical += 2;
1088                 gaps_in_query += n;
1089                 match = 1;
1090             }
1091         }
1092 
1093         if (match) {
1094             continue;
1095         }
1096 
1097         break;
1098     }
1099     *query_ext_len = query_len - q_pos - 1;
1100     *subject_ext_len = subject_len - s_pos - 1;
1101     *align_len += *query_ext_len > *subject_ext_len ?
1102         *query_ext_len + gaps_in_query : *subject_ext_len + gaps_in_subject;
1103 
1104     return num_identical;
1105 }
1106 
1107 
1108 /**
1109  * Get hash for a word of word_size residues assuming 28-letter alphabet
1110  *
1111  * @param data Sequence [in]
1112  * @param word_size Word size [in]
1113  * @return Hash value
1114  */
s_GetHash(const Uint1 * data,int word_size)1115 static Uint8 s_GetHash(const Uint1* data, int word_size)
1116 {
1117     Uint8 hash = 0;
1118     int k;
1119     for (k=0;k < word_size;k++) {
1120         hash <<= 5;
1121         hash += (Int8)data[k];
1122     }
1123     return hash;
1124 }
1125 
1126 
1127 /**
1128  * Find a local number of identical residues in two aligned sequences by
1129  * finding word matches and doing a simple gapped extensions from the word hits
1130  *
1131  * @param query_seq Query sequence [in]
1132  * @param query_hashes Array of query words with index of each word
1133  *        corresponding to word position in the query [in]
1134  * @param query_len Query length [in]
1135  * @param subject_seq Subject sequence [in]
1136  * @param subject_len Subject length [in]
1137  * @param max_shift Maximum number of local mismatches or gaps for extensions
1138  *        [in]
1139  * @return Number of identical residues
1140  */
s_FindNumIdentical(Uint1 * query_seq,const Uint8 * query_hashes,int query_len,Uint1 * subject_seq,int subject_len,int max_shift)1141 static int s_FindNumIdentical(Uint1* query_seq,
1142                               const Uint8* query_hashes,
1143                               int query_len,
1144                               Uint1* subject_seq,
1145                               int subject_len,
1146                               int max_shift)
1147 {
1148     int word_size = 8;         /* word size for k-mer matching */
1149     Uint8 hash = 0;
1150     Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
1151                                                     values */
1152     int query_from = 0;
1153     int subject_from = 0;
1154 
1155     int s_pos;                 /* position in the subject sequence */
1156     int num_identical = 0;     /* number of identical residues found */
1157     Boolean match = FALSE;
1158 
1159     /* if query or subject length is smaller than word size, exit */
1160     if (!query_seq || !query_hashes || !subject_seq
1161         || query_len < word_size || subject_len < word_size) {
1162 
1163         return 0;
1164     }
1165 
1166     /* for each subject position */
1167     for (s_pos = 0; s_pos < subject_len - word_size; s_pos++) {
1168         int q_pos;
1169 
1170         /* find word hash */
1171         if (s_pos == 0 || match) {
1172             hash = s_GetHash(&subject_seq[s_pos], word_size);
1173         }
1174         else {
1175             hash <<= 5;
1176             hash &= mask;
1177             hash += subject_seq[s_pos + word_size - 1];
1178         }
1179 
1180         /* find matching query word; index of hash is position of the word
1181            the query */
1182         for (q_pos = query_from;q_pos < query_len - word_size; q_pos++) {
1183             if (query_hashes[q_pos] == hash) {
1184                 break;
1185             }
1186         }
1187 
1188         /* if match */
1189         if (q_pos < query_len - word_size) {
1190             int query_start = q_pos;
1191             int subject_start = s_pos;
1192 
1193             int query_left_len, query_right_len;
1194             int subject_left_len, subject_right_len;
1195             int align_len_left=0, align_len_right=0;
1196 
1197             match = TRUE;
1198             num_identical += word_size;
1199 
1200             /* extend left from word match */
1201             num_identical += s_ExtendLeft(query_seq + query_from,
1202                                           query_start - query_from,
1203                                           subject_seq + subject_from,
1204                                           subject_start - subject_from,
1205                                           max_shift,
1206                                           &query_left_len, &subject_left_len,
1207                                           &align_len_left);
1208 
1209             /* extend right from word match */
1210             num_identical += s_ExtendRight(query_seq + query_start + word_size,
1211                                        query_len - query_start - word_size,
1212                                        subject_seq + subject_start + word_size,
1213                                        subject_len - subject_start - word_size,
1214                                        max_shift,
1215                                        &query_right_len, &subject_right_len,
1216                                        &align_len_right);
1217 
1218 
1219             /* disregard already matched and extended words when matching
1220                further positions */
1221 
1222             query_from = query_start + word_size + query_right_len;
1223             subject_from = subject_start + word_size + subject_right_len;
1224             /* s_pos will be incremented in the loop */
1225             s_pos = subject_from - 1;
1226         }
1227         else {
1228             match = FALSE;
1229         }
1230     }
1231 
1232     return num_identical;
1233 }
1234 
1235 /**
1236  * Test whether the aligned parts of two sequences that
1237  * have a high-scoring gapless alignment are nearly identical.
1238  *
1239  * First extend from the left end of the query and subject ranges and stop if
1240  * there are too manu mismatches. Then extend from the right end. Then for the
1241  * remaining protion of ths sequences find matching words and extend left and
1242  * right from the word hit. Repeat the last steo until the whole alignment
1243  * ranges are processed.
1244  *
1245  * @params seqData Subject sequence [in]
1246  * @params seqOffse Starting offset of the subject sequence in alignment data
1247  *        [in]
1248  * @params queryData Query sequence [in]
1249  * @params queryOffset Starting offset of the query sequence in alignment data
1250  *         [in]
1251  * @param query_words Array of query words with word index corresponding to
1252  *        word's position in the query [in]
1253  * @param align Alignment data [in]
1254  * @return True if sequence parts are nearly identical, false otherwise
1255  */
1256 static Boolean
s_TestNearIdentical(const BlastCompo_SequenceData * seqData,const int seqOffset,const BlastCompo_SequenceData * queryData,const int queryOffset,const Uint8 * query_words,const BlastCompo_Alignment * align)1257 s_TestNearIdentical(const BlastCompo_SequenceData* seqData,
1258                     const int seqOffset,
1259                     const BlastCompo_SequenceData* queryData,
1260                     const int queryOffset,
1261                     const Uint8* query_words,
1262                     const BlastCompo_Alignment* align)
1263 {
1264     int qStart = align->queryStart - queryOffset;
1265     /* align->queryEnd points to one position past alignment end */
1266     int qEnd = align->queryEnd - queryOffset - 1;
1267     int sStart = align->matchStart - seqOffset;
1268     int sEnd = align->matchEnd - seqOffset - 1;
1269     const double kMinFractionNearIdentical = 0.96;
1270     int max_shift = 8;
1271 
1272     int query_len = qEnd - qStart + 1;
1273     int subject_len = sEnd - sStart + 1;
1274     int align_len = MIN(query_len, subject_len);
1275 
1276     int query_left_len = 0;
1277     int subject_left_len = 0;
1278     int query_right_len = 0;
1279     int subject_right_len = 0;
1280     int align_left_len = 0;
1281     int align_right_len = 0;
1282 
1283     double fraction_identical;
1284 
1285     /* first find number of identies going from the beginning of the query
1286        and subject ranges */
1287     int num_identical = s_ExtendRight(queryData->data + qStart, query_len,
1288                                       seqData->data + sStart, subject_len,
1289                                       max_shift,
1290                                       &query_right_len, &subject_right_len,
1291                                       &align_right_len);
1292 
1293     /* if the whole query range was processed return near identical status */
1294     if (query_right_len >= query_len || subject_right_len >= subject_len) {
1295         fraction_identical = (double)num_identical / (double)align_len;
1296         ASSERT(fraction_identical - 1.0 < 1e-10);
1297         return fraction_identical > kMinFractionNearIdentical;
1298     }
1299 
1300     /* find the number of identies going from the end of the query and subject
1301        ranges */
1302     num_identical += s_ExtendLeft(queryData->data + qStart + query_right_len,
1303                                   query_len - query_right_len,
1304                                   seqData->data + sStart + subject_right_len,
1305                                   subject_len - subject_right_len,
1306                                   max_shift,
1307                                   &query_left_len, &subject_left_len,
1308                                   &align_left_len);
1309 
1310     /* if the whole alignment ranges where covered, return the near identical
1311        status */
1312     if (query_left_len + query_right_len >= query_len
1313         || subject_left_len + subject_right_len >= subject_len) {
1314 
1315         fraction_identical = (double)num_identical / (double)(align_len);
1316         ASSERT(fraction_identical - 1.0 < 1e-10);
1317         return fraction_identical > kMinFractionNearIdentical;
1318     }
1319 
1320     /* find the number of identical matches in the middle portion of the
1321        alignment ranges */
1322     num_identical += s_FindNumIdentical(queryData->data + qStart + query_right_len,
1323                             query_words + qStart + query_right_len,
1324                             query_len - query_left_len - query_right_len,
1325                             seqData->data + sStart + subject_right_len,
1326                             subject_len - subject_left_len - subject_right_len,
1327                             max_shift);
1328 
1329     fraction_identical = (double)num_identical / (double)align_len;
1330     ASSERT(fraction_identical - 1.0 < 1e-10);
1331     if (fraction_identical > kMinFractionNearIdentical) {
1332         return TRUE;
1333     }
1334     else {
1335         return FALSE;
1336     }
1337 }
1338 
1339 
1340 /**
1341  * Initialize a new matching sequence, obtaining information about the
1342  * sequence from the search.
1343  *
1344  * @param self              object to be initialized
1345  * @param seqSrc            A pointer to a source from which sequence data
1346  *                          may be obtained
1347  * @param program_number    identifies the type of blast search being
1348  *                          performed.
1349  * @param default_db_genetic_code   default genetic code to use when
1350  *                          subject sequences are translated and there is
1351  *                          no other guidance on what code to use
1352  * @param subject_index     index of the matching sequence in the database
1353  */
1354 static int
s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,EBlastProgramType program_number,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,Int4 subject_index,BlastSeqSrcSetRangesArg * ranges)1355 s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,
1356                              EBlastProgramType program_number,
1357                              const BlastSeqSrc* seqSrc,
1358                              Int4 default_db_genetic_code,
1359                              Int4 subject_index,
1360                              BlastSeqSrcSetRangesArg * ranges)
1361 {
1362     BlastKappa_SequenceInfo * seq_info;  /* BLAST-specific sequence
1363                                             information */
1364     self->length = 0;
1365     self->local_data = NULL;
1366 
1367     seq_info = malloc(sizeof(BlastKappa_SequenceInfo));
1368     if (seq_info != NULL) {
1369         self->local_data = seq_info;
1370 
1371         seq_info->seq_src      = seqSrc;
1372         seq_info->prog_number  = program_number;
1373 
1374         memset((void*) &seq_info->seq_arg, 0, sizeof(seq_info->seq_arg));
1375         seq_info->seq_arg.oid = self->index = subject_index;
1376         seq_info->seq_arg.check_oid_exclusion = TRUE;
1377         seq_info->seq_arg.ranges = ranges;
1378 
1379         if( program_number == eBlastTypeTblastn ) {
1380             seq_info->seq_arg.encoding = eBlastEncodingNcbi4na;
1381         } else {
1382             seq_info->seq_arg.encoding = eBlastEncodingProtein;
1383         }
1384         if (BlastSeqSrcGetSequence(seqSrc, &seq_info->seq_arg) >= 0) {
1385             self->length =
1386                 BlastSeqSrcGetSeqLen(seqSrc, (void*) &seq_info->seq_arg);
1387 
1388             /* If the subject is translated and the BlastSeqSrc implementation
1389              * doesn't provide a genetic code string, use the default genetic
1390              * code for all subjects (as in the C toolkit) */
1391             if (Blast_SubjectIsTranslated(program_number) &&
1392                 seq_info->seq_arg.seq->gen_code_string == NULL) {
1393                 seq_info->seq_arg.seq->gen_code_string =
1394                              GenCodeSingletonFind(default_db_genetic_code);
1395                 ASSERT(seq_info->seq_arg.seq->gen_code_string);
1396             }
1397         } else {
1398             self->length = 0;
1399         }
1400     }
1401     if (self->length == 0) {
1402         /* Could not obtain the required data */
1403         s_MatchingSequenceRelease(self);
1404         return -1;
1405     } else {
1406         return 0;
1407     }
1408 }
1409 
1410 
1411 /** NCBIstdaa encoding for 'X' character */
1412 #define BLASTP_MASK_RESIDUE 21
1413 /** Default instructions and mask residue for SEG filtering */
1414 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
1415 
1416 
1417 /**
1418  * Filter low complexity regions from the sequence data; uses the SEG
1419  * algorithm.
1420  *
1421  * @param seqData            data to be filtered
1422  * @param program_name       type of search being performed
1423  * @return   0 for success; -1 for out-of-memory
1424  */
1425 static int
s_DoSegSequenceData(BlastCompo_SequenceData * seqData,EBlastProgramType program_name,Boolean * is_seq_biased)1426 s_DoSegSequenceData(BlastCompo_SequenceData * seqData,
1427                     EBlastProgramType program_name,
1428                     Boolean* is_seq_biased)
1429 {
1430     int status = 0;
1431     BlastSeqLoc* mask_seqloc = NULL;
1432     SBlastFilterOptions* filter_options = NULL;
1433 
1434     status = BlastFilteringOptionsFromString(program_name,
1435                                              BLASTP_MASK_INSTRUCTIONS,
1436                                              &filter_options, NULL);
1437     if (status == 0) {
1438         status = BlastSetUp_Filter(program_name, seqData->data,
1439                                    seqData->length, 0, filter_options,
1440                                    &mask_seqloc, NULL);
1441         filter_options = SBlastFilterOptionsFree(filter_options);
1442     }
1443     if (is_seq_biased) {
1444         *is_seq_biased = (mask_seqloc != NULL);
1445     }
1446     if (status == 0) {
1447         Blast_MaskTheResidues(seqData->data, seqData->length,
1448                               FALSE, mask_seqloc, FALSE, 0);
1449     }
1450     if (mask_seqloc != NULL) {
1451         mask_seqloc = BlastSeqLocFree(mask_seqloc);
1452     }
1453     return status;
1454 }
1455 
1456 
1457 /**
1458  * Obtain a string of translated data
1459  *
1460  * @param self          the sequence from which to obtain the data [in]
1461  * @param range         the range and translation frame to get [in]
1462  * @param seqData       the resulting data [out]
1463  * @param queryData     the query sequence [in]
1464  * @param queryOffset   offset for align if there are multiple queries
1465  * @param align          information about the alignment between query and subject
1466  * @param shouldTestIdentical did alignment pass a preliminary test in
1467  *                       redo_alignment.c that indicates the sequence
1468  *                        pieces may be near identical
1469  *
1470  * @return 0 on success; -1 on failure
1471  */
1472 static int
s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1473 s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
1474                              const BlastCompo_SequenceRange * range,
1475                              BlastCompo_SequenceData * seqData,
1476                              const BlastCompo_SequenceRange * q_range,
1477                              BlastCompo_SequenceData * queryData,
1478                              const Uint8* query_words,
1479                              const BlastCompo_Alignment *align,
1480                              const Boolean shouldTestIdentical,
1481                              const ECompoAdjustModes compo_adjust_mode,
1482                              const Boolean isSmithWaterman,
1483                              Boolean* subject_maybe_biased)
1484 {
1485     int status = 0;
1486     BlastKappa_SequenceInfo * local_data; /* BLAST-specific
1487                                              information associated
1488                                              with the sequence */
1489     Uint1 * translation_buffer;           /* a buffer for the translated,
1490                                              amino-acid sequence */
1491     Int4 translated_length;  /* length of the translated sequence */
1492     int translation_frame;   /* frame in which to translate */
1493     Uint1 * na_sequence;     /* the nucleotide sequence */
1494     int translation_start;   /* location in na_sequence to start
1495                                 translating */
1496     int num_nucleotides;     /* the number of nucleotides to be translated */
1497 
1498     local_data = self->local_data;
1499     na_sequence = local_data->seq_arg.seq->sequence_start;
1500 
1501     /* Initialize seqData to nil, in case this routine fails */
1502     seqData->buffer = NULL;
1503     seqData->data = NULL;
1504     seqData->length = 0;
1505 
1506     translation_frame = range->context;
1507     if (translation_frame > 0) {
1508         translation_start = 3 * range->begin;
1509     } else {
1510         translation_start =
1511             self->length - 3 * range->end + translation_frame + 1;
1512     }
1513     num_nucleotides =
1514         3 * (range->end - range->begin) + ABS(translation_frame) - 1;
1515 
1516     status = Blast_GetPartialTranslation(na_sequence + translation_start,
1517                                          num_nucleotides,
1518                                          (Int2) translation_frame,
1519                                          local_data->seq_arg.seq->gen_code_string,
1520                                          &translation_buffer,
1521                                          &translated_length,
1522                                          NULL);
1523     if (status == 0) {
1524         seqData->buffer = translation_buffer;
1525         seqData->data   = translation_buffer + 1;
1526         seqData->length = translated_length;
1527 
1528         if ( !(KAPPA_TBLASTN_NO_SEG_SEQUENCE) ) {
1529             if (compo_adjust_mode
1530                 && (!subject_maybe_biased || *subject_maybe_biased)) {
1531 
1532                 if ( (!shouldTestIdentical)
1533                      || (shouldTestIdentical
1534                          && (!s_TestNearIdentical(seqData, range->begin,
1535                                                   queryData, q_range->begin,
1536                                                   query_words, align)))) {
1537 
1538                     status = s_DoSegSequenceData(seqData, eBlastTypeTblastn,
1539                                                  subject_maybe_biased);
1540                     if (status != 0) {
1541                         free(seqData->buffer);
1542                         seqData->buffer = NULL;
1543                         seqData->data = NULL;
1544                         seqData->length = 0;
1545                     }
1546                 }
1547             }
1548         }
1549     }
1550     return status;
1551 }
1552 
1553 
1554 /**
1555  * Get a string of protein data from a protein sequence.
1556  *
1557  * @param self          a protein sequence [in]
1558  * @param range         the range to get [in]
1559  * @param seqData       the resulting data [out]
1560  * @param queryData     the query sequence [in]
1561  * @param queryOffset   offset for align if there are multiple queries
1562  * @param align          information about the alignment
1563  *                         between query and subject [in]
1564  * @param shouldTestIdentical did alignment pass a preliminary test in
1565  *                       redo_alignment.c that indicates the sequence
1566  *                        pieces may be near identical [in]
1567  *
1568  * @return 0 on success; -1 on failure
1569  */
1570 static int
s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1571 s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,
1572                           const BlastCompo_SequenceRange * range,
1573                           BlastCompo_SequenceData * seqData,
1574                           const BlastCompo_SequenceRange * q_range,
1575                           BlastCompo_SequenceData * queryData,
1576                           const Uint8* query_words,
1577                           const BlastCompo_Alignment *align,
1578                           const Boolean shouldTestIdentical,
1579                           const ECompoAdjustModes compo_adjust_mode,
1580                           const Boolean isSmithWaterman,
1581                           Boolean* subject_maybe_biased)
1582 
1583 {
1584     int status = 0;       /* return status */
1585     Int4       idx;       /* loop index */
1586     Uint1     *origData;  /* the unfiltered data for the sequence */
1587     /* BLAST-specific sequence information */
1588     BlastKappa_SequenceInfo * local_data = self->local_data;
1589     BLAST_SequenceBlk * seq = self->local_data;
1590 
1591     if (self->local_data == NULL)
1592 	return -1;
1593 
1594     seqData->data = NULL;
1595     seqData->length = 0;
1596     /* Copy the entire sequence (necessary for SEG filtering.) */
1597     seqData->buffer  = calloc((self->length + 2), sizeof(Uint1));
1598     if (seqData->buffer == NULL) {
1599         return -1;
1600     }
1601     /* First and last characters of the buffer MUST be '\0', which is
1602      * true here because the buffer was allocated using calloc. */
1603     seqData->data    = seqData->buffer + 1;
1604     seqData->length  = self->length;
1605 
1606     origData = (self->index >= 0) ? local_data->seq_arg.seq->sequence
1607                             : seq->sequence;
1608     if((self->index < 0) && (align->frame != 0)) {
1609     	int i=0, offsets =0;
1610     	int f = GET_SEQ_FRAME(align->frame);
1611     	int nucl_length = GET_NUCL_LENGTH(self->length);
1612     	seqData->length = GET_TRANSLATED_LENGTH(nucl_length, f);
1613     	for(; i < f; i++) {
1614     		offsets = GET_TRANSLATED_LENGTH(nucl_length, i) +1;
1615     		origData += offsets;
1616     	}
1617     }
1618     /* Copy the sequence data */
1619     for (idx = 0;  idx < seqData->length;  idx++) {
1620         seqData->data[idx] = origData[idx];
1621     }
1622 
1623     if ( !(KAPPA_BLASTP_NO_SEG_SEQUENCE) ) {
1624         if (compo_adjust_mode
1625             && (!subject_maybe_biased || *subject_maybe_biased)) {
1626 
1627             if ( (!shouldTestIdentical)
1628                  || (shouldTestIdentical
1629                      && (!s_TestNearIdentical(seqData, 0, queryData,
1630                                               q_range->begin, query_words,
1631                                               align)))) {
1632 
1633                 status = s_DoSegSequenceData(seqData, eBlastTypeBlastp,
1634                                              subject_maybe_biased);
1635             }
1636         }
1637     }
1638     /* Fit the data to the range. */
1639     seqData ->data    = &seqData->data[range->begin - 1];
1640     *seqData->data++  = '\0';
1641     seqData ->length  = range->end - range->begin;
1642 
1643     if (status != 0) {
1644         free(seqData->buffer);
1645         seqData->buffer = NULL;
1646         seqData->data = NULL;
1647     }
1648     return status;
1649 }
1650 
1651 
1652 /**
1653  * Obtain the sequence data that lies within the given range.
1654  *
1655  * @param self          sequence information [in]
1656  * @param range        range specifying the range of data [in]
1657  * @param seqData       the sequence data obtained [out]
1658  * @param seqData       the resulting data [out]
1659  * @param queryData     the query sequence [in]
1660  * @param queryOffset   offset for align if there are multiple queries
1661  * @param align          information about the alignment between query and subject
1662  * @param shouldTestIdentical did alignment pass a preliminary test in
1663  *                       redo_alignment.c that indicates the sequence
1664  *                        pieces may be near identical
1665  *
1666  * @return 0 on success; -1 on failure
1667  */
1668 static int
s_SequenceGetRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * s_range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * query,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1669 s_SequenceGetRange(const BlastCompo_MatchingSequence * self,
1670                    const BlastCompo_SequenceRange * s_range,
1671                    BlastCompo_SequenceData * seqData,
1672                    const BlastCompo_SequenceData * query,
1673                    const BlastCompo_SequenceRange * q_range,
1674                    BlastCompo_SequenceData * queryData,
1675                    const Uint8* query_words,
1676                    const BlastCompo_Alignment *align,
1677                    const Boolean shouldTestIdentical,
1678                    const ECompoAdjustModes compo_adjust_mode,
1679                    const Boolean isSmithWaterman,
1680                    Boolean* subject_maybe_biased)
1681 {
1682     Int4 idx;
1683     BlastKappa_SequenceInfo * seq_info = self->local_data;
1684     Uint1 *origData = query->data + q_range->begin;
1685     /* Copy the query sequence (necessary for SEG filtering.) */
1686     queryData->length = q_range->end - q_range->begin;
1687     queryData->buffer = calloc((queryData->length + 2), sizeof(Uint1));
1688     queryData->data   = queryData->buffer + 1;
1689 
1690     for (idx = 0;  idx < queryData->length;  idx++) {
1691         /* Copy the sequence data, replacing occurrences of amino acid
1692          * number 24 (Selenocysteine) with number 3 (Cysteine). */
1693         queryData->data[idx] = (origData[idx] != 24) ? origData[idx] : 3;
1694     }
1695     if (seq_info && seq_info->prog_number ==  eBlastTypeTblastn) {
1696         /* The sequence must be translated. */
1697         return s_SequenceGetTranslatedRange(self, s_range, seqData,
1698                                             q_range, queryData, query_words,
1699                                             align, shouldTestIdentical,
1700                                             compo_adjust_mode, isSmithWaterman,
1701                                             subject_maybe_biased);
1702     } else {
1703         return s_SequenceGetProteinRange(self, s_range, seqData,
1704                                          q_range, queryData, query_words,
1705                                          align, shouldTestIdentical,
1706                                          compo_adjust_mode, isSmithWaterman,
1707                                          subject_maybe_biased);
1708     }
1709 }
1710 
1711 
1712 /** Data and data-structures needed to perform a gapped alignment */
1713 typedef struct BlastKappa_GappingParamsContext {
1714     const BlastScoringParameters*
1715         scoringParams;                /**< scoring parameters for a
1716                                            gapped alignment */
1717     BlastGapAlignStruct * gap_align;  /**< additional parameters for a
1718                                            gapped alignment */
1719     BlastScoreBlk* sbp;               /**< the score block for this search */
1720     double localScalingFactor;        /**< the amount by which this
1721                                            search has been scaled */
1722     EBlastProgramType prog_number;    /**< the type of search being
1723                                            performed */
1724 } BlastKappa_GappingParamsContext;
1725 
1726 
1727 /**
1728  * Reads a BlastGapAlignStruct that has been used to compute a
1729  * traceback, and return a BlastCompo_Alignment representing the
1730  * alignment.  The BlastGapAlignStruct is in coordinates local to the
1731  * ranges being aligned; the resulting alignment is in coordinates w.r.t.
1732  * the whole query and subject.
1733  *
1734  * @param gap_align         the BlastGapAlignStruct
1735  * @param *edit_script      the edit script from the alignment; on exit
1736  *                          NULL.  The edit_script is usually
1737  *                          gap_align->edit_script, but we don't want
1738  *                          an implicit side effect on the gap_align.
1739  * @param query_range       the range of the query used in this alignment
1740  * @param subject_range     the range of the subject used in this alignment
1741  * @param matrix_adjust_rule   the rule used to compute the scoring matrix
1742  *
1743  * @return the new alignment on success or NULL on error
1744  */
1745 static BlastCompo_Alignment *
s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,GapEditScript ** edit_script,BlastCompo_SequenceRange * query_range,BlastCompo_SequenceRange * subject_range,EMatrixAdjustRule matrix_adjust_rule)1746 s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,
1747                            GapEditScript ** edit_script,
1748                            BlastCompo_SequenceRange * query_range,
1749                            BlastCompo_SequenceRange * subject_range,
1750                            EMatrixAdjustRule matrix_adjust_rule)
1751 {
1752     /* parameters to BlastCompo_AlignmentNew */
1753     int queryStart, queryEnd, queryIndex, matchStart, matchEnd, frame;
1754     BlastCompo_Alignment * obj; /* the new alignment */
1755 
1756     /* In the composition_adjustment library, the query start/end are
1757        indices into the concatenated query, and so must be shifted.  */
1758     queryStart = gap_align->query_start + query_range->begin;
1759     queryEnd   = gap_align->query_stop + query_range->begin;
1760     queryIndex = query_range->context;
1761     matchStart = gap_align->subject_start + subject_range->begin;
1762     matchEnd   = gap_align->subject_stop  + subject_range->begin;
1763     frame      = subject_range->context;
1764 
1765     obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
1766                                   queryStart, queryEnd, queryIndex,
1767                                   matchStart, matchEnd, frame,
1768                                   *edit_script);
1769     if (obj != NULL) {
1770         *edit_script = NULL;
1771     }
1772     return obj;
1773 }
1774 
1775 
1776 /** A callback used when performing SmithWaterman alignments:
1777  * Calculate the traceback for one alignment by performing an x-drop
1778  * alignment in the forward direction, possibly increasing the x-drop
1779  * parameter until the desired score is attained.
1780  *
1781  * The start, end and score of the alignment should be obtained
1782  * using the Smith-Waterman algorithm before this routine is called.
1783  *
1784  * @param *pnewAlign       the new alignment
1785  * @param *pqueryEnd       on entry, the end of the alignment in the
1786  *                         query, as computed by the Smith-Waterman
1787  *                         algorithm.  On exit, the end as computed by
1788  *                         the x-drop algorithm
1789  * @param *pmatchEnd       like as *pqueryEnd, but for the subject
1790  *                         sequence
1791  * @param queryStart       the starting point in the query
1792  * @param matchStart       the starting point in the subject
1793  * @param score            the score of the alignment, as computed by
1794  *                         the Smith-Waterman algorithm
1795  * @param query            query sequence data
1796  * @param query_range      range of this query in the concatenated
1797  *                         query
1798  * @param ccat_query_length   total length of the concatenated query
1799  * @param subject          subject sequence data
1800  * @param subject_range    range of subject_data in the translated
1801  *                         query, in amino acid coordinates
1802  * @param full_subject_length   length of the full subject sequence
1803  * @param gapping_params        parameters used to compute gapped
1804  *                              alignments
1805  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1806  *
1807  * @returns 0   (posts a fatal error if it fails)
1808  * @sa new_xdrop_align_type
1809  */
1810 static int
s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,Int4 * pqueryEnd,Int4 * pmatchEnd,Int4 queryStart,Int4 matchStart,Int4 score,BlastCompo_SequenceData * query,BlastCompo_SequenceRange * query_range,Int4 ccat_query_length,BlastCompo_SequenceData * subject,BlastCompo_SequenceRange * subject_range,Int4 full_subject_length,BlastCompo_GappingParams * gapping_params,EMatrixAdjustRule matrix_adjust_rule)1811 s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
1812                          Int4 * pqueryEnd, Int4 *pmatchEnd,
1813                          Int4 queryStart, Int4 matchStart, Int4 score,
1814                          BlastCompo_SequenceData * query,
1815                          BlastCompo_SequenceRange * query_range,
1816                          Int4 ccat_query_length,
1817                          BlastCompo_SequenceData * subject,
1818                          BlastCompo_SequenceRange * subject_range,
1819                          Int4 full_subject_length,
1820                          BlastCompo_GappingParams * gapping_params,
1821                          EMatrixAdjustRule matrix_adjust_rule)
1822 {
1823     Int4 newScore;
1824     /* Extent of the alignment as computed by an x-drop alignment
1825      * (usually the same as (queryEnd - queryStart) and (matchEnd -
1826      * matchStart)) */
1827     Int4 queryExtent, matchExtent;
1828     BlastCompo_Alignment * obj = NULL;  /* the new object */
1829     /* BLAST-specific parameters needed compute an X-drop alignment */
1830     BlastKappa_GappingParamsContext * context = gapping_params->context;
1831     /* Auxiliarly structure for computing gapped alignments */
1832     BlastGapAlignStruct * gap_align = context->gap_align;
1833     /* Scoring parameters for gapped alignments */
1834     const BlastScoringParameters* scoringParams = context->scoringParams;
1835     /* A structure containing the traceback of a gapped alignment */
1836     GapEditScript* editScript = NULL;
1837 
1838     /* suppress unused parameter warnings; this is a callback
1839        function, so these parameter cannot be deleted */
1840     (void) ccat_query_length;
1841     (void) full_subject_length;
1842 
1843     gap_align->gap_x_dropoff = gapping_params->x_dropoff;
1844 
1845     s_SWFindFinalEndsUsingXdrop(query,   queryStart, *pqueryEnd,
1846                                 subject, matchStart, *pmatchEnd,
1847                                 gap_align, scoringParams,
1848                                 score, &queryExtent, &matchExtent,
1849                                 &newScore);
1850     *pqueryEnd = queryStart + queryExtent;
1851     *pmatchEnd = matchStart + matchExtent;
1852 
1853     editScript =
1854         Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback,
1855                                              gap_align->fwd_prelim_tback);
1856     if (editScript != NULL) {
1857         /* Shifted values of the endpoints */
1858         Int4 aqueryStart =  queryStart + query_range->begin;
1859         Int4 aqueryEnd   = *pqueryEnd  + query_range->begin;
1860         Int4 amatchStart =  matchStart + subject_range->begin;
1861         Int4 amatchEnd   = *pmatchEnd  + subject_range->begin;
1862 
1863         obj = BlastCompo_AlignmentNew(newScore, matrix_adjust_rule,
1864                                       aqueryStart, aqueryEnd,
1865                                       query_range->context,
1866                                       amatchStart, amatchEnd,
1867                                       subject_range->context, editScript);
1868         if (obj == NULL) {
1869             GapEditScriptDelete(editScript);
1870         }
1871     }
1872     *pnewAlign = obj;
1873 
1874     return obj != NULL ? 0 : -1;
1875 }
1876 
1877 
1878 /**
1879  * A callback: calculate the traceback for one alignment by
1880  * performing an x-drop alignment in both directions
1881  *
1882  * @param in_align         the existing alignment, without traceback
1883  * @param matrix_adjust_rule    the rule used to compute the scoring matrix
1884  * @param query_data       query sequence data
1885  * @param query_range      range of this query in the concatenated
1886  *                         query
1887  * @param ccat_query_length   total length of the concatenated query
1888  * @param subject_data     subject sequence data
1889  * @param subject_range    range of subject_data in the translated
1890  *                         query, in amino acid coordinates
1891  * @param full_subject_length   length of the full subject sequence
1892  * @param gapping_params        parameters used to compute gapped
1893  *                              alignments
1894  * @sa redo_one_alignment_type
1895  */
1896 static BlastCompo_Alignment *
s_RedoOneAlignment(BlastCompo_Alignment * in_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceData * query_data,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceData * subject_data,BlastCompo_SequenceRange * subject_range,int full_subject_length,BlastCompo_GappingParams * gapping_params)1897 s_RedoOneAlignment(BlastCompo_Alignment * in_align,
1898                    EMatrixAdjustRule matrix_adjust_rule,
1899                    BlastCompo_SequenceData * query_data,
1900                    BlastCompo_SequenceRange * query_range,
1901                    int ccat_query_length,
1902                    BlastCompo_SequenceData * subject_data,
1903                    BlastCompo_SequenceRange * subject_range,
1904                    int full_subject_length,
1905                    BlastCompo_GappingParams * gapping_params)
1906 {
1907     int status;                /* return code */
1908     Int4 q_start, s_start;     /* starting point in query and subject */
1909     /* BLAST-specific parameters needed to compute a gapped alignment */
1910     BlastKappa_GappingParamsContext * context = gapping_params->context;
1911     /* Auxiliary structure for computing gapped alignments */
1912     BlastGapAlignStruct* gapAlign = context->gap_align;
1913     /* The preliminary gapped HSP that were are recomputing */
1914     BlastHSP * hsp = in_align->context;
1915     Boolean fence_hit = FALSE;
1916 
1917     /* suppress unused parameter warnings; this is a callback
1918        function, so these parameter cannot be deleted */
1919     (void) ccat_query_length;
1920     (void) full_subject_length;
1921 
1922     /* Use the starting point supplied by the HSP. */
1923     q_start = hsp->query.gapped_start - query_range->begin;
1924     s_start = hsp->subject.gapped_start - subject_range->begin;
1925 
1926     gapAlign->gap_x_dropoff = gapping_params->x_dropoff;
1927 
1928     /*
1929      * Previously, last argument was NULL which could cause problems for
1930      * tblastn.
1931      */
1932     status =
1933         BLAST_GappedAlignmentWithTraceback(context->prog_number,
1934                                            query_data->data,
1935                                            subject_data->data, gapAlign,
1936                                            context->scoringParams,
1937                                            q_start, s_start,
1938                                            query_data->length,
1939                                            subject_data->length,
1940                                            &fence_hit);
1941     if (status == 0) {
1942         return s_NewAlignmentFromGapAlign(gapAlign, &gapAlign->edit_script,
1943                                           query_range, subject_range,
1944                                           matrix_adjust_rule);
1945     } else {
1946         return NULL;
1947     }
1948 }
1949 
1950 
1951 /**
1952  * A BlastKappa_SavedParameters holds the value of certain search
1953  * parameters on entry to RedoAlignmentCore.  These values are
1954  * restored on exit.
1955  */
1956 typedef struct BlastKappa_SavedParameters {
1957     Int4          gap_open;    /**< a penalty for the existence of a gap */
1958     Int4          gapExtend;   /**< a penalty for each residue in the
1959                                     gap */
1960     double        scale_factor;     /**< the original scale factor */
1961     Int4 **origMatrix;              /**< The original matrix values */
1962     double original_expect_value;   /**< expect value on entry */
1963     /** copy of the original gapped Karlin-Altschul block
1964      * corresponding to the first context */
1965     Blast_KarlinBlk** kbp_gap_orig;
1966     Int4             num_queries;   /**< Number of queries in this search */
1967 } BlastKappa_SavedParameters;
1968 
1969 
1970 /**
1971  * Release the data associated with a BlastKappa_SavedParameters and
1972  * delete the object
1973  * @param searchParams the object to be deleted [in][out]
1974  */
1975 static void
s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)1976 s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)
1977 {
1978     /* for convenience, remove one level of indirection from searchParams */
1979     BlastKappa_SavedParameters *sp = *searchParams;
1980 
1981     if (sp != NULL) {
1982         if (sp->kbp_gap_orig != NULL) {
1983             int i;
1984             for (i = 0;  i < sp->num_queries;  i++) {
1985                 if (sp->kbp_gap_orig[i] != NULL)
1986                     Blast_KarlinBlkFree(sp->kbp_gap_orig[i]);
1987             }
1988             free(sp->kbp_gap_orig);
1989         }
1990         if (sp->origMatrix != NULL)
1991             Nlm_Int4MatrixFree(&sp->origMatrix);
1992     }
1993     sfree(*searchParams);
1994     *searchParams = NULL;
1995 }
1996 
1997 
1998 /**
1999  * Create a new instance of BlastKappa_SavedParameters
2000  *
2001  * @param rows               number of rows in the scoring matrix
2002  * @param numQueries         number of queries in this search
2003  * @param compo_adjust_mode  if >0, use composition-based statistics
2004  * @param positionBased      if true, the search is position-based
2005  */
2006 static BlastKappa_SavedParameters *
s_SavedParametersNew(Int4 rows,Int4 numQueries,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)2007 s_SavedParametersNew(Int4 rows,
2008                      Int4 numQueries,
2009                      ECompoAdjustModes compo_adjust_mode,
2010                      Boolean positionBased)
2011 {
2012     int i;
2013     BlastKappa_SavedParameters *sp;   /* the new object */
2014     sp = malloc(sizeof(BlastKappa_SavedParameters));
2015 
2016     if (sp == NULL) {
2017         goto error_return;
2018     }
2019     sp->kbp_gap_orig       = NULL;
2020     sp->origMatrix         = NULL;
2021 
2022     sp->kbp_gap_orig = calloc(numQueries, sizeof(Blast_KarlinBlk*));
2023     if (sp->kbp_gap_orig == NULL) {
2024         goto error_return;
2025     }
2026     sp->num_queries = numQueries;
2027     for (i = 0;  i < numQueries;  i++) {
2028         sp->kbp_gap_orig[i] = NULL;
2029     }
2030     if (compo_adjust_mode != eNoCompositionBasedStats) {
2031         if (positionBased) {
2032             sp->origMatrix = Nlm_Int4MatrixNew(rows, BLASTAA_SIZE);
2033         } else {
2034             sp->origMatrix = Nlm_Int4MatrixNew(BLASTAA_SIZE, BLASTAA_SIZE);
2035         }
2036         if (sp->origMatrix == NULL)
2037             goto error_return;
2038     }
2039     return sp;
2040 error_return:
2041     s_SavedParametersFree(&sp);
2042     return NULL;
2043 }
2044 
2045 
2046 /**
2047  * Record the initial value of the search parameters that are to be
2048  * adjusted.
2049  *
2050  * @param searchParams       holds the recorded values [out]
2051  * @param sbp                a score block [in]
2052  * @param scoring            gapped alignment parameters [in]
2053  * @param query_length       length of the concatenated query [in]
2054  * @param compo_adjust_mode  composition adjustment mode [in]
2055  * @param positionBased     is this search position-based [in]
2056  */
2057 static int
s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,BlastScoreBlk * sbp,const BlastScoringParameters * scoring,int query_length,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)2058 s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,
2059                       BlastScoreBlk* sbp,
2060                       const BlastScoringParameters* scoring,
2061                       int query_length,
2062                       ECompoAdjustModes compo_adjust_mode,
2063                       Boolean positionBased)
2064 {
2065     int i;
2066 
2067     searchParams->gap_open     = scoring->gap_open;
2068     searchParams->gapExtend    = scoring->gap_extend;
2069     searchParams->scale_factor = scoring->scale_factor;
2070 
2071     for (i = 0;  i < searchParams->num_queries;  i++) {
2072         if (sbp->kbp_gap[i] != NULL) {
2073             /* There is a kbp_gap for query i and it must be copied */
2074             searchParams->kbp_gap_orig[i] = Blast_KarlinBlkNew();
2075             if (searchParams->kbp_gap_orig[i] == NULL) {
2076                 return -1;
2077             }
2078             Blast_KarlinBlkCopy(searchParams->kbp_gap_orig[i],
2079                                 sbp->kbp_gap[i]);
2080         }
2081     }
2082 
2083     if (compo_adjust_mode != eNoCompositionBasedStats) {
2084         Int4 **matrix;              /* scoring matrix */
2085         int j;                      /* iteration index */
2086         int rows;                   /* number of rows in matrix */
2087         if (positionBased) {
2088             matrix = sbp->psi_matrix->pssm->data;
2089             rows = query_length;
2090         } else {
2091             matrix = sbp->matrix->data;
2092             rows = BLASTAA_SIZE;
2093         }
2094 
2095         for (i = 0;  i < rows;  i++) {
2096             for (j = 0;  j < BLASTAA_SIZE;  j++) {
2097                 searchParams->origMatrix[i][j] = matrix[i][j];
2098             }
2099         }
2100     }
2101     return 0;
2102 }
2103 
2104 
2105 /**
2106  * Rescale the search parameters in the search object and options
2107  * object to obtain more precision.
2108  *
2109  * @param sbp               score block to be rescaled
2110  * @param sp                scoring parameters to be rescaled
2111  * @param num_queries       number of queries in this search
2112  * @param scale_factor      amount by which to scale this search
2113  */
2114 static void
s_RescaleSearch(BlastScoreBlk * sbp,BlastScoringParameters * sp,int num_queries,double scale_factor)2115 s_RescaleSearch(BlastScoreBlk* sbp,
2116                 BlastScoringParameters* sp,
2117                 int num_queries,
2118                 double scale_factor)
2119 {
2120     int i;
2121     for (i = 0;  i < num_queries;  i++) {
2122         if (sbp->kbp_gap[i] != NULL) {
2123             Blast_KarlinBlk * kbp = sbp->kbp_gap[i];
2124             kbp->Lambda /= scale_factor;
2125             kbp->logK = log(kbp->K);
2126         }
2127     }
2128 
2129     sp->gap_open = BLAST_Nint(sp->gap_open  * scale_factor);
2130     sp->gap_extend = BLAST_Nint(sp->gap_extend * scale_factor);
2131     sp->scale_factor = scale_factor;
2132 }
2133 
2134 
2135 /**
2136  * Restore the parameters that were adjusted to their original values.
2137  *
2138  * @param sbp                the score block to be restored
2139  * @param scoring            the scoring parameters to be restored
2140  * @param searchParams       the initial recorded values of the parameters
2141  * @param query_length       the concatenated query length
2142  * @param positionBased      is this search position-based
2143  * @param compo_adjust_mode  mode of composition adjustment
2144  */
2145 static void
s_RestoreSearch(BlastScoreBlk * sbp,BlastScoringParameters * scoring,const BlastKappa_SavedParameters * searchParams,int query_length,Boolean positionBased,ECompoAdjustModes compo_adjust_mode)2146 s_RestoreSearch(BlastScoreBlk* sbp,
2147                 BlastScoringParameters* scoring,
2148                 const BlastKappa_SavedParameters * searchParams,
2149                 int query_length,
2150                 Boolean positionBased,
2151                 ECompoAdjustModes compo_adjust_mode)
2152 {
2153     int i;
2154 
2155     scoring->gap_open = searchParams->gap_open;
2156     scoring->gap_extend = searchParams->gapExtend;
2157     scoring->scale_factor = searchParams->scale_factor;
2158 
2159     for (i = 0;  i < searchParams->num_queries;  i++) {
2160         if (sbp->kbp_gap[i] != NULL) {
2161             Blast_KarlinBlkCopy(sbp->kbp_gap[i],
2162                                 searchParams->kbp_gap_orig[i]);
2163         }
2164     }
2165     if(compo_adjust_mode != eNoCompositionBasedStats) {
2166         int  j;             /* iteration index */
2167         Int4 ** matrix;     /* matrix to be restored */
2168         int rows;           /* number of rows in the matrix */
2169 
2170         if (positionBased) {
2171             matrix = sbp->psi_matrix->pssm->data;
2172             rows = query_length;
2173         } else {
2174             matrix = sbp->matrix->data;
2175             rows = BLASTAA_SIZE;
2176         }
2177         for (i = 0;  i < rows;  i++) {
2178             for (j = 0;  j < BLASTAA_SIZE;  j++) {
2179                 matrix[i][j] = searchParams->origMatrix[i][j];
2180             }
2181         }
2182     }
2183 }
2184 
2185 
2186 /**
2187  * Initialize an object of type Blast_MatrixInfo.
2188  *
2189  * @param self            object being initialized
2190  * @param queryBlk        the query sequence data
2191  * @param sbp             score block for this search
2192  * @param scale_factor    amount by which ungapped parameters should be
2193  *                        scaled
2194  * @param matrixName      name of the matrix
2195  */
2196 static int
s_MatrixInfoInit(Blast_MatrixInfo * self,BLAST_SequenceBlk * queryBlk,BlastScoreBlk * sbp,double scale_factor,const char * matrixName)2197 s_MatrixInfoInit(Blast_MatrixInfo * self,
2198                  BLAST_SequenceBlk* queryBlk,
2199                  BlastScoreBlk* sbp,
2200                  double scale_factor,
2201                  const char * matrixName)
2202 {
2203     int status = 0;    /* return status */
2204     int lenName;       /* length of matrixName as a string */
2205 
2206     /* copy the matrix name (strdup is not standard C) */
2207     lenName = strlen(matrixName);
2208     if (NULL == (self->matrixName = malloc(lenName + 1))) {
2209         return -1;
2210     }
2211     memcpy(self->matrixName, matrixName, lenName + 1);
2212 
2213     if (self->positionBased) {
2214         status = s_GetPosBasedStartFreqRatios(self->startFreqRatios,
2215                                               queryBlk->length,
2216                                               queryBlk->sequence,
2217                                               matrixName,
2218                                               sbp->psi_matrix->freq_ratios);
2219         if (status == 0) {
2220             status = s_ScalePosMatrix(self->startMatrix, matrixName,
2221                                       sbp->psi_matrix->freq_ratios,
2222                                       queryBlk->sequence,
2223                                       queryBlk->length, sbp, scale_factor);
2224             self->ungappedLambda = sbp->kbp_psi[0]->Lambda / scale_factor;
2225         }
2226     } else {
2227         self->ungappedLambda = sbp->kbp_ideal->Lambda / scale_factor;
2228         status = s_GetStartFreqRatios(self->startFreqRatios, matrixName);
2229         if (status == 0) {
2230             Blast_Int4MatrixFromFreq(self->startMatrix, self->cols,
2231                                      self->startFreqRatios,
2232                                      self->ungappedLambda);
2233         }
2234     }
2235     return status;
2236 }
2237 
2238 
2239 /* Create an array of 8-mers for a sequence, such that index of each 8-mer
2240    is the same as its position in the query */
2241 static int
s_CreateWordArray(const Uint1 * seq_data,Int4 seq_len,Uint8 ** words)2242 s_CreateWordArray(const Uint1* seq_data, Int4 seq_len, Uint8** words)
2243 {
2244     int word_size = 8;         /* word size for k-mer matching */
2245     Uint8* query_hashes;       /* list of hashes for query words */
2246     Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
2247                                                     values */
2248     int i;
2249 
2250     /* if query or subject length is smaller than word size, exit */
2251     if (!seq_data || !words || seq_len < word_size) {
2252         return -1;
2253     }
2254 
2255     query_hashes = (Uint8*)calloc((seq_len - word_size + 1),
2256                                   sizeof(Uint8));
2257     *words = query_hashes;
2258 
2259     if (!query_hashes) {
2260         return -1;
2261     }
2262 
2263 
2264     /* find query word hashes */
2265     query_hashes[0] = s_GetHash(&seq_data[0], word_size);
2266     for (i = 1; i < seq_len - word_size; i++) {
2267         query_hashes[i] = query_hashes[i - 1];
2268         query_hashes[i] <<= 5;
2269         query_hashes[i] &= mask;
2270         query_hashes[i] += (Uint8)seq_data[i + word_size - 1];
2271     }
2272 
2273     return 0;
2274 }
2275 
2276 
s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo ** query_info,int num_queries)2277 static void s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo** query_info,
2278                                             int num_queries)
2279 {
2280     int i;
2281 
2282     if (!query_info) {
2283         return;
2284     }
2285 
2286     for (i = 0;i < num_queries;i++) {
2287         if ((*query_info)[i].words) {
2288             free((*query_info)[i].words);
2289         }
2290     }
2291 
2292     free(*query_info);
2293     *query_info = NULL;
2294 }
2295 
2296 /**
2297  * Save information about all queries in an array of objects of type
2298  * BlastCompo_QueryInfo.
2299  *
2300  * @param query_data        query sequence data
2301  * @param blast_query_info  information about all queries, as an
2302  *                          internal blast data structure
2303  *
2304  * @return the new array on success, or NULL on error
2305  */
2306 static BlastCompo_QueryInfo *
s_GetQueryInfo(Uint1 * query_data,const BlastQueryInfo * blast_query_info,Boolean skip)2307 s_GetQueryInfo(Uint1 * query_data, const BlastQueryInfo * blast_query_info, Boolean skip)
2308 {
2309     int i;                   /* loop index */
2310     BlastCompo_QueryInfo *
2311         compo_query_info;    /* the new array */
2312     int num_queries;         /* the number of queries/elements in
2313                                 compo_query_info */
2314 
2315     num_queries = blast_query_info->last_context + 1;
2316     compo_query_info = calloc(num_queries, sizeof(BlastCompo_QueryInfo));
2317     if (compo_query_info != NULL) {
2318         for (i = 0;  i < num_queries;  i++) {
2319             BlastCompo_QueryInfo * query_info = &compo_query_info[i];
2320             const BlastContextInfo * query_context = &blast_query_info->contexts[i];
2321 
2322             query_info->eff_search_space =
2323                 (double) query_context->eff_searchsp;
2324             query_info->origin = query_context->query_offset;
2325             query_info->seq.data = &query_data[query_info->origin];
2326             query_info->seq.length = query_context->query_length;
2327             query_info->words = NULL;
2328 
2329             s_CreateWordArray(query_info->seq.data, query_info->seq.length,
2330                               &query_info->words);
2331             if (! skip) {
2332                 Blast_ReadAaComposition(&query_info->composition, BLASTAA_SIZE,
2333                                         query_info->seq.data,
2334                                         query_info->seq.length);
2335             }
2336         }
2337     }
2338     return compo_query_info;
2339 }
2340 
2341 
2342 /**
2343  * Create a new object of type BlastCompo_GappingParams.  The new
2344  * object contains the parameters needed by the composition adjustment
2345  * library to compute a gapped alignment.
2346  *
2347  * @param context     the data structures needed by callback functions
2348  *                    that perform the gapped alignments.
2349  * @param extendParams parameters used for a gapped extension
2350  * @param num_queries  the number of queries in the concatenated query
2351  */
2352 static BlastCompo_GappingParams *
s_GappingParamsNew(BlastKappa_GappingParamsContext * context,const BlastExtensionParameters * extendParams,int num_queries)2353 s_GappingParamsNew(BlastKappa_GappingParamsContext * context,
2354                    const BlastExtensionParameters* extendParams,
2355                    int num_queries)
2356 {
2357     int i;
2358     double min_lambda = DBL_MAX;   /* smallest gapped Lambda */
2359     const BlastScoringParameters * scoring = context->scoringParams;
2360     const BlastExtensionOptions * options = extendParams->options;
2361     /* The new object */
2362     BlastCompo_GappingParams * gapping_params = NULL;
2363 
2364     gapping_params = malloc(sizeof(BlastCompo_GappingParams));
2365     if (gapping_params == NULL)
2366 	return NULL;
2367 
2368     gapping_params->gap_open = scoring->gap_open;
2369     gapping_params->gap_extend = scoring->gap_extend;
2370     gapping_params->context = context;
2371 
2372     for (i = 0;  i < num_queries;  i++) {
2373         if (context->sbp->kbp_gap[i] != NULL &&
2374             context->sbp->kbp_gap[i]->Lambda < min_lambda) {
2375             min_lambda = context->sbp->kbp_gap[i]->Lambda;
2376         }
2377     }
2378     gapping_params->x_dropoff = (Int4)
2379         MAX(options->gap_x_dropoff_final*NCBIMATH_LN2 / min_lambda,
2380             extendParams->gap_x_dropoff_final);
2381     context->gap_align->gap_x_dropoff = gapping_params->x_dropoff;
2382 
2383     return gapping_params;
2384 }
2385 
2386 
2387 /** Callbacks used by the Blast_RedoOneMatch* routines */
2388 static const Blast_RedoAlignCallbacks
2389 redo_align_callbacks = {
2390     s_CalcLambda, s_SequenceGetRange, s_RedoOneAlignment,
2391     s_NewAlignmentUsingXdrop, s_FreeEditScript
2392 };
2393 
2394 
2395 /* Bit score per alignment position threshold for preliminaru near identical
2396    test */
2397 #define NEAR_IDENTICAL_BITS_PER_POSITION (1.74)
2398 
2399 /**
2400  * Read the parameters required for the Blast_RedoOneMatch* functions from
2401  * the corresponding parameters in standard BLAST datatypes.  Return a new
2402  * object representing these parameters.
2403  */
2404 static Blast_RedoAlignParams *
s_GetAlignParams(BlastKappa_GappingParamsContext * context,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,const BlastHitSavingParameters * hitParams,const BlastExtensionParameters * extendParams)2405 s_GetAlignParams(BlastKappa_GappingParamsContext * context,
2406                  BLAST_SequenceBlk * queryBlk,
2407                  const BlastQueryInfo* queryInfo,
2408                  const BlastHitSavingParameters* hitParams,
2409                  const BlastExtensionParameters* extendParams)
2410 {
2411     int status = 0;    /* status code */
2412     int rows;          /* number of rows in the scoring matrix */
2413     int cutoff_s;      /* cutoff score for saving an alignment */
2414     double cutoff_e;   /* cutoff evalue for saving an alignment */
2415     BlastCompo_GappingParams *
2416         gapping_params = NULL;    /* parameters needed to compute a gapped
2417                                      alignment */
2418     Blast_MatrixInfo *
2419         scaledMatrixInfo;         /* information about the scoring matrix */
2420     /* does this kind of search translate the database sequence */
2421     int subject_is_translated = (context->prog_number == eBlastTypeTblastn) || (context->prog_number == eBlastTypeRpsTblastn);
2422     int query_is_translated   = context->prog_number == eBlastTypeBlastx;
2423     /* is this a positiion-based search */
2424     Boolean positionBased = (Boolean) (context->sbp->psi_matrix != NULL);
2425     /* will BLAST_LinkHsps be called to assign e-values */
2426     Boolean do_link_hsps = (hitParams->do_sum_stats);
2427     ECompoAdjustModes compo_adjust_mode =
2428         (ECompoAdjustModes) extendParams->options->compositionBasedStats;
2429 
2430     /* per position bit score cutoff for testing whether sequences are
2431        near identical */
2432     double near_identical_cutoff_bits = NEAR_IDENTICAL_BITS_PER_POSITION;
2433 
2434     /* score block is already scaled by context->localScalingFactor */
2435     double near_identical_cutoff=0;
2436     Int4 index;
2437     for (index = queryInfo->first_context;
2438                      index <= queryInfo->last_context; ++index) {
2439 
2440         if ((queryInfo->contexts[index].is_valid)) {
2441     		near_identical_cutoff =
2442        		 (near_identical_cutoff_bits * NCBIMATH_LN2)
2443         		/ context->sbp->kbp_gap[index]->Lambda;
2444 		break;
2445 	}
2446     }
2447 
2448     if (do_link_hsps) {
2449         ASSERT(hitParams->link_hsp_params != NULL);
2450         cutoff_s =
2451             (int) (hitParams->cutoff_score_min * context->localScalingFactor);
2452     } else {
2453         /* There is no cutoff score; we consider e-values instead */
2454         cutoff_s = 1;
2455     }
2456     cutoff_e = hitParams->options->expect_value;
2457     rows = positionBased ? queryInfo->max_length : BLASTAA_SIZE;
2458     scaledMatrixInfo = Blast_MatrixInfoNew(rows, BLASTAA_SIZE, positionBased);
2459     status = s_MatrixInfoInit(scaledMatrixInfo, queryBlk, context->sbp,
2460                               context->localScalingFactor,
2461                               context->scoringParams->options->matrix);
2462     if (status != 0) {
2463         return NULL;
2464     }
2465     gapping_params = s_GappingParamsNew(context, extendParams,
2466                                         queryInfo->last_context + 1);
2467     if (gapping_params == NULL) {
2468         return NULL;
2469     } else {
2470         return
2471             Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
2472                                      compo_adjust_mode, positionBased,
2473                                      query_is_translated,
2474                                      subject_is_translated,
2475                                      queryInfo->max_length, cutoff_s, cutoff_e,
2476                                      do_link_hsps, &redo_align_callbacks,
2477                                      near_identical_cutoff);
2478     }
2479 }
2480 
2481 
2482 /**
2483  * Convert an array of BlastCompo_Heap objects to a BlastHSPResults structure.
2484  *
2485  * @param results        BLAST core external results structure (pre-SeqAlign)
2486  *                       [out]
2487  * @param heaps          an array of BlastCompo_Heap objects
2488  * @param hitlist_size   size of each list in the results structure above [in]
2489  */
2490 static void
s_FillResultsFromCompoHeaps(BlastHSPResults * results,BlastCompo_Heap heaps[],Int4 hitlist_size)2491 s_FillResultsFromCompoHeaps(BlastHSPResults * results,
2492                             BlastCompo_Heap heaps[],
2493                             Int4 hitlist_size)
2494 {
2495     int query_index;   /* loop index */
2496     int num_queries;   /* Number of queries in this search */
2497 
2498     num_queries = results->num_queries;
2499     for (query_index = 0;  query_index < num_queries;  query_index++) {
2500         BlastHSPList* hsp_list;
2501         BlastHitList* hitlist;
2502         BlastCompo_Heap * heap = &heaps[query_index];
2503 
2504         results->hitlist_array[query_index] = Blast_HitListNew(hitlist_size);
2505         hitlist = results->hitlist_array[query_index];
2506 
2507         while (NULL != (hsp_list = BlastCompo_HeapPop(heap))) {
2508             Blast_HitListUpdate(hitlist, hsp_list);
2509         }
2510     }
2511     Blast_HSPResultsReverseOrder(results);
2512 }
2513 
2514 
2515 /** Remove all matches from a BlastCompo_Heap. */
s_ClearHeap(BlastCompo_Heap * self)2516 static void s_ClearHeap(BlastCompo_Heap * self)
2517 {
2518     BlastHSPList* hsp_list = NULL;   /* an element of the heap */
2519 
2520     while (NULL != (hsp_list = BlastCompo_HeapPop(self))) {
2521         hsp_list = Blast_HSPListFree(hsp_list);
2522     }
2523 }
2524 
2525 /**
2526  * Free a BlastGapAlignStruct copy created by s_BlastGapAlignStruct_Copy
2527  *
2528  * @param copy Pointer to BlastGapAlignStruct to be freed
2529  */
s_BlastGapAlignStruct_Free(BlastGapAlignStruct * copy)2530 static void s_BlastGapAlignStruct_Free(BlastGapAlignStruct* copy)
2531 {
2532     {
2533         while (copy->state_struct != NULL) {
2534             GapStateArrayStruct* cur = copy->state_struct;
2535             copy->state_struct = copy->state_struct->next;
2536             if (cur->state_array) {
2537                 sfree(cur->state_array);
2538             }
2539             if (cur) {
2540                 sfree(cur);
2541             }
2542         }
2543     }
2544     {
2545         if (copy->edit_script != NULL) {
2546             if (copy->edit_script->op_type) {
2547                 sfree(copy->edit_script->op_type);
2548             }
2549             if (copy->edit_script->num) {
2550                 sfree(copy->edit_script->num);
2551             }
2552             sfree(copy->edit_script);
2553         }
2554     }
2555     {
2556         if (copy->fwd_prelim_tback != NULL) {
2557             if (copy->fwd_prelim_tback->edit_ops) {
2558                 sfree(copy->fwd_prelim_tback->edit_ops);
2559             }
2560             sfree(copy->fwd_prelim_tback);
2561         }
2562     }
2563     {
2564         if (copy->rev_prelim_tback != NULL) {
2565             if (copy->rev_prelim_tback->edit_ops) {
2566                 sfree(copy->rev_prelim_tback->edit_ops);
2567             }
2568             sfree(copy->rev_prelim_tback);
2569         }
2570     }
2571     {
2572         if (copy->greedy_align_mem != NULL) {
2573             sfree(copy->greedy_align_mem);
2574         }
2575     }
2576     {
2577         if (copy->dp_mem != NULL) {
2578             sfree(copy->dp_mem);
2579         }
2580     }
2581     {
2582         if (copy->sbp != NULL) {
2583             sfree(copy->sbp);
2584         }
2585     }
2586     sfree(copy);
2587 }
2588 
2589 /**
2590  * Create a "deep" copy of a BlastGapAlignStruct structure.
2591  *
2592  * Non-pointer structure members are copied.  Pointers to data which will
2593  * only be read are copied.  For data which will be changing, memory for copies
2594  * will be allocated and new pointers will be assigned to them.  The process
2595  * repeats down the structure hierarchy until all pointers are dealt with.
2596  *
2597  * @param orig Pointer to BlastGapAlignStruct structure to be copied
2598  * @param sbp  Pointer to BlastScoreBlk structure, required to set copy->sbp
2599  *
2600  * @return Pointer to copy of original BlastGapAlignStruct structure
2601  */
s_BlastGapAlignStruct_Copy(BlastGapAlignStruct * orig,BlastScoreBlk * sbp)2602 static BlastGapAlignStruct* s_BlastGapAlignStruct_Copy(
2603         BlastGapAlignStruct* orig,
2604         BlastScoreBlk* sbp
2605 )
2606 {
2607     BlastGapAlignStruct* copy =
2608             (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
2609 
2610     // Copy plain old data (ints, doubles, booleans, ...).
2611     // Any pointer members will be processed separately.
2612     memcpy(copy, orig, sizeof(BlastGapAlignStruct));
2613 
2614     {
2615         GapStateArrayStruct* o = orig->state_struct;
2616         if (o != NULL) {
2617             GapStateArrayStruct* c = (GapStateArrayStruct*) calloc(
2618                     1,
2619                     sizeof(GapStateArrayStruct)
2620             );
2621             copy->state_struct = c;
2622             memcpy(c, o, sizeof(GapStateArrayStruct));
2623             c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2624             int i;
2625             for (i = 0; i < c->length; ++i) {
2626                 c->state_array[i] = o->state_array[i];
2627             }
2628             while (o->next != NULL) {
2629                 c->next = (GapStateArrayStruct*)
2630                         calloc(1, sizeof(GapStateArrayStruct));
2631                 c = c->next;
2632                 o = o->next;
2633                 memcpy(c, o, sizeof(GapStateArrayStruct));
2634                 c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2635                 int i;
2636                 for (i = 0; i < c->length; ++i) {
2637                     c->state_array[i] = o->state_array[i];
2638                 }
2639             }
2640         }
2641     }
2642     {
2643         GapEditScript* o = orig->edit_script;
2644         if (o != NULL) {
2645             GapEditScript* c = (GapEditScript*) calloc(
2646                     1,
2647                     sizeof(GapEditScript)
2648             );
2649             copy->edit_script = c;
2650             memcpy(c, o, sizeof(GapEditScript));
2651             c->op_type = (EGapAlignOpType*) calloc(
2652                     o->size,
2653                     sizeof(EGapAlignOpType)
2654             );
2655             c->num = (Int4*) calloc(o->size, sizeof(Int4));
2656             int i;
2657             for (i = 0; i < o->size; ++i) {
2658                 c->op_type[i] = o->op_type[i];
2659                 c->num[i] = o->num[i];
2660             }
2661         }
2662     }
2663     {
2664         GapPrelimEditBlock* o = orig->fwd_prelim_tback;
2665         if (o != NULL) {
2666             GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2667                     1,
2668                     sizeof(GapPrelimEditBlock)
2669             );
2670             copy->fwd_prelim_tback = c;
2671             memcpy(c, o, sizeof(GapPrelimEditBlock));
2672             c->edit_ops = calloc(
2673                     o->num_ops_allocated,
2674                     sizeof(GapPrelimEditScript)
2675             );
2676             int i;
2677             for (i = 0; i < o->num_ops_allocated; ++i) {
2678                 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2679                 c->edit_ops[i].num = o->edit_ops[i].num;
2680             }
2681         }
2682     }
2683     {
2684         GapPrelimEditBlock* o = orig->rev_prelim_tback;
2685         if (o != NULL) {
2686             GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2687                     1,
2688                     sizeof(GapPrelimEditBlock)
2689             );
2690             copy->rev_prelim_tback = c;
2691             memcpy(c, o, sizeof(GapPrelimEditBlock));
2692             c->edit_ops = calloc(
2693                     o->num_ops_allocated,
2694                     sizeof(GapPrelimEditScript)
2695             );
2696             int i;
2697             for (i = 0; i < o->num_ops_allocated; ++i) {
2698                 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2699                 c->edit_ops[i].num = o->edit_ops[i].num;
2700             }
2701         }
2702     }
2703     {
2704         SGreedyAlignMem* o = orig->greedy_align_mem;
2705         if (o != NULL) {
2706             SGreedyAlignMem* c = (SGreedyAlignMem*) calloc(
2707                     1,
2708                     sizeof(SGreedyAlignMem)
2709             );
2710             copy->greedy_align_mem = c;
2711             memcpy(c, o, sizeof(SGreedyAlignMem));
2712         }
2713     }
2714     {
2715         BlastGapDP* o = orig->dp_mem;
2716         if (o != NULL) {
2717             BlastGapDP* c = (BlastGapDP*) calloc(
2718                     orig->dp_mem_alloc,
2719                     sizeof(BlastGapDP)
2720             );
2721             copy->dp_mem = c;
2722             memcpy(c, o, orig->dp_mem_alloc * sizeof(BlastGapDP));
2723         }
2724     }
2725     {
2726         copy->sbp = sbp;
2727     }
2728 
2729     return copy;
2730 }
2731 
2732 /**
2733  * Free a BlastScoreBlk copy created by s_BlastScoreBlk_Copy
2734  *
2735  * BlastScoreBlk* pointer "bsb_ptr" should be passed as (&bsb_ptr);
2736  * this function will set bsb_ptr to NULL before returning.
2737  *
2738  * @param copy Pointer to (pointer to BlastScoreBlk to be freed)
2739  */
2740 static
s_BlastScoreBlk_Free(BlastScoreBlk ** copy)2741 void s_BlastScoreBlk_Free(BlastScoreBlk** copy)
2742 {
2743     BlastScoreBlkFree(*copy);
2744     *copy = NULL;
2745 }
2746 
2747 /**
2748  * Create a "deep" copy of a BlastScoreBlk structure.
2749  *
2750  * Non-pointer structure members are copied.  Pointers to data which will
2751  * only be read are copied.  For data which will be changing, memory for copies
2752  * will be allocated and new pointers will be assigned to them.  The process
2753  * repeats down the structure hierarchy until all pointers are dealt with.
2754  *
2755  * @param program            The program type
2756  * @param orig               Pointer to BlastScoreBlk structure to be copied
2757  * @param alphabet_code      Alphabet code
2758  * @param number_of_contexts Number of contexts
2759  *
2760  * @return Pointer to copy of original BlastScoreBlk structure
2761  */
2762 static
s_BlastScoreBlk_Copy(EBlastProgramType program,BlastScoreBlk * orig,Uint1 alphabet_code,Int4 number_of_contexts)2763 BlastScoreBlk* s_BlastScoreBlk_Copy(
2764         EBlastProgramType program,
2765         BlastScoreBlk* orig,
2766         Uint1 alphabet_code,
2767         Int4 number_of_contexts
2768 )
2769 {
2770     BlastScoreBlk* copy = BlastScoreBlkNew(
2771             orig->alphabet_code,
2772             orig->number_of_contexts
2773     );
2774     if (copy == NULL) {
2775         return NULL;
2776     }
2777 
2778     copy->alphabet_start = orig->alphabet_start;
2779     copy->name = strdup(orig->name);
2780     copy->comments = orig->comments;
2781     /* Deep-copy orig->matrix */
2782     if (orig->matrix != NULL) {
2783         if (copy->matrix == NULL) {
2784             return BlastScoreBlkFree(copy);
2785         }
2786         SBlastScoreMatrix* m = copy->matrix;
2787         if (m->data != NULL  &&  orig->matrix->data != NULL) {
2788             int i;
2789             for (i = 0; i < orig->matrix->ncols; ++i) {
2790                 memcpy(
2791                         m->data[i],
2792                         orig->matrix->data[i],
2793                         m->nrows * sizeof(int)
2794                 );
2795             }
2796         }
2797         if (m->freqs != NULL  &&  orig->matrix->freqs != NULL) {
2798             memcpy(
2799                     m->freqs,
2800                     orig->matrix->freqs,
2801                     m->ncols * sizeof(double)
2802             );
2803         }
2804         m->lambda = orig->matrix->lambda;
2805     }
2806     /* Deep-copy orig->psi_matrix */
2807     if (orig->psi_matrix != NULL
2808             &&  orig->psi_matrix->pssm != NULL) {
2809         copy->psi_matrix = SPsiBlastScoreMatrixNew(orig->psi_matrix->pssm->ncols);
2810         if (copy->psi_matrix == NULL) {
2811             return BlastScoreBlkFree(copy);
2812         }
2813         SPsiBlastScoreMatrix* pm = copy->psi_matrix;
2814         SBlastScoreMatrix* m = pm->pssm;
2815         if (m->data != NULL  &&  orig->psi_matrix->pssm->data != NULL) {
2816             int i;
2817             for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2818                 memcpy(
2819                         m->data[i],
2820                         orig->psi_matrix->pssm->data[i],
2821                         m->nrows * sizeof(int)
2822                 );
2823             }
2824         }
2825         if (m->freqs != NULL
2826                 &&  orig->psi_matrix->pssm->freqs != NULL) {
2827             memcpy(
2828                     m->freqs,
2829                     orig->psi_matrix->pssm->freqs,
2830                     m->ncols * sizeof(double)
2831             );
2832         }
2833         m->lambda = orig->psi_matrix->pssm->lambda;
2834         if (pm->freq_ratios != NULL
2835                 &&  orig->psi_matrix->freq_ratios != NULL) {
2836             int i;
2837             for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2838                 memcpy(
2839                         pm->freq_ratios[i],
2840                         orig->psi_matrix->freq_ratios[i],
2841                         orig->psi_matrix->pssm->nrows * sizeof(double)
2842                 );
2843             }
2844         }
2845         if (orig->psi_matrix->kbp != NULL) {
2846             memcpy(pm->kbp, orig->psi_matrix->kbp, sizeof(Blast_KarlinBlk));
2847         }
2848     }
2849     copy->matrix_only_scoring = orig->matrix_only_scoring;
2850     copy->complexity_adjusted_scoring = orig->complexity_adjusted_scoring;
2851     copy->loscore = orig->loscore;
2852     copy->hiscore = orig->hiscore;
2853     copy->penalty = orig->penalty;
2854     copy->reward = orig->reward;
2855     copy->read_in_matrix = orig->read_in_matrix;
2856     if (Blast_QueryIsPssm(program)) {
2857         copy->kbp     = copy->kbp_psi;
2858         copy->kbp_gap = copy->kbp_gap_psi;
2859     } else {
2860         copy->kbp     = copy->kbp_std;
2861         copy->kbp_gap = copy->kbp_gap_std;
2862     }
2863     if (orig->gbp != NULL) {
2864         memcpy(copy->gbp, orig->gbp, sizeof(Blast_GumbelBlk));
2865     }
2866     int ctx;
2867     for (ctx = 0; ctx < orig->number_of_contexts; ++ctx) {
2868         if (orig->sfp != NULL  &&  orig->sfp[ctx] != NULL) {
2869             copy->sfp[ctx] = Blast_ScoreFreqNew(
2870                     orig->sfp[ctx]->score_min,
2871                     orig->sfp[ctx]->score_max
2872             );
2873             if (copy->sfp[ctx] == NULL) {
2874                 return BlastScoreBlkFree(copy);
2875             }
2876             copy->sfp[ctx]->obs_min = orig->sfp[ctx]->obs_min;
2877             copy->sfp[ctx]->obs_max = orig->sfp[ctx]->obs_max;
2878             copy->sfp[ctx]->score_avg = orig->sfp[ctx]->score_avg;
2879             int r = orig->sfp[ctx]->score_max - orig->sfp[ctx]->score_min + 1;
2880             memcpy(
2881                     copy->sfp[ctx]->sprob0,
2882                     orig->sfp[ctx]->sprob0,
2883                     r * sizeof(double)
2884             );
2885         }
2886         if (orig->kbp_std != NULL  &&  orig->kbp_std[ctx] != NULL) {
2887             copy->kbp_std[ctx] = Blast_KarlinBlkNew();
2888             if (Blast_KarlinBlkCopy(copy->kbp_std[ctx], orig->kbp_std[ctx]) != 0) {
2889                 return BlastScoreBlkFree(copy);
2890             }
2891         }
2892         if (orig->kbp_gap_std != NULL  &&  orig->kbp_gap_std[ctx] != NULL) {
2893             copy->kbp_gap_std[ctx] = Blast_KarlinBlkNew();
2894             if (Blast_KarlinBlkCopy(copy->kbp_gap_std[ctx], orig->kbp_gap_std[ctx]) != 0) {
2895                 return BlastScoreBlkFree(copy);
2896             }
2897         }
2898         if (orig->kbp_psi != NULL  &&  orig->kbp_psi[ctx] != NULL) {
2899             copy->kbp_psi[ctx] = Blast_KarlinBlkNew();
2900             if (Blast_KarlinBlkCopy(copy->kbp_psi[ctx], orig->kbp_psi[ctx]) != 0) {
2901                 return BlastScoreBlkFree(copy);
2902             }
2903         }
2904         if (orig->kbp_gap_psi != NULL  &&  orig->kbp_gap_psi[ctx] != NULL) {
2905             copy->kbp_gap_psi[ctx] = Blast_KarlinBlkNew();
2906             if (Blast_KarlinBlkCopy(copy->kbp_gap_psi[ctx], orig->kbp_gap_psi[ctx]) != 0) {
2907                 return BlastScoreBlkFree(copy);
2908             }
2909         }
2910         if (Blast_QueryIsPssm(program)) {
2911             copy->kbp[ctx]     = copy->kbp_psi[ctx];
2912             copy->kbp_gap[ctx] = copy->kbp_gap_psi[ctx];
2913         } else {
2914             copy->kbp[ctx]     = copy->kbp_std[ctx];
2915             copy->kbp_gap[ctx] = copy->kbp_gap_std[ctx];
2916         }
2917     }
2918     if (orig->kbp_ideal != NULL) {
2919         copy->kbp_ideal = Blast_KarlinBlkNew();
2920         if (Blast_KarlinBlkCopy(copy->kbp_ideal, orig->kbp_ideal) != 0) {
2921             return BlastScoreBlkFree(copy);
2922         }
2923     }
2924     copy->ambiguous_res = (Uint1*) calloc(orig->ambig_size, sizeof(Uint1));
2925     if (orig->ambiguous_res != NULL) {
2926         memcpy(copy->ambiguous_res, orig->ambiguous_res, orig->ambig_size);
2927     }
2928     copy->ambig_size = orig->ambig_size;
2929     copy->ambig_occupy = orig->ambig_occupy;
2930     copy->round_down = orig->round_down;
2931 
2932     return copy;
2933 }
2934 
2935 /**
2936  *  Recompute alignments for each match found by the gapped BLAST
2937  *  algorithm.  Single-thread adapter to Blast_RedoAlignmentCore_MT.
2938  */
2939 Int2
Blast_RedoAlignmentCore(EBlastProgramType program_number,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2940 Blast_RedoAlignmentCore(EBlastProgramType program_number,
2941                         BLAST_SequenceBlk * queryBlk,
2942                         const BlastQueryInfo* queryInfo,
2943                         BlastScoreBlk* sbp,
2944                         BLAST_SequenceBlk * subjectBlk,
2945                         const BlastSeqSrc* seqSrc,
2946                         Int4 default_db_genetic_code,
2947                         BlastHSPList * thisMatch,
2948                         BlastHSPStream* hsp_stream,
2949                         BlastScoringParameters* scoringParams,
2950                         const BlastExtensionParameters* extendParams,
2951                         const BlastHitSavingParameters* hitParams,
2952                         const PSIBlastOptions* psiOptions,
2953                         BlastHSPResults* results)
2954 {
2955     return Blast_RedoAlignmentCore_MT(
2956             program_number,
2957             1,                  /* number of threads */
2958             queryBlk,
2959             queryInfo,
2960             sbp,
2961             subjectBlk,
2962             seqSrc,
2963             default_db_genetic_code,
2964             thisMatch,
2965             hsp_stream,
2966             scoringParams,
2967             extendParams,
2968             hitParams,
2969             psiOptions,
2970             results
2971     );
2972 }
2973 
2974 /**
2975  *  Recompute alignments for each match found by the gapped BLAST
2976  *  algorithm.
2977  */
2978 Int2
Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,Uint4 num_threads,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2979 Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,
2980                            Uint4 num_threads,
2981                            BLAST_SequenceBlk * queryBlk,
2982                            const BlastQueryInfo* queryInfo,
2983                            BlastScoreBlk* sbp,
2984                            BLAST_SequenceBlk * subjectBlk,
2985                            const BlastSeqSrc* seqSrc,
2986                            Int4 default_db_genetic_code,
2987                            BlastHSPList * thisMatch,
2988                            BlastHSPStream* hsp_stream,
2989                            BlastScoringParameters* scoringParams,
2990                            const BlastExtensionParameters* extendParams,
2991                            const BlastHitSavingParameters* hitParams,
2992                            const PSIBlastOptions* psiOptions,
2993                            BlastHSPResults* results)
2994 {
2995     int status_code = 0;                    /* return value code */
2996     /* the factor by which to scale the scoring system in order to
2997      * obtain greater precision */
2998     double localScalingFactor;
2999     /* forbidden ranges for each database position (used in
3000      * Smith-Waterman alignments) */
3001     Blast_ForbiddenRanges forbidden = {0,};
3002     /* a collection of alignments for each query sequence with
3003      * sequences from the database */
3004     BlastCompo_Heap* redoneMatches = NULL;
3005     /* stores all fields needed for computing a compositionally
3006      * adjusted score matrix using Newton's method */
3007     Blast_CompositionWorkspace** NRrecord_tld = NULL;
3008     /* loop index */
3009     int query_index;
3010     /* number of queries in the concatenated query */
3011     int numQueries = queryInfo->num_queries;
3012     /* number of contexts in the concatenated query */
3013     int numContexts = queryInfo->last_context + 1;
3014     /* number of contexts within a query */
3015     int numFrames = (program_number == eBlastTypeBlastx) ? 6:1;
3016     /* keeps track of gapped alignment params */
3017     BlastGapAlignStruct* gapAlign = NULL;
3018     /* the values of the search parameters that will be recorded, altered
3019      * in the search structure in this routine, and then restored before
3020      * the routine exits. */
3021     BlastKappa_SavedParameters *savedParams = NULL;
3022     /* All alignments above this value will be reported, no matter how many. */
3023     double inclusion_ethresh;
3024 
3025     BlastHSPResults* local_results = NULL;
3026 
3027     BlastCompo_QueryInfo** query_info_tld = NULL;
3028     int* numContexts_tld = NULL;
3029     int* compositionTestIndex_tld = NULL;
3030     Blast_RedoAlignParams** redo_align_params_tld = NULL;
3031     BLAST_SequenceBlk** subjectBlk_tld = NULL;
3032     Boolean positionBased = (Boolean) (sbp->psi_matrix != NULL);
3033     ECompoAdjustModes compo_adjust_mode =
3034         (ECompoAdjustModes) extendParams->options->compositionBasedStats;
3035     Boolean smithWaterman =
3036         (Boolean) (extendParams->options->eTbackExt == eSmithWatermanTbck);
3037 
3038     /* which test function do we use to see if a composition-adjusted
3039        p-value is desired; value needs to be passed in eventually*/
3040     int compositionTestIndex = extendParams->options->unifiedP;
3041     Uint1* genetic_code_string = GenCodeSingletonFind(default_db_genetic_code);
3042 
3043     ASSERT(program_number == eBlastTypeBlastp   ||
3044            program_number == eBlastTypeTblastn  ||
3045            program_number == eBlastTypeBlastx   ||
3046            program_number == eBlastTypePsiBlast ||
3047            program_number == eBlastTypeRpsBlast ||
3048            program_number == eBlastTypeRpsTblastn);
3049 
3050     if (0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") &&
3051         compo_adjust_mode == eNoCompositionBasedStats) {
3052         return -1;                   /* BLOSUM62_20 only makes sense if
3053                                       * compo_adjust_mode is on */
3054     }
3055     if (positionBased) {
3056         /* Position based searches can only use traditional
3057          * composition based stats */
3058         if ((int) compo_adjust_mode > 1) {
3059             compo_adjust_mode = eCompositionBasedStats;
3060         }
3061         /* A position-based search can only have one query */
3062         ASSERT(queryInfo->num_queries == 1);
3063         ASSERT(queryBlk->length == (Int4)sbp->psi_matrix->pssm->ncols);
3064     }
3065 
3066     if ((int) compo_adjust_mode > 1 &&
3067         !Blast_FrequencyDataIsAvailable(scoringParams->options->matrix)) {
3068         return -1;   /* Unsupported matrix */
3069     }
3070     /*****************/
3071     inclusion_ethresh = (psiOptions /* this can be NULL for CBl2Seq */
3072                          ? psiOptions->inclusion_ethresh
3073                          : PSI_INCLUSION_ETHRESH);
3074     ASSERT(inclusion_ethresh != 0.0);
3075 
3076     int actual_num_threads = 1;
3077 #ifdef _OPENMP
3078     actual_num_threads = num_threads;
3079 #endif
3080 
3081     /* Initialize savedParams */
3082     savedParams =
3083         s_SavedParametersNew(queryInfo->max_length, numContexts,
3084                              compo_adjust_mode, positionBased);
3085     if (savedParams == NULL) {
3086         status_code = -1;
3087         goto function_cleanup;
3088     }
3089     status_code =
3090         s_RecordInitialSearch(savedParams, sbp, scoringParams,
3091                               queryInfo->max_length, compo_adjust_mode,
3092                               positionBased);
3093     if (status_code != 0) {
3094         goto function_cleanup;
3095     }
3096 
3097     if (compo_adjust_mode != eNoCompositionBasedStats) {
3098         if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) {
3099             localScalingFactor = SCALING_FACTOR / 10;
3100         } else {
3101             localScalingFactor = SCALING_FACTOR;
3102         }
3103     } else {
3104         localScalingFactor = 1.0;
3105     }
3106     s_RescaleSearch(sbp, scoringParams, numContexts, localScalingFactor);
3107     status_code =
3108         BLAST_GapAlignStructNew(scoringParams, extendParams,
3109                                 (seqSrc) ?  BlastSeqSrcGetMaxSeqLen(seqSrc)
3110                                          :  subjectBlk->length,
3111                                 sbp, &gapAlign);
3112     if (status_code != 0) {
3113         return (Int2) status_code;
3114     }
3115 
3116     if(smithWaterman) {
3117         status_code =
3118             Blast_ForbiddenRangesInitialize(&forbidden, queryInfo->max_length);
3119         if (status_code != 0) {
3120             goto function_cleanup;
3121         }
3122     }
3123     redoneMatches = calloc(numQueries, sizeof(BlastCompo_Heap));
3124     if (redoneMatches == NULL) {
3125         status_code = -1;
3126         goto function_cleanup;
3127     }
3128     for (query_index = 0;  query_index < numQueries;  query_index++) {
3129         status_code =
3130             BlastCompo_HeapInitialize(&redoneMatches[query_index],
3131                                       hitParams->options->hitlist_size,
3132                                       inclusion_ethresh);
3133         if (status_code != 0) {
3134             goto function_cleanup;
3135         }
3136     }
3137 
3138     BlastCompo_Heap** redoneMatches_tld =
3139             (BlastCompo_Heap**) calloc(
3140                     actual_num_threads,
3141                     sizeof(BlastCompo_Heap*)
3142             );
3143     BlastCompo_Alignment*** alignments_tld =
3144             (BlastCompo_Alignment***) calloc(
3145                     actual_num_threads,
3146                     sizeof(BlastCompo_Alignment**)
3147             );
3148     BlastCompo_Alignment*** incoming_align_set_tld =
3149             (BlastCompo_Alignment***) calloc(
3150                     actual_num_threads,
3151                     sizeof(BlastCompo_Alignment**)
3152             );
3153     BlastKappa_SavedParameters** savedParams_tld =
3154             (BlastKappa_SavedParameters**) calloc(
3155                     actual_num_threads,
3156                     sizeof(BlastKappa_SavedParameters*)
3157             );
3158     BlastScoreBlk** sbp_tld =
3159             (BlastScoreBlk**) calloc(
3160                     actual_num_threads,
3161                     sizeof(BlastScoreBlk*)
3162             );
3163     BlastKappa_GappingParamsContext* gapping_params_context_tld =
3164             (BlastKappa_GappingParamsContext*) calloc(
3165                     actual_num_threads,
3166                     sizeof(BlastKappa_GappingParamsContext)
3167             );
3168     Int4*** matrix_tld =
3169             (Int4***) calloc(
3170                     actual_num_threads,
3171                     sizeof(Int4**)
3172             );
3173 
3174     NRrecord_tld =
3175             (Blast_CompositionWorkspace**) calloc(
3176                     actual_num_threads,
3177                     sizeof(Blast_CompositionWorkspace*)
3178             );
3179 
3180     subjectBlk_tld =
3181             (BLAST_SequenceBlk**) calloc(
3182                     actual_num_threads,
3183                     sizeof(BLAST_SequenceBlk*)
3184             );
3185     redo_align_params_tld =
3186             (Blast_RedoAlignParams**) calloc(
3187                     actual_num_threads,
3188                     sizeof(Blast_RedoAlignParams*)
3189             );
3190     int* status_code_tld =
3191             (int*) calloc(
3192                     actual_num_threads,
3193                     sizeof(int)
3194             );
3195     BlastSeqSrc** seqsrc_tld =
3196             (BlastSeqSrc**) calloc(
3197                     actual_num_threads,
3198                     sizeof(BlastSeqSrc*)
3199             );
3200     BlastGapAlignStruct** gap_align_tld =
3201             (BlastGapAlignStruct**) calloc(
3202                     actual_num_threads,
3203                     sizeof(BlastGapAlignStruct*)
3204             );
3205     BlastScoringParameters** score_params_tld =
3206             (BlastScoringParameters**) calloc(
3207                     actual_num_threads,
3208                     sizeof(BlastScoringParameters*)
3209             );
3210     BlastHitSavingParameters** hit_params_tld =
3211             (BlastHitSavingParameters**) calloc(
3212                     actual_num_threads,
3213                     sizeof(BlastHitSavingParameters*)
3214             );
3215     BlastHSPResults** results_tld =
3216             (BlastHSPResults**) calloc(
3217                     actual_num_threads,
3218                     sizeof(BlastHSPResults*)
3219             );
3220     query_info_tld =
3221             (BlastCompo_QueryInfo**) calloc(
3222                     actual_num_threads,
3223                     sizeof(BlastCompo_QueryInfo*)
3224             );
3225     numContexts_tld =
3226             (int*) calloc(
3227                     actual_num_threads,
3228                     sizeof(int)
3229             );
3230     compositionTestIndex_tld =
3231             (int*) calloc(
3232                     actual_num_threads,
3233                     sizeof(int)
3234             );
3235 
3236     int i;
3237     for (i = 0; i < actual_num_threads; ++i) {
3238         query_info_tld[i] = s_GetQueryInfo(
3239                 queryBlk->sequence,
3240                 queryInfo,
3241                 (program_number == eBlastTypeBlastx)
3242         );
3243         if (query_info_tld[i] == NULL) {
3244             status_code = -1;
3245             goto function_cleanup;
3246         }
3247 
3248         sbp_tld[i] = s_BlastScoreBlk_Copy(
3249                 program_number,
3250                 sbp,
3251                 sbp->alphabet_code,
3252                 sbp->number_of_contexts
3253         );
3254 
3255         numContexts_tld[i]          = numContexts;
3256         compositionTestIndex_tld[i] = compositionTestIndex;
3257         seqsrc_tld[i]               = BlastSeqSrcCopy(seqSrc);
3258         gap_align_tld[i]            =
3259                 s_BlastGapAlignStruct_Copy(gapAlign, sbp_tld[i]);
3260         score_params_tld[i]         = scoringParams;
3261         hit_params_tld[i]           = (BlastHitSavingParameters*) hitParams;
3262         results_tld[i]              =
3263                 Blast_HSPResultsNew(queryInfo->num_queries);
3264         subjectBlk_tld[i]           = subjectBlk;
3265 
3266         redoneMatches_tld[i] =
3267                 (BlastCompo_Heap*) calloc(numQueries, sizeof(BlastCompo_Heap));
3268         if (redoneMatches_tld[i] == NULL) {
3269             status_code = -1;
3270             goto function_cleanup;
3271         }
3272         for (query_index = 0; query_index < numQueries; query_index++) {
3273             status_code =
3274                 BlastCompo_HeapInitialize(&redoneMatches_tld[i][query_index],
3275                                           hitParams->options->hitlist_size,
3276                                           inclusion_ethresh);
3277             if (status_code != 0) {
3278                 goto function_cleanup;
3279             }
3280         }
3281 
3282         alignments_tld[i] = (BlastCompo_Alignment**) calloc(
3283                 numContexts,
3284                 sizeof(BlastCompo_Alignment*)
3285         );
3286         incoming_align_set_tld[i] = (BlastCompo_Alignment**) calloc(
3287                 numFrames,
3288                 sizeof(BlastCompo_Alignment*)
3289         );
3290 
3291         savedParams_tld[i] = s_SavedParametersNew(
3292                 queryInfo->max_length,
3293                 numContexts,
3294                 compo_adjust_mode,
3295                 positionBased
3296         );
3297         if (savedParams_tld[i] == NULL) {
3298             status_code = -1;
3299             goto function_cleanup;
3300         }
3301         status_code = s_RecordInitialSearch(
3302                 savedParams_tld[i],
3303                 sbp,
3304                 scoringParams,
3305                 queryInfo->max_length,
3306                 compo_adjust_mode,
3307                 positionBased
3308         );
3309         if (status_code != 0) {
3310             goto function_cleanup;
3311         }
3312 
3313         if ((int) compo_adjust_mode > 1 && !positionBased) {
3314             NRrecord_tld[i] = Blast_CompositionWorkspaceNew();
3315             status_code = Blast_CompositionWorkspaceInit(
3316                     NRrecord_tld[i],
3317                     scoringParams->options->matrix
3318             );
3319             if (status_code != 0) {
3320                 goto function_cleanup;
3321             }
3322         }
3323 
3324         gapping_params_context_tld[i].gap_align = gap_align_tld[i];
3325         gapping_params_context_tld[i].scoringParams = score_params_tld[i];
3326         gapping_params_context_tld[i].sbp = sbp_tld[i];
3327         gapping_params_context_tld[i].localScalingFactor = localScalingFactor;
3328         gapping_params_context_tld[i].prog_number = program_number;
3329 
3330         redo_align_params_tld[i] =
3331             s_GetAlignParams(
3332                     &gapping_params_context_tld[i],
3333                     queryBlk,
3334                     queryInfo,
3335                     hitParams,
3336                     extendParams
3337             );
3338         if (redo_align_params_tld[i] == NULL) {
3339             status_code = -1;
3340             goto function_cleanup;
3341         }
3342 
3343         if (positionBased) {
3344             matrix_tld[i] = sbp_tld[i]->psi_matrix->pssm->data;
3345         } else {
3346             matrix_tld[i] = sbp_tld[i]->matrix->data;
3347         }
3348         /**** Validate parameters *************/
3349         if (matrix_tld[i] == NULL) {
3350             goto function_cleanup;
3351         }
3352     }
3353 
3354     /*
3355      * There are two use cases here.
3356      * (1) hsp_stream == NULL, so single match is passed in thisMatch.
3357      *     Also, seqSrc == NULL and subjectBlk are != NULL.
3358      * (2) hsp_stream != NULL, so one or more matches are taken from
3359      *     hsp_stream, and thisMatch is (probably) NULL.
3360      *     Also, seqSrc != NULL, subjectBlk and thisMatch are == NULL.
3361      */
3362     struct BlastHSPListLinkedList {
3363         BlastHSPList* match;
3364         struct BlastHSPListLinkedList* next;
3365     };
3366     typedef struct BlastHSPListLinkedList BlastHSPListLinkedList;
3367 
3368     BlastHSPList** theseMatches = NULL;
3369     int numMatches = 0;
3370     if (hsp_stream == NULL) {
3371         theseMatches = (BlastHSPList**) calloc(1, sizeof(BlastHSPList*));
3372         *theseMatches = thisMatch;
3373         numMatches = 1;
3374     } else {
3375         BlastHSPList* localMatch = NULL;
3376         BlastHSPListLinkedList* head = NULL;
3377         BlastHSPListLinkedList* tail = NULL;
3378         /*
3379          * Collect matches from stream into linked list, counting them
3380          * along the way.
3381          */
3382         while (BlastHSPStreamRead(hsp_stream, &localMatch)
3383                 != kBlastHSPStream_Eof) {
3384             BlastHSPListLinkedList* entry =
3385                     (BlastHSPListLinkedList*) calloc(
3386                             1,
3387                             sizeof(BlastHSPListLinkedList)
3388                     );
3389             entry->match = localMatch;
3390             if (head == NULL) {
3391                 head = entry;
3392             } else {
3393                 tail->next = entry;
3394             }
3395             tail = entry;
3396             ++numMatches;
3397         }
3398         /*
3399          * Convert linked list of matches into array.
3400          */
3401         theseMatches =
3402                 (BlastHSPList**) calloc(numMatches, sizeof(BlastHSPList*));
3403         int i;
3404         for (i = 0; i < numMatches; ++i) {
3405             theseMatches[i] = head->match;
3406             BlastHSPListLinkedList* here = head;
3407             head = head->next;
3408             sfree(here);
3409         }
3410     }
3411 
3412     Boolean interrupt = FALSE;
3413 #pragma omp parallel \
3414     default(none) num_threads(actual_num_threads) \
3415     if(actual_num_threads>1) \
3416     shared(interrupt, seqsrc_tld, score_params_tld, hit_params_tld, \
3417     gap_align_tld, results_tld, \
3418     redoneMatches_tld, \
3419     STDERR_COMMA \
3420     numQueries, numMatches, theseMatches, \
3421     numFrames, program_number, subjectBlk_tld, positionBased, \
3422     default_db_genetic_code, localScalingFactor, queryInfo, \
3423     sbp, smithWaterman, compositionTestIndex_tld, forbidden, \
3424     NRrecord_tld, actual_num_threads, sbp_tld, \
3425     matrix_tld, query_info_tld, numContexts_tld, \
3426     genetic_code_string, queryBlk, compo_adjust_mode, \
3427     alignments_tld, incoming_align_set_tld, savedParams_tld, \
3428     scoringParams, redo_align_params_tld, \
3429     status_code_tld)
3430     {
3431         int b;
3432 #pragma omp for schedule(static)
3433         for (b = 0; b < numMatches; ++b) {
3434 #pragma omp flush(interrupt)
3435             if (!interrupt) {
3436                 BlastCompo_Alignment** alignments = NULL;
3437                 BlastCompo_Alignment** incoming_align_set = NULL;
3438                 Blast_CompositionWorkspace* NRrecord = NULL;
3439                 BlastCompo_QueryInfo* query_info = NULL;
3440 
3441                 int numAligns[6];
3442                 Blast_KarlinBlk* kbp = NULL;
3443                 BlastCompo_MatchingSequence matchingSeq = {0,};
3444                 BlastHSPList* hsp_list = NULL;
3445                 BlastCompo_Alignment* incoming_aligns = NULL;
3446                 Blast_RedoAlignParams* redo_align_params;
3447                 double best_evalue;
3448                 Int4 best_score;
3449                 int query_index;
3450                 int context_index;
3451                 int frame_index;
3452                 void* discarded_aligns = NULL;
3453                 BlastSeqSrc* seqSrc;
3454                 BlastScoringParameters* scoringParams;
3455                 BlastHitSavingParameters* hitParams;
3456                 BlastCompo_Heap* redoneMatches;
3457                 BlastScoreBlk* sbp;
3458                 BLAST_SequenceBlk* subjectBlk;
3459                 int numContexts;
3460                 int compositionTestIndex;
3461                 /* existing alignments for a match */
3462                 Int4** matrix;                   /* score matrix */
3463                 int* pStatusCode;
3464 
3465                 double pvalueForThisPair = (-1); /* p-value for this match
3466                                                     for composition; -1 == no adjustment*/
3467                 double LambdaRatio; /*lambda ratio*/
3468 
3469                 int tid = 0;
3470 #ifdef _OPENMP
3471                 if(actual_num_threads > 1) {
3472                 	tid = omp_get_thread_num();
3473                 }
3474 #endif
3475                 seqSrc               = seqsrc_tld[tid];
3476                 scoringParams        = score_params_tld[tid];
3477                 hitParams            = hit_params_tld[tid];
3478                 redoneMatches        = redoneMatches_tld[tid];
3479                 alignments           = alignments_tld[tid];
3480                 incoming_align_set   = incoming_align_set_tld[tid];
3481                 NRrecord             = NRrecord_tld[tid];
3482                 sbp                  = sbp_tld[tid];
3483                 redo_align_params    = redo_align_params_tld[tid];
3484                 matrix               = matrix_tld[tid];
3485                 pStatusCode          = &status_code_tld[tid];
3486                 query_info           = query_info_tld[tid];
3487                 numContexts          = numContexts_tld[tid];
3488                 compositionTestIndex = compositionTestIndex_tld[tid];
3489                 subjectBlk           = subjectBlk_tld[tid];
3490 
3491                 BlastHSPList* localMatch = theseMatches[b];
3492 
3493                 if (localMatch->hsp_array == NULL) {
3494                     if (seqSrc) {
3495                         continue;
3496                     }
3497                     if(actual_num_threads > 1) {
3498 #pragma omp critical(intrpt)
3499                     	interrupt = TRUE;
3500 #pragma omp flush(interrupt)
3501                     	continue;
3502                     }
3503                 }
3504 
3505                 if (BlastCompo_EarlyTermination(
3506                         localMatch->best_evalue,
3507                         redoneMatches,
3508                         numQueries
3509                 )) {
3510                     Blast_HSPListFree(localMatch);
3511                     if (seqSrc) {
3512                         continue;
3513                     }
3514                     if(actual_num_threads > 1) {
3515 #pragma omp critical(intrpt)
3516                     	interrupt = TRUE;
3517 #pragma omp flush(interrupt)
3518                     	continue;
3519                     }
3520                 }
3521 
3522                 query_index = localMatch->query_index;
3523                 context_index = query_index * numFrames;
3524                 BlastSeqSrcSetRangesArg * ranges = NULL;
3525                 /* Get the sequence for this match */
3526                 if (seqSrc && BlastSeqSrcGetSupportsPartialFetching(seqSrc)) {
3527                     ranges = BLAST_SetupPartialFetching(
3528                             program_number,
3529                             (BlastSeqSrc*) seqSrc,
3530                             (const BlastHSPList**)&localMatch,
3531                             1
3532                     );
3533                 }
3534 
3535                 if (subjectBlk) {
3536                     matchingSeq.length = subjectBlk->length;
3537                     matchingSeq.index = -1;
3538                     matchingSeq.local_data = subjectBlk;
3539                 } else {
3540                     *pStatusCode = s_MatchingSequenceInitialize(
3541                             &matchingSeq,
3542                             program_number,
3543                             seqSrc,
3544                             default_db_genetic_code,
3545                             localMatch->oid,
3546                             ranges
3547                     );
3548                     if (*pStatusCode != 0) {
3549                         /*
3550                          * some sequences may have been excluded by membit filtering
3551                          * so this is not really an exception
3552                          */
3553                         *pStatusCode = 0;
3554                         goto match_loop_cleanup;
3555                     }
3556                 }
3557                 *pStatusCode = s_ResultHspToDistinctAlign(
3558                         incoming_align_set,     /* o */
3559                         numAligns,              /* o */
3560                         localMatch->hsp_array,  /* i */
3561                         localMatch->hspcnt,     /* i */
3562                         context_index,          /* i */
3563                         queryInfo,              /* i */
3564                         localScalingFactor      /* i */
3565                 );
3566                 if (*pStatusCode != 0) {
3567                     goto match_loop_cleanup;
3568                 }
3569 
3570                 hsp_list = Blast_HSPListNew(0);
3571                 for (frame_index = 0;
3572                         frame_index < numFrames;
3573                         frame_index++, context_index++) {
3574                     incoming_aligns = incoming_align_set[frame_index];
3575                     if (!incoming_aligns) {
3576                         continue;
3577                     }
3578                     /*
3579                      * All alignments in thisMatch should be to the same query
3580                      */
3581                     kbp = sbp->kbp_gap[context_index];
3582                     if (smithWaterman) {
3583                         *pStatusCode =
3584                                 Blast_RedoOneMatchSmithWaterman(
3585                                         alignments,
3586                                         redo_align_params,
3587                                         incoming_aligns,
3588                                         numAligns[frame_index],
3589                                         kbp->Lambda,
3590                                         kbp->logK,
3591                                         &matchingSeq,
3592                                         query_info,
3593                                         numQueries,
3594                                         matrix,
3595                                         BLASTAA_SIZE,
3596                                         NRrecord,
3597                                         &forbidden,
3598                                         redoneMatches,
3599                                         &pvalueForThisPair,
3600                                         compositionTestIndex,
3601                                         &LambdaRatio
3602                                 );
3603                     } else {
3604                         *pStatusCode =
3605                                 Blast_RedoOneMatch(
3606                                         alignments,             // thread-local
3607                                         redo_align_params,      // thread-local
3608                                         incoming_aligns,        // thread-local
3609                                         numAligns[frame_index], // local
3610                                         kbp->Lambda,            // thread-local
3611                                         &matchingSeq,           // thread-local
3612                                         -1,                     // const
3613                                         query_info,             // thread-local
3614                                         numContexts,            // thread-local
3615                                         matrix,                 // thread-local
3616                                         BLASTAA_SIZE,           // const
3617                                         NRrecord,               // thread-local
3618                                         &pvalueForThisPair,     // local
3619                                         compositionTestIndex,   // thread-local
3620                                         &LambdaRatio            // local
3621                                 );
3622                     }
3623 
3624                     if (*pStatusCode != 0) {
3625                         goto match_loop_cleanup;
3626                     }
3627 
3628                     if (alignments[context_index] != NULL) {
3629                         Int2 qframe = frame_index;
3630                         if (program_number == eBlastTypeBlastx) {
3631                             if (qframe < 3) {
3632                                 qframe++;
3633                             } else {
3634                                 qframe = 2 - qframe;
3635                             }
3636                         }
3637                         *pStatusCode =
3638                                 s_HSPListFromDistinctAlignments(hsp_list,
3639                                         &alignments[context_index],
3640                                         matchingSeq.index,
3641                                         queryInfo, qframe);
3642                         if (*pStatusCode) {
3643                             goto match_loop_cleanup;
3644                         }
3645                     }
3646                     BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3647                     incoming_align_set[frame_index] = NULL;
3648                 }
3649 
3650                 if (hsp_list->hspcnt > 1) {
3651                     s_HitlistReapContained(hsp_list->hsp_array,
3652                             &hsp_list->hspcnt);
3653                 }
3654                 *pStatusCode =
3655                         s_HitlistEvaluateAndPurge(&best_score, &best_evalue,
3656                                 hsp_list,
3657                                 seqSrc,
3658                                 matchingSeq.length,
3659                                 program_number,
3660                                 queryInfo, context_index,
3661                                 sbp, hitParams,
3662                                 pvalueForThisPair, LambdaRatio,
3663                                 matchingSeq.index);
3664                 if (*pStatusCode != 0) {
3665                     goto query_loop_cleanup;
3666                 }
3667                 if (best_evalue <= hitParams->options->expect_value) {
3668                     /* The best alignment is significant */
3669                     s_HSPListNormalizeScores(hsp_list, kbp->Lambda, kbp->logK,
3670                             localScalingFactor);
3671                     s_ComputeNumIdentities(
3672                             queryBlk,
3673                             queryInfo,
3674                             subjectBlk,
3675                             seqSrc,
3676                             hsp_list,
3677                             scoringParams->options,
3678                             genetic_code_string,
3679                             sbp,
3680                             ranges
3681                     );
3682                     if (!seqSrc) {
3683                         goto query_loop_cleanup;
3684                     }
3685                     if (BlastCompo_HeapWouldInsert(
3686                             &redoneMatches[query_index],
3687                             best_evalue,
3688                             best_score,
3689                             localMatch->oid
3690                     )) {
3691                         *pStatusCode =
3692                                 BlastCompo_HeapInsert(
3693                                         &redoneMatches[query_index],
3694                                         hsp_list,
3695                                         best_evalue,
3696                                         best_score,
3697                                         localMatch->oid,
3698                                         &discarded_aligns
3699                                 );
3700                         if (*pStatusCode == 0) {
3701                             hsp_list = NULL;
3702                         }
3703                     } else {
3704                         hsp_list = Blast_HSPListFree(hsp_list);
3705                     }
3706 
3707                     if (*pStatusCode) {
3708                         goto query_loop_cleanup;
3709                     }
3710                     if (discarded_aligns != NULL) {
3711                         Blast_HSPListFree(discarded_aligns);
3712                     }
3713                 }
3714 query_loop_cleanup:
3715 match_loop_cleanup:
3716                 if (seqSrc) {
3717                     localMatch = Blast_HSPListFree(localMatch);
3718                 } else {
3719                     Blast_HSPListSwap(localMatch, hsp_list);
3720                     localMatch->oid = hsp_list->oid;
3721                 }
3722                 hsp_list = Blast_HSPListFree(hsp_list);
3723                 ranges = BlastSeqSrcSetRangesArgFree(ranges);
3724 
3725                 if (*pStatusCode != 0) {
3726                     for (context_index = 0;
3727                             context_index < numContexts;
3728                             context_index++) {
3729                         BlastCompo_AlignmentsFree(
3730                                 &alignments[context_index],
3731                                 s_FreeEditScript
3732                         );
3733                     }
3734                 }
3735                 s_MatchingSequenceRelease(&matchingSeq);
3736                 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3737                 if ((actual_num_threads > 1) &&
3738                 	(*pStatusCode != 0 || !seqSrc)) {
3739 #pragma omp critical(intrpt)
3740                     interrupt = TRUE;
3741 #pragma omp flush(interrupt)
3742                     continue;
3743                 }
3744 
3745             } /* end of if(!interrupt) */
3746         }
3747 #pragma omp barrier
3748 
3749         /*
3750          * end of omp parallel section
3751          */
3752     }
3753 
3754 function_cleanup:
3755 
3756     for (i = 0; i < actual_num_threads; ++i) {
3757         if (status_code_tld[i] != 0) {
3758             status_code = status_code_tld[i];
3759         }
3760     }
3761     for (i = 0; i < actual_num_threads; ++i) {
3762         if (seqSrc  &&  status_code == 0) {
3763             s_FillResultsFromCompoHeaps(
3764                     results_tld[i],
3765                     redoneMatches_tld[i],
3766                     hitParams->options->hitlist_size
3767             );
3768             if (redoneMatches_tld[i] != NULL) {
3769                 int qi;
3770                 for (qi = 0; qi < numQueries; ++qi) {
3771                     sfree(redoneMatches_tld[i][qi].array);
3772                     sfree(redoneMatches_tld[i][qi].heapArray);
3773                 }
3774                 s_ClearHeap(redoneMatches_tld[i]);
3775             }
3776         } else {
3777             if (redoneMatches_tld[i] != NULL) {
3778                 int qi;
3779                 for (qi = 0; qi < numQueries; ++qi) {
3780                     sfree(redoneMatches_tld[i][qi].array);
3781                     sfree(redoneMatches_tld[i][qi].heapArray);
3782                 }
3783                 s_ClearHeap(redoneMatches_tld[i]);
3784             }
3785         }
3786         sfree(redoneMatches_tld[i]);
3787     }
3788     if (redoneMatches != NULL) {
3789         int qi;
3790         for (qi = 0; qi < numQueries; ++qi) {
3791             sfree(redoneMatches[qi].array);
3792             sfree(redoneMatches[qi].heapArray);
3793         }
3794         s_ClearHeap(redoneMatches);
3795     }
3796 
3797     if (hsp_stream != NULL) {
3798         /* Reduce results from all threads and continue with business as usual */
3799         SThreadLocalDataArray* thread_data =
3800                 SThreadLocalDataArrayNew(actual_num_threads);
3801         int i;
3802         for (i = 0; i < actual_num_threads; ++i) {
3803             SThreadLocalData* tdi = thread_data->tld[i];
3804             BlastHSPResults* rdi = results_tld[i];
3805             tdi->hit_params = hit_params_tld[i];
3806             hit_params_tld[i] = NULL;
3807             tdi->results =
3808                     (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3809             tdi->results->num_queries = rdi->num_queries;
3810             tdi->results->hitlist_array =
3811                     (BlastHitList**) calloc(
3812                             tdi->results->num_queries,
3813                             sizeof(BlastHitList*)
3814                     );
3815             int j;
3816             for (j = 0; j < tdi->results->num_queries; ++j) {
3817                 tdi->results->hitlist_array[j] = rdi->hitlist_array[j];
3818                 rdi->hitlist_array[j] = NULL;
3819             }
3820         }
3821         local_results = SThreadLocalDataArrayConsolidateResults(thread_data);
3822         ASSERT(local_results);
3823 
3824         /* post-traceback pipes */
3825         BlastHSPStreamTBackClose(hsp_stream, local_results);
3826 
3827         for (i = 0; i < local_results->num_queries; ++i) {
3828             results->hitlist_array[i] = local_results->hitlist_array[i];
3829             local_results->hitlist_array[i] = NULL;
3830         }
3831         for (i = 0; i < actual_num_threads; ++i) {
3832             thread_data->tld[i]->hit_params = NULL;
3833             int j;
3834             for (j = 0; j < local_results->num_queries; ++j) {
3835                 thread_data->tld[i]->results->hitlist_array[j] =
3836                         Blast_HitListFree(
3837                                 thread_data->tld[i]->results->hitlist_array[j]
3838                         );
3839             }
3840             sfree(thread_data->tld[i]->results->hitlist_array);
3841             sfree(thread_data->tld[i]->results);
3842             thread_data->tld[i] = SThreadLocalDataFree(thread_data->tld[i]);
3843         }
3844         sfree(thread_data->tld);
3845         sfree(thread_data);
3846         Blast_HSPResultsFree(local_results);
3847     }
3848 
3849     if (redoneMatches != NULL) {
3850         for (query_index = 0;  query_index < numQueries;  query_index++) {
3851             BlastCompo_HeapRelease(&redoneMatches[query_index]);
3852         }
3853         sfree(redoneMatches);
3854         redoneMatches = NULL;
3855     }
3856     if (smithWaterman) {
3857         Blast_ForbiddenRangesRelease(&forbidden);
3858     }
3859     if (gapAlign != NULL) {
3860         gapAlign = BLAST_GapAlignStructFree(gapAlign);
3861     }
3862     s_RestoreSearch(sbp, scoringParams, savedParams, queryBlk->length,
3863                     positionBased, compo_adjust_mode);
3864     s_SavedParametersFree(&savedParams);
3865 
3866     for (i = 0; i < actual_num_threads; ++i) {
3867         s_BlastScoreBlk_Free(&sbp_tld[i]);
3868         gap_align_tld[i]->sbp = NULL;
3869         s_BlastGapAlignStruct_Free(gap_align_tld[i]);
3870         Blast_RedoAlignParamsFree(&redo_align_params_tld[i]);
3871         sfree(alignments_tld[i]);
3872         sfree(incoming_align_set_tld[i]);
3873         Blast_CompositionWorkspaceFree(&NRrecord_tld[i]);
3874         s_SavedParametersFree(&savedParams_tld[i]);
3875         BlastSeqSrcFree(seqsrc_tld[i]);
3876         results_tld[i] = Blast_HSPResultsFree(results_tld[i]);
3877         s_FreeBlastCompo_QueryInfoArray(&query_info_tld[i], numContexts);
3878     }
3879     sfree(alignments_tld);
3880     sfree(compositionTestIndex_tld);
3881     sfree(gap_align_tld);
3882     sfree(gapping_params_context_tld);
3883     sfree(hit_params_tld);
3884     sfree(incoming_align_set_tld);
3885     sfree(matrix_tld);
3886     sfree(NRrecord_tld);
3887     sfree(numContexts_tld);
3888     sfree(query_info_tld);
3889     sfree(redo_align_params_tld);
3890     sfree(redoneMatches_tld);
3891     sfree(results_tld);
3892     sfree(savedParams_tld);
3893     sfree(sbp_tld);
3894     sfree(score_params_tld);
3895     sfree(seqsrc_tld);
3896     sfree(status_code_tld);
3897     sfree(subjectBlk_tld);
3898     sfree(theseMatches);
3899 
3900     return (Int2) status_code;
3901 }
3902 
3903 
3904 
3905