1 /* $Id: blast_kappa.c 615057 2020-08-26 15:29:10Z fongah2 $
2 * ==========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alejandro Schaffer, Mike Gertz (ported to algo/blast by Tom Madden)
27 *
28 */
29
30 /** @file blast_kappa.c
31 * Utilities for doing Smith-Waterman alignments and adjusting the scoring
32 * system for each match in blastpgp
33 */
34
35 #include <float.h>
36 #include <algo/blast/core/ncbi_math.h>
37 #include <algo/blast/core/blast_hits.h>
38 #include <algo/blast/core/blast_kappa.h>
39 #include <algo/blast/core/blast_util.h>
40 #include <algo/blast/core/blast_gapalign.h>
41 #include <algo/blast/core/blast_filter.h>
42 #include <algo/blast/core/blast_traceback.h>
43 #include <algo/blast/core/link_hsps.h>
44 #include <algo/blast/core/gencode_singleton.h>
45 #include "blast_psi_priv.h"
46 #include "blast_gapalign_priv.h"
47 #include "blast_hits_priv.h"
48 #include "blast_posit.h"
49 #include "blast_hspstream_mt_utils.h"
50 #include "blast_traceback_mt_priv.h"
51
52 #ifdef _OPENMP
53 #include <omp.h>
54
55 # ifdef _WIN32
56 /* stderr expands to (__acrt_iob_func(2)), which won't work in an OpenMP
57 * shared(...) list. */
58 # define STDERR_COMMA
59 # else
60 # define STDERR_COMMA stderr,
61 # endif
62 #endif
63
64 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
65 #include <algo/blast/composition_adjustment/compo_heap.h>
66 #include <algo/blast/composition_adjustment/redo_alignment.h>
67 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
68 #include <algo/blast/composition_adjustment/unified_pvalues.h>
69
70 /* Define KAPPA_PRINT_DIAGNOSTICS to turn on printing of
71 * diagnostic information from some routines. */
72
73 /** Compile-time option; if set to a true value, then blastp runs
74 that use Blast_RedoAlignmentCore to compute the traceback will not
75 SEG the subject sequence */
76 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
77 #define KAPPA_BLASTP_NO_SEG_SEQUENCE 0
78 #endif
79
80
81 /** Compile-time option; if set to a true value, then blastp runs
82 that use Blast_RedoAlignmentCore to compute the traceback will not
83 SEG the subject sequence */
84 #ifndef KAPPA_TBLASTN_NO_SEG_SEQUENCE
85 #define KAPPA_TBLASTN_NO_SEG_SEQUENCE 0
86 #endif
87
88
89 /**
90 * Given a list of HSPs with (possibly) high-precision scores, rescale
91 * the scores to have standard precision and set the scale-independent
92 * bit scores. This routine does *not* resort the list; it is assumed
93 * that the list is already sorted according to e-values that have been
94 * computed using the initial, higher-precision scores.
95 *
96 * @param hsp_list the HSP list
97 * @param logK Karlin-Altschul statistical parameter [in]
98 * @param lambda Karlin-Altschul statistical parameter [in]
99 * @param scoreDivisor the value by which reported scores are to be
100 */
101 static void
s_HSPListNormalizeScores(BlastHSPList * hsp_list,double lambda,double logK,double scoreDivisor)102 s_HSPListNormalizeScores(BlastHSPList * hsp_list,
103 double lambda,
104 double logK,
105 double scoreDivisor)
106 {
107 int hsp_index;
108 for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
109 BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
110
111 hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor);
112 /* Compute the bit score using the newly computed scaled score. */
113 hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
114 }
115 }
116
117
118 /**
119 * Adjusts the E-values in a BLAST_HitList to be composites of
120 * a composition-based P-value and a score/alignment-based P-value
121 *
122 * @param hsp_list the hitlist whose E-values need to be adjusted
123 * @param comp_p_value P-value from sequence composition
124 * @param seqSrc a source of sequence data
125 * @param subject_length length of database sequence
126 * @param query_context info about this query context; needed when
127 * multiple queries are being used
128 * @param LambdaRatio the ratio between the observed value of Lambda
129 * and the predicted value of lambda (used to print
130 * diagnostics)
131 * @param subject_id the subject id of this sequence (used to print
132 * diagnostics)
133 **/
134 static void
s_AdjustEvaluesForComposition(BlastHSPList * hsp_list,double comp_p_value,const BlastSeqSrc * seqSrc,Int4 subject_length,const BlastContextInfo * query_context,double LambdaRatio,int subject_id)135 s_AdjustEvaluesForComposition(
136 BlastHSPList *hsp_list,
137 double comp_p_value,
138 const BlastSeqSrc* seqSrc,
139 Int4 subject_length,
140 const BlastContextInfo * query_context,
141 double LambdaRatio,
142 int subject_id)
143 {
144 /* Smallest observed evalue after adjustment */
145 double best_evalue = DBL_MAX;
146
147 /* True length of the query */
148 int query_length = query_context->query_length;
149 /* Length adjustment to compensate for edge effects */
150 int length_adjustment = query_context->length_adjustment;
151
152 /* Effective lengths of the query, subject, and database */
153 double query_eff = MAX((query_length - length_adjustment), 1);
154 double subject_eff = MAX((subject_length - length_adjustment), 1.0);
155 double dblen_eff = (double) query_context->eff_searchsp / query_eff;
156
157 /* Scale factor to convert the database E-value to the sequence E-value */
158 double db_to_sequence_scale = subject_eff / dblen_eff;
159
160 int hsp_index;
161 for (hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
162 /* for all HSPs */
163 double align_p_value; /* P-value for the alignment score */
164 double combined_p_value; /* combination of two P-values */
165
166 /* HSP for this iteration */
167 BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
168 #ifdef KAPPA_PRINT_DIAGNOSTICS
169 /* Original E-value, saved if diagnostics are printed. */
170 double old_e_value = hsp->evalue;
171 #endif
172 hsp->evalue *= db_to_sequence_scale;
173
174 align_p_value = BLAST_KarlinEtoP(hsp->evalue);
175 combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
176 hsp->evalue = BLAST_KarlinPtoE(combined_p_value);
177 hsp->evalue /= db_to_sequence_scale;
178
179 if (hsp->evalue < best_evalue) {
180 best_evalue = hsp->evalue;
181 }
182
183 #ifdef KAPPA_PRINT_DIAGNOSTICS
184 if (seqSrc){
185 int sequence_gi; /*GI of a sequence*/
186 Blast_GiList* gi_list; /*list of GI's for a sequence*/
187 gi_list = BlastSeqSrcGetGis(seqSrc, (void *) (&subject_id));
188 if ((gi_list) && (gi_list->num_used > 0)) {
189 sequence_gi = gi_list->data[0];
190 } else {
191 sequence_gi = (-1);
192 }
193 printf("GI %d Lambda ratio %e comp. p-value %e; "
194 "adjust E-value of query length %d match length "
195 "%d from %e to %e\n",
196 sequence_gi, LambdaRatio, comp_p_value,
197 query_length, subject_length, old_e_value, hsp->evalue);
198 Blast_GiListFree(gi_list);
199 }
200 #endif
201 } /* end for all HSPs */
202
203 hsp_list->best_evalue = best_evalue;
204
205 /* suppress unused parameter warnings if diagnostics are not printed */
206 (void) seqSrc;
207 (void) query_length;
208 (void) LambdaRatio;
209 (void) subject_id;
210 }
211
212
213 /**
214 * Remove from a hitlist all HSPs that are completely contained in an
215 * HSP that occurs earlier in the list and that:
216 * - is on the same strand; and
217 * - has equal or greater score. T
218 * The hitlist should be sorted by some measure of significance before
219 * this routine is called.
220 * @param hsp_array array to be reaped
221 * @param hspcnt length of hsp_array
222 */
223 static void
s_HitlistReapContained(BlastHSP * hsp_array[],Int4 * hspcnt)224 s_HitlistReapContained(BlastHSP * hsp_array[], Int4 * hspcnt)
225 {
226 Int4 iread; /* iteration index used to read the hitlist */
227 Int4 iwrite; /* iteration index used to write to the hitlist */
228 Int4 old_hspcnt; /* number of HSPs in the hitlist on entry */
229
230 old_hspcnt = *hspcnt;
231
232 for (iread = 1; iread < *hspcnt; iread++) {
233 /* for all HSPs in the hitlist */
234 Int4 ireadBack; /* iterator over indices less than iread */
235 BlastHSP *hsp1; /* an HSP that is a candidate for deletion */
236
237 hsp1 = hsp_array[iread];
238 for (ireadBack = 0; ireadBack < iread && hsp1 != NULL; ireadBack++) {
239 /* for all HSPs before hsp1 in the hitlist and while hsp1
240 * has not been deleted */
241 BlastHSP *hsp2; /* an HSP that occurs earlier in hsp_array
242 * than hsp1 */
243 hsp2 = hsp_array[ireadBack];
244
245 if( hsp2 == NULL ) { /* hsp2 was deleted in a prior iteration. */
246 continue;
247 }
248 if (hsp2->query.frame == hsp1->query.frame &&
249 hsp2->subject.frame == hsp1->subject.frame) {
250 /* hsp1 and hsp2 are in the same query/subject frame. */
251 if (CONTAINED_IN_HSP
252 (hsp2->query.offset, hsp2->query.end, hsp1->query.offset,
253 hsp2->subject.offset, hsp2->subject.end,
254 hsp1->subject.offset) &&
255 CONTAINED_IN_HSP
256 (hsp2->query.offset, hsp2->query.end, hsp1->query.end,
257 hsp2->subject.offset, hsp2->subject.end,
258 hsp1->subject.end) &&
259 hsp1->score <= hsp2->score) {
260 hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]);
261 }
262 } /* end if hsp1 and hsp2 are in the same query/subject frame */
263 } /* end for all HSPs before hsp1 in the hitlist */
264 } /* end for all HSPs in the hitlist */
265
266 /* Condense the hsp_array, removing any NULL items. */
267 iwrite = 0;
268 for (iread = 0; iread < *hspcnt; iread++) {
269 if (hsp_array[iread] != NULL) {
270 hsp_array[iwrite++] = hsp_array[iread];
271 }
272 }
273 *hspcnt = iwrite;
274 /* Fill the remaining memory in hsp_array with NULL pointers. */
275 for ( ; iwrite < old_hspcnt; iwrite++) {
276 hsp_array[iwrite] = NULL;
277 }
278 }
279
280
281 /** A callback used to free an EditScript that has been stored in a
282 * BlastCompo_Alignment. */
s_FreeEditScript(void * edit_script)283 static void s_FreeEditScript(void * edit_script)
284 {
285 if (edit_script != NULL)
286 GapEditScriptDelete(edit_script);
287 }
288
289
290 /**
291 * Converts a list of objects of type BlastCompo_Alignment to an
292 * new object of type BlastHSPList and returns the result. Conversion
293 * in this direction is lossless. The list passed to this routine is
294 * freed to ensure that there is no aliasing of fields between the
295 * list of BlastCompo_Alignments and the new hitlist.
296 *
297 * @param hsp_list The hsp_list to populate
298 * @param alignments A list of distinct alignments; freed before return [in]
299 * @param oid Ordinal id of a database sequence [in]
300 * @param queryInfo information about all queries in this search [in]
301 * @param frame query frame
302 * @return Allocated and filled BlastHSPList structure.
303 */
304 static int
s_HSPListFromDistinctAlignments(BlastHSPList * hsp_list,BlastCompo_Alignment ** alignments,int oid,const BlastQueryInfo * queryInfo,int frame)305 s_HSPListFromDistinctAlignments(BlastHSPList *hsp_list,
306 BlastCompo_Alignment ** alignments,
307 int oid,
308 const BlastQueryInfo* queryInfo,
309 int frame)
310 {
311 int status = 0; /* return code for any routine called */
312 static const int unknown_value = 0; /* dummy constant to use when a
313 parameter value is not known */
314 BlastCompo_Alignment * align; /* an alignment in the list */
315
316 if (hsp_list == NULL) {
317 return -1;
318 }
319 hsp_list->oid = oid;
320
321 for (align = *alignments; NULL != align; align = align->next) {
322 BlastHSP * new_hsp = NULL;
323 GapEditScript * editScript = align->context;
324 align->context = NULL;
325
326 status = Blast_HSPInit(align->queryStart, align->queryEnd,
327 align->matchStart, align->matchEnd,
328 unknown_value, unknown_value,
329 align->queryIndex,
330 frame, (Int2) align->frame, align->score,
331 &editScript, &new_hsp);
332 switch (align->matrix_adjust_rule) {
333 case eDontAdjustMatrix:
334 new_hsp->comp_adjustment_method = eNoCompositionBasedStats;
335 break;
336 case eCompoScaleOldMatrix:
337 new_hsp->comp_adjustment_method = eCompositionBasedStats;
338 break;
339 default:
340 new_hsp->comp_adjustment_method = eCompositionMatrixAdjust;
341 break;
342 }
343 if (status != 0)
344 break;
345 /* At this point, the subject and possibly the query sequence have
346 * been filtered; since it is not clear that num_ident of the
347 * filtered sequences, rather than the original, is desired,
348 * explicitly leave num_ident blank. */
349 new_hsp->num_ident = 0;
350
351 status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
352 if (status != 0)
353 break;
354 }
355 if (status == 0) {
356 BlastCompo_AlignmentsFree(alignments, s_FreeEditScript);
357 Blast_HSPListSortByScore(hsp_list);
358 } else {
359 hsp_list = Blast_HSPListFree(hsp_list);
360 }
361 return 0;
362 }
363
s_GetSubjectLength(Int4 total_subj_length,EBlastProgramType program_number)364 Int4 s_GetSubjectLength(Int4 total_subj_length, EBlastProgramType program_number)
365 {
366 return ((program_number == eBlastTypeRpsTblastn) ?
367 (GET_NUCL_LENGTH(total_subj_length) - 1 ) /3 : total_subj_length);
368 }
369
370
371 /**
372 * Adding evalues to a list of HSPs and remove those that do not have
373 * sufficiently good (low) evalue.
374 *
375 * @param *pbestScore best (highest) score in the list
376 * @param *pbestEvalue best (lowest) evalue in the list
377 * @param hsp_list the list
378 * @param seqSrc a source of sequence data
379 * @param subject_length length of the subject sequence
380 * @param program_number the type of BLAST search being performed
381 * @param queryInfo information about the queries
382 * @param context_index the index of the query corresponding to
383 * the HSPs in hsp_list
384 * @param sbp the score block for this search
385 * @param hitParams parameters used to assign evalues and
386 * decide whether to save hits.
387 * @param pvalueForThisPair composition p-value
388 * @param LambdaRatio lambda ratio, if available
389 * @param subject_id index of subject
390 *
391 * @return 0 on success; -1 on failure (can fail because some methods
392 * of generating evalues use auxiliary structures)
393 */
394 static int
s_HitlistEvaluateAndPurge(int * pbestScore,double * pbestEvalue,BlastHSPList * hsp_list,const BlastSeqSrc * seqSrc,int subject_length,EBlastProgramType program_number,const BlastQueryInfo * queryInfo,int context_index,BlastScoreBlk * sbp,const BlastHitSavingParameters * hitParams,double pvalueForThisPair,double LambdaRatio,int subject_id)395 s_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
396 BlastHSPList * hsp_list,
397 const BlastSeqSrc* seqSrc,
398 int subject_length,
399 EBlastProgramType program_number,
400 const BlastQueryInfo* queryInfo,
401 int context_index,
402 BlastScoreBlk* sbp,
403 const BlastHitSavingParameters* hitParams,
404 double pvalueForThisPair,
405 double LambdaRatio,
406 int subject_id)
407 {
408 int status = 0;
409 *pbestEvalue = DBL_MAX;
410 *pbestScore = 0;
411 if (hitParams->do_sum_stats) {
412 status = BLAST_LinkHsps(program_number, hsp_list, queryInfo,
413 subject_length, sbp,
414 hitParams->link_hsp_params, TRUE);
415 } else {
416
417
418 status =
419 Blast_HSPListGetEvalues(program_number, queryInfo,
420 s_GetSubjectLength(subject_length, program_number),
421 hsp_list, TRUE, FALSE, sbp,
422 0.0, /* use a non-zero gap decay
423 only when linking HSPs */
424 1.0); /* Use scaling factor equal to
425 1, because both scores and
426 Lambda are scaled, so they
427 will cancel each other. */
428 }
429 if (eBlastTypeBlastp == program_number ||
430 eBlastTypeBlastx == program_number) {
431 if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1)) {
432 s_AdjustEvaluesForComposition(hsp_list, pvalueForThisPair, seqSrc,
433 subject_length,
434 &queryInfo->contexts[context_index],
435 LambdaRatio, subject_id);
436 }
437 }
438 if (status == 0) {
439 Blast_HSPListReapByEvalue(hsp_list, hitParams->options);
440 if (hsp_list->hspcnt > 0) {
441 *pbestEvalue = hsp_list->best_evalue;
442 *pbestScore = hsp_list->hsp_array[0]->score;
443 }
444 }
445 return status == 0 ? 0 : -1;
446 }
447
448 /** Compute the number of identities for the HSPs in the hsp_list
449 * @note Should work for blastp and tblastn now.
450 *
451 * @param query_blk the query sequence data [in]
452 * @param query_info structure describing the query_blk structure [in]
453 * @param seq_src source of subject sequence data [in]
454 * @param hsp_list list of HSPs to be processed [in|out]
455 * @param scoring_options scoring options [in]
456 * @gen_code_string Genetic code for tblastn [in]
457 */
458 static void
s_ComputeNumIdentities(const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,BLAST_SequenceBlk * subject_blk,const BlastSeqSrc * seq_src,BlastHSPList * hsp_list,const BlastScoringOptions * scoring_options,const Uint1 * gen_code_string,const BlastScoreBlk * sbp,BlastSeqSrcSetRangesArg * ranges)459 s_ComputeNumIdentities(const BLAST_SequenceBlk* query_blk,
460 const BlastQueryInfo* query_info,
461 BLAST_SequenceBlk* subject_blk,
462 const BlastSeqSrc* seq_src,
463 BlastHSPList* hsp_list,
464 const BlastScoringOptions* scoring_options,
465 const Uint1* gen_code_string,
466 const BlastScoreBlk* sbp,
467 BlastSeqSrcSetRangesArg * ranges)
468 {
469 Uint1* query = NULL;
470 Uint1* query_nomask = NULL;
471 Uint1* subject = NULL;
472 const EBlastProgramType program_number = scoring_options->program_number;
473 const Boolean kIsOutOfFrame = scoring_options->is_ooframe;
474 const EBlastEncoding encoding = Blast_TracebackGetEncoding(program_number);
475 BlastSeqSrcGetSeqArg seq_arg;
476 Int2 status = 0;
477 int i;
478 SBlastTargetTranslation* target_t = NULL;
479
480 if ( !hsp_list) return;
481
482 /* Initialize the subject */
483 if (seq_src){
484 memset((void*) &seq_arg, 0, sizeof(seq_arg));
485 seq_arg.oid = hsp_list->oid;
486 seq_arg.encoding = encoding;
487 seq_arg.check_oid_exclusion = TRUE;
488 seq_arg.ranges = ranges;
489 status = BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg);
490 ASSERT(status == 0);
491 (void)status; /* to pacify compiler warning */
492
493 if (program_number == eBlastTypeTblastn) {
494 subject_blk = seq_arg.seq;
495 BlastTargetTranslationNew(
496 subject_blk,
497 gen_code_string,
498 eBlastTypeTblastn,
499 kIsOutOfFrame,
500 &target_t
501 );
502 } else {
503 subject = seq_arg.seq->sequence;
504 }
505 } else {
506 subject = subject_blk->sequence;
507 }
508
509 for (i = 0; i < hsp_list->hspcnt; i++) {
510 BlastHSP* hsp = hsp_list->hsp_array[i];
511
512 /* Initialize the query */
513 if (program_number == eBlastTypeBlastx && kIsOutOfFrame) {
514 Int4 context = hsp->context - hsp->context % CODON_LENGTH;
515 Int4 context_offset = query_info->contexts[context].query_offset;
516 query = query_blk->oof_sequence + CODON_LENGTH + context_offset;
517 query_nomask = query_blk->oof_sequence + CODON_LENGTH + context_offset;
518 } else {
519 query = query_blk->sequence +
520 query_info->contexts[hsp->context].query_offset;
521 query_nomask = query_blk->sequence_nomask +
522 query_info->contexts[hsp->context].query_offset;
523 }
524
525 /* Translate subject if needed. */
526 if (program_number == eBlastTypeTblastn) {
527 const Uint1* target_sequence = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
528 status = Blast_HSPGetNumIdentitiesAndPositives(query, target_sequence, hsp, scoring_options, 0, sbp);
529 }
530 else
531 status = Blast_HSPGetNumIdentitiesAndPositives(query_nomask, subject, hsp, scoring_options, 0, sbp);
532
533 ASSERT(status == 0);
534 }
535 target_t = BlastTargetTranslationFree(target_t);
536 if (seq_src) {
537 BlastSeqSrcReleaseSequence(seq_src, (void*) &seq_arg);
538 BlastSequenceBlkFree(seq_arg.seq);
539 }
540 }
541
542
543 /**
544 * A callback routine: compute lambda for the given score
545 * probabilities.
546 * (@sa calc_lambda_type).
547 */
548 static double
s_CalcLambda(double probs[],int min_score,int max_score,double lambda0)549 s_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
550 {
551
552 int i; /* loop index */
553 int score_range; /* range of possible scores */
554 double avg; /* expected score of aligning two characters */
555 Blast_ScoreFreq freq; /* score frequency data */
556
557 score_range = max_score - min_score + 1;
558 avg = 0.0;
559 for (i = 0; i < score_range; i++) {
560 avg += (min_score + i) * probs[i];
561 }
562 freq.score_min = min_score;
563 freq.score_max = max_score;
564 freq.obs_min = min_score;
565 freq.obs_max = max_score;
566 freq.sprob0 = probs;
567 freq.sprob = &probs[-min_score];
568 freq.score_avg = avg;
569
570 return Blast_KarlinLambdaNR(&freq, lambda0);
571 }
572
573
574 /** Fill a two-dimensional array with the frequency ratios that
575 * underlie a position specific score matrix (PSSM).
576 *
577 * @param returnRatios a two-dimensional array with BLASTAA_SIZE
578 * columns
579 * @param numPositions the number of rows in returnRatios
580 * @param query query sequence data, of length numPositions
581 * @param matrixName the name of the position independent matrix
582 * corresponding to this PSSM
583 * @param startNumerator position-specific data used to generate the
584 * PSSM
585 * @return 0 on success; -1 if the named matrix isn't known, or if
586 * there was a memory error
587 * @todo find out what start numerator is.
588 */
589 static int
s_GetPosBasedStartFreqRatios(double ** returnRatios,Int4 numPositions,Uint1 * query,const char * matrixName,double ** startNumerator)590 s_GetPosBasedStartFreqRatios(double ** returnRatios,
591 Int4 numPositions,
592 Uint1 * query,
593 const char *matrixName,
594 double **startNumerator)
595 {
596 Int4 i,j; /* loop indices */
597 SFreqRatios * stdFreqRatios = NULL; /* frequency ratios for the
598 named matrix. */
599 double *standardProb; /* probabilities of each
600 letter*/
601 const double kPosEpsilon = 0.0001; /* values below this cutoff
602 are treated specially */
603
604 stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
605 if (stdFreqRatios == NULL) {
606 return -1;
607 }
608 for (i = 0; i < numPositions; i++) {
609 for (j = 0; j < BLASTAA_SIZE; j++) {
610 returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
611 }
612 }
613 stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
614
615 standardProb = BLAST_GetStandardAaProbabilities();
616 if(standardProb == NULL) {
617 return -1;
618 }
619 /*reverse multiplication done in posit.c*/
620 for (i = 0; i < numPositions; i++) {
621 for (j = 0; j < BLASTAA_SIZE; j++) {
622 if ((standardProb[query[i]] > kPosEpsilon) &&
623 (standardProb[j] > kPosEpsilon) &&
624 (j != eStopChar) && (j != eXchar) &&
625 (startNumerator[i][j] > kPosEpsilon)) {
626 returnRatios[i][j] = startNumerator[i][j] / standardProb[j];
627 }
628 }
629 }
630 sfree(standardProb);
631
632 return 0;
633 }
634
635
636 /**
637 * Fill a two-dimensional array with the frequency ratios that underlie the
638 * named score matrix.
639 *
640 * @param returnRatios a two-dimensional array of size
641 * BLASTAA_SIZE x BLASTAA_SIZE
642 * @param matrixName the name of a matrix
643 * @return 0 on success; -1 if the named matrix isn't known, or if
644 * there was a memory error
645 */
646 static int
s_GetStartFreqRatios(double ** returnRatios,const char * matrixName)647 s_GetStartFreqRatios(double ** returnRatios,
648 const char *matrixName)
649 {
650 /* Loop indices */
651 int i,j;
652 /* Frequency ratios for the matrix */
653 SFreqRatios * stdFreqRatios = NULL;
654
655 stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
656 if (stdFreqRatios == NULL) {
657 return -1;
658 }
659 for (i = 0; i < BLASTAA_SIZE; i++) {
660 for (j = 0; j < BLASTAA_SIZE; j++) {
661 returnRatios[i][j] = stdFreqRatios->data[i][j];
662 }
663 }
664 stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
665
666 return 0;
667 }
668
669
670 /** SCALING_FACTOR is a multiplicative factor used to get more bits of
671 * precision in the integer matrix scores. It cannot be arbitrarily
672 * large because we do not want total alignment scores to exceed
673 * -(BLAST_SCORE_MIN) */
674 #define SCALING_FACTOR 32
675
676
677 /**
678 * Produce a scaled-up version of the position-specific matrix
679 * with a given set of position-specific residue frequencies.
680 *
681 * @param fillPosMatrix is the matrix to be filled
682 * @param matrixName name of the standard substitution matrix [in]
683 * @param posFreqs PSSM's frequency ratios [in]
684 * @param query Query sequence data [in]
685 * @param queryLength Length of the query sequence above [in]
686 * @param sbp stores various parameters of the search
687 * @param scale_factor amount by which ungapped parameters should be
688 * scaled.
689 * @return 0 on success; -1 on failure
690 */
691 static int
s_ScalePosMatrix(int ** fillPosMatrix,const char * matrixName,double ** posFreqs,Uint1 * query,int queryLength,BlastScoreBlk * sbp,double scale_factor)692 s_ScalePosMatrix(int ** fillPosMatrix,
693 const char * matrixName,
694 double ** posFreqs,
695 Uint1 * query,
696 int queryLength,
697 BlastScoreBlk* sbp,
698 double scale_factor)
699 {
700 /* Data used by scaling routines */
701 Kappa_posSearchItems *posSearch = NULL;
702 /* A reduced collection of search parameters used by PSI-blast */
703 Kappa_compactSearchItems *compactSearch = NULL;
704 /* Representation of a PSSM internal to PSI-blast */
705 _PSIInternalPssmData* internal_pssm = NULL;
706 /* return code */
707 int status = 0;
708
709 posSearch = Kappa_posSearchItemsNew(queryLength, matrixName,
710 fillPosMatrix, posFreqs);
711 compactSearch = Kappa_compactSearchItemsNew(query, queryLength, sbp);
712 /* Copy data into new structures */
713 internal_pssm = _PSIInternalPssmDataNew(queryLength, BLASTAA_SIZE);
714 if (posSearch == NULL || compactSearch == NULL || internal_pssm == NULL) {
715 status = -1;
716 goto cleanup;
717 }
718 _PSICopyMatrix_int(internal_pssm->pssm, posSearch->posMatrix,
719 internal_pssm->ncols, internal_pssm->nrows);
720 _PSICopyMatrix_int(internal_pssm->scaled_pssm,
721 posSearch->posPrivateMatrix,
722 internal_pssm->ncols, internal_pssm->nrows);
723 _PSICopyMatrix_double(internal_pssm->freq_ratios,
724 posSearch->posFreqs, internal_pssm->ncols,
725 internal_pssm->nrows);
726 status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp,
727 compactSearch->standardProb);
728 if (status != 0) {
729 goto cleanup;
730 }
731 /* Copy data from new structures to posSearchItems */
732 _PSICopyMatrix_int(posSearch->posMatrix, internal_pssm->pssm,
733 internal_pssm->ncols, internal_pssm->nrows);
734 _PSICopyMatrix_int(posSearch->posPrivateMatrix,
735 internal_pssm->scaled_pssm,
736 internal_pssm->ncols, internal_pssm->nrows);
737 _PSICopyMatrix_double(posSearch->posFreqs,
738 internal_pssm->freq_ratios,
739 internal_pssm->ncols, internal_pssm->nrows);
740 status = Kappa_impalaScaling(posSearch, compactSearch, (double)
741 scale_factor, FALSE, sbp);
742 cleanup:
743 internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
744 posSearch = Kappa_posSearchItemsFree(posSearch);
745 compactSearch = Kappa_compactSearchItemsFree(compactSearch);
746
747 return status;
748 }
749
750
751 /**
752 * Convert an array of HSPs to a list of BlastCompo_Alignment objects.
753 * The context field of each BlastCompo_Alignment is set to point to the
754 * corresponding HSP.
755 *
756 * @param self the array of alignment to be filled
757 * @param numAligns number of alignments
758 * @param hsp_array an array of HSPs
759 * @param hspcnt the length of hsp_array
760 * @param init_context the initial context to process
761 * @param queryInfo information about the concatenated query
762 * @param localScalingFactor the amount by which this search is scaled
763 *
764 * @return the new list of alignments; or NULL if there is an out-of-memory
765 * error (or if the original array is empty)
766 */
767 static int
s_ResultHspToDistinctAlign(BlastCompo_Alignment ** self,int * numAligns,BlastHSP * hsp_array[],Int4 hspcnt,int init_context,const BlastQueryInfo * queryInfo,double localScalingFactor)768 s_ResultHspToDistinctAlign(BlastCompo_Alignment **self,
769 int *numAligns,
770 BlastHSP * hsp_array[], Int4 hspcnt,
771 int init_context,
772 const BlastQueryInfo* queryInfo,
773 double localScalingFactor)
774 {
775 BlastCompo_Alignment * tail[6]; /* last element in aligns */
776 int hsp_index; /* loop index */
777 int frame_index;
778
779 for (frame_index = 0; frame_index < 6; frame_index++) {
780 tail[frame_index] = NULL;
781 numAligns[frame_index] = 0;
782 }
783
784 for (hsp_index = 0; hsp_index < hspcnt; hsp_index++) {
785 BlastHSP * hsp = hsp_array[hsp_index]; /* current HSP */
786 BlastCompo_Alignment * new_align; /* newly-created alignment */
787 frame_index = hsp->context - init_context;
788 ASSERT(frame_index < 6 && frame_index >= 0);
789 /* Incoming alignments will have coordinates of the query
790 portion relative to a particular query context; they must
791 be shifted for used in the composition_adjustment library.
792 */
793 new_align =
794 BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
795 eDontAdjustMatrix,
796 hsp->query.offset, hsp->query.end, hsp->context,
797 hsp->subject.offset, hsp->subject.end,
798 hsp->subject.frame, hsp);
799 if (new_align == NULL) /* out of memory */
800 return -1;
801 if (tail[frame_index] == NULL) { /* if the list aligns is empty; */
802 /* make new_align the first element in the list */
803 self[frame_index] = new_align;
804 } else {
805 /* otherwise add new_align to the end of the list */
806 tail[frame_index]->next = new_align;
807 }
808 tail[frame_index] = new_align;
809 numAligns[frame_index]++;
810 }
811 return 0;
812 }
813
814
815 /**
816 * Redo a S-W alignment using an x-drop alignment. The result will
817 * usually be the same as the S-W alignment. The call to ALIGN_EX
818 * attempts to force the endpoints of the alignment to match the
819 * optimal endpoints determined by the Smith-Waterman algorithm.
820 * ALIGN_EX is used, so that if the data structures for storing BLAST
821 * alignments are changed, the code will not break
822 *
823 * @param query the query data
824 * @param queryStart start of the alignment in the query sequence
825 * @param queryEnd end of the alignment in the query sequence,
826 * as computed by the Smith-Waterman algorithm
827 * @param subject the subject (database) sequence
828 * @param matchStart start of the alignment in the subject sequence
829 * @param matchEnd end of the alignment in the query sequence,
830 * as computed by the Smith-Waterman algorithm
831 * @param gap_align parameters for a gapped alignment
832 * @param scoringParams Settings for gapped alignment.[in]
833 * @param score score computed by the Smith-Waterman algorithm
834 * @param queryAlignmentExtent length of the alignment in the query sequence,
835 * as computed by the x-drop algorithm
836 * @param matchAlignmentExtent length of the alignment in the subject
837 * sequence, as computed by the x-drop algorithm
838 * @param newScore alignment score computed by the x-drop
839 * algorithm
840 */
841 static void
s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,Int4 queryStart,Int4 queryEnd,BlastCompo_SequenceData * subject,Int4 matchStart,Int4 matchEnd,BlastGapAlignStruct * gap_align,const BlastScoringParameters * scoringParams,Int4 score,Int4 * queryAlignmentExtent,Int4 * matchAlignmentExtent,Int4 * newScore)842 s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,
843 Int4 queryStart,
844 Int4 queryEnd,
845 BlastCompo_SequenceData * subject,
846 Int4 matchStart,
847 Int4 matchEnd,
848 BlastGapAlignStruct* gap_align,
849 const BlastScoringParameters* scoringParams,
850 Int4 score,
851 Int4 * queryAlignmentExtent,
852 Int4 * matchAlignmentExtent,
853 Int4 * newScore)
854 {
855 Int4 XdropAlignScore; /* alignment score obtained using X-dropoff
856 * method rather than Smith-Waterman */
857 Int4 doublingCount = 0; /* number of times X-dropoff had to be
858 * doubled */
859 Int4 gap_x_dropoff_orig = gap_align->gap_x_dropoff;
860
861 GapPrelimEditBlockReset(gap_align->rev_prelim_tback);
862 GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
863 do {
864 XdropAlignScore =
865 ALIGN_EX(&(query->data[queryStart]) - 1,
866 &(subject->data[matchStart]) - 1,
867 queryEnd - queryStart + 1, matchEnd - matchStart + 1,
868 queryAlignmentExtent,
869 matchAlignmentExtent, gap_align->fwd_prelim_tback,
870 gap_align, scoringParams, queryStart - 1, FALSE, FALSE,
871 NULL);
872
873 gap_align->gap_x_dropoff *= 2;
874 doublingCount++;
875 if((XdropAlignScore < score) && (doublingCount < 3)) {
876 GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
877 }
878 } while((XdropAlignScore < score) && (doublingCount < 3));
879
880 gap_align->gap_x_dropoff = gap_x_dropoff_orig;
881 *newScore = XdropAlignScore;
882 }
883
884
885 /**
886 * BLAST-specific information that is associated with a
887 * BlastCompo_MatchingSequence.
888 */
889 typedef struct
890 BlastKappa_SequenceInfo {
891 EBlastProgramType prog_number; /**< identifies the type of blast
892 search being performed. The type
893 of search determines how sequence
894 data should be obtained. */
895 const BlastSeqSrc* seq_src; /**< BLAST sequence data source */
896 BlastSeqSrcGetSeqArg seq_arg; /**< argument to GetSequence method
897 of the BlastSeqSrc (@todo this
898 structure was designed to be
899 allocated on the stack, i.e.: in
900 Kappa_MatchingSequenceInitialize) */
901 } BlastKappa_SequenceInfo;
902
903
904 /** Release the resources associated with a matching sequence. */
905 static void
s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)906 s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
907 {
908 if (self != NULL) {
909 if (self->index >=0) {
910 BlastKappa_SequenceInfo * local_data = self->local_data;
911 if (self->length > 0) {
912 BlastSeqSrcReleaseSequence(local_data->seq_src,
913 &local_data->seq_arg);
914 BlastSequenceBlkFree(local_data->seq_arg.seq);
915 }
916 free(self->local_data);
917 }
918 self->local_data = NULL;
919 }
920 }
921
922 /**
923 * Do a simple gapped extension to the right from the beginning of query and
924 * subject ranges examining only matches and mismatches. The extension stops
925 * when there are more than max_shift mismatches or mismatches or gaps are not
926 * followed by two identical matches. This is a simplified version of the
927 * Danielle and Jean Thierry-Miegs' jumper
928 * alignment implemented in NCBI Magic
929 * http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/Download/Downloads.html
930 *
931 * @param query_seq Query sequence [in]
932 * @param query_len Query length [in]
933 * @param subject_seq Subject sequence [in]
934 * @param subject_len Subject length [in]
935 * @param max_shift Maximum number of mismatches or gaps, extension stops if
936 * this number is reached [in]
937 * @param query_ext_len Extension length on the query [out]
938 * @param subject_ext_len Extension length on the subject [out]
939 * @param align_len Alignment length [out]
940 * @return Number of identical residues
941 */
s_ExtendRight(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)942 static int s_ExtendRight(Uint1* query_seq, int query_len,
943 Uint1* subject_seq, int subject_len,
944 int max_shift,
945 int* query_ext_len, int* subject_ext_len,
946 int* align_len)
947 {
948 int num_identical = 0;
949 int q_pos, s_pos;
950 int gaps_in_query = 0;
951 int gaps_in_subject = 0;
952 q_pos = 0;
953 s_pos = 0;
954 while (q_pos < query_len && s_pos < subject_len) {
955 int n;
956 int match = 0;
957
958 while (q_pos < query_len && s_pos < subject_len
959 && query_seq[q_pos] == subject_seq[s_pos]) {
960
961 num_identical++;
962 q_pos++;
963 s_pos++;
964 }
965
966 /* try to skip mismatches or gaps */
967 for (n=1; n < max_shift && q_pos + n + 1 < query_len
968 && s_pos + n + 1 < subject_len && !match; n++) {
969
970 /* mismatches */
971 if (query_seq[q_pos + n] == subject_seq[s_pos + n]
972 && query_seq[q_pos + n + 1] == subject_seq[s_pos + n + 1]) {
973
974 /* we have already checked that two positions behind mismatches
975 match so we can advance further */
976 q_pos += n + 2;
977 s_pos += n + 2;
978 num_identical += 2;
979 match = 1;
980 }
981
982 /* gap in subject */
983 if (!match && query_seq[q_pos + n] == subject_seq[s_pos]
984 && query_seq[q_pos + n + 1] == subject_seq[s_pos + 1]) {
985
986 q_pos += n + 2;
987 s_pos += 2;
988 num_identical += 2;
989 gaps_in_subject += n;
990 match = 1;
991 }
992
993 /* gap in query */
994 if (!match && query_seq[q_pos] == subject_seq[s_pos + n]
995 && query_seq[q_pos + 1] == subject_seq[s_pos + n + 1]) {
996
997 q_pos += 2;
998 s_pos += n + 2;
999 num_identical += 2;
1000 gaps_in_query += n;
1001 match = 1;
1002 }
1003 }
1004
1005 if (match) {
1006 continue;
1007 }
1008
1009 /* exit the loop */
1010 break;
1011 }
1012 *query_ext_len = q_pos;
1013 *subject_ext_len = s_pos;
1014 *align_len = q_pos > s_pos ? q_pos + gaps_in_query : s_pos + gaps_in_subject;
1015
1016 return num_identical;
1017 }
1018
1019
1020 /**
1021 * Extend left from the end of the sequence and subject ranges and count
1022 * identities. The extension stops when there are more than max_shift
1023 * mismatches or mismatches or gaps are not followed by two identical matches.
1024 * See description for s_ExtendRight for more details.
1025 *
1026 * @param query_seq Query sequence [in]
1027 * @param query_len Query length [in]
1028 * @param subject_seq Subject sequence [in]
1029 * @param subject_len Subject length [in]
1030 * @param max_shift Maximum number of mismatches or gaps, extension stops if
1031 * this number is reached [in]
1032 * @param query_ext_len Extension length on the query [out]
1033 * @param subject_ext_len Extension length on the subject [out]
1034 * @param align_len Alignment length [out]
1035 * @return Number of identical residues
1036 */
s_ExtendLeft(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)1037 static int s_ExtendLeft(Uint1* query_seq, int query_len,
1038 Uint1* subject_seq, int subject_len,
1039 int max_shift,
1040 int* query_ext_len, int* subject_ext_len,
1041 int* align_len)
1042 {
1043 int q_pos = query_len - 1;
1044 int s_pos = subject_len - 1;
1045 int num_identical = 0;
1046 int gaps_in_query = 0;
1047 int gaps_in_subject = 0;
1048 while (q_pos >= 0 && s_pos >= 0) {
1049 int n;
1050 int match = 0;
1051
1052 /* process identies */
1053 while (q_pos > 0 && s_pos > 0 && query_seq[q_pos] == subject_seq[s_pos]) {
1054 num_identical++;
1055 q_pos--;
1056 s_pos--;
1057 }
1058
1059 /* try to skip mismatches or gaps */
1060 for (n=1;n < max_shift && q_pos - n - 1 > 0 && s_pos - n - 1 > 0
1061 && !match; n++) {
1062
1063 /* mismatch */
1064 if (query_seq[q_pos - n] == subject_seq[s_pos - n]
1065 && query_seq[q_pos - n - 1] == subject_seq[s_pos - n - 1]) {
1066 q_pos -= n + 2;
1067 s_pos -= n + 2;
1068 num_identical += 2;
1069 match = 1;
1070 }
1071
1072 /* gap in subject */
1073 if (!match && query_seq[q_pos - n] == subject_seq[s_pos]
1074 && query_seq[q_pos - n - 1] == subject_seq[s_pos - 1]) {
1075 q_pos -= n + 2;
1076 s_pos -= 2;
1077 num_identical += 2;
1078 gaps_in_subject += n;
1079 match = 1;
1080 }
1081
1082 /* gap in query */
1083 if (!match && query_seq[q_pos] == subject_seq[s_pos - n]
1084 && query_seq[q_pos - 1] == subject_seq[s_pos - n - 1]) {
1085 q_pos -= 2;
1086 s_pos -= n + 2;
1087 num_identical += 2;
1088 gaps_in_query += n;
1089 match = 1;
1090 }
1091 }
1092
1093 if (match) {
1094 continue;
1095 }
1096
1097 break;
1098 }
1099 *query_ext_len = query_len - q_pos - 1;
1100 *subject_ext_len = subject_len - s_pos - 1;
1101 *align_len += *query_ext_len > *subject_ext_len ?
1102 *query_ext_len + gaps_in_query : *subject_ext_len + gaps_in_subject;
1103
1104 return num_identical;
1105 }
1106
1107
1108 /**
1109 * Get hash for a word of word_size residues assuming 28-letter alphabet
1110 *
1111 * @param data Sequence [in]
1112 * @param word_size Word size [in]
1113 * @return Hash value
1114 */
s_GetHash(const Uint1 * data,int word_size)1115 static Uint8 s_GetHash(const Uint1* data, int word_size)
1116 {
1117 Uint8 hash = 0;
1118 int k;
1119 for (k=0;k < word_size;k++) {
1120 hash <<= 5;
1121 hash += (Int8)data[k];
1122 }
1123 return hash;
1124 }
1125
1126
1127 /**
1128 * Find a local number of identical residues in two aligned sequences by
1129 * finding word matches and doing a simple gapped extensions from the word hits
1130 *
1131 * @param query_seq Query sequence [in]
1132 * @param query_hashes Array of query words with index of each word
1133 * corresponding to word position in the query [in]
1134 * @param query_len Query length [in]
1135 * @param subject_seq Subject sequence [in]
1136 * @param subject_len Subject length [in]
1137 * @param max_shift Maximum number of local mismatches or gaps for extensions
1138 * [in]
1139 * @return Number of identical residues
1140 */
s_FindNumIdentical(Uint1 * query_seq,const Uint8 * query_hashes,int query_len,Uint1 * subject_seq,int subject_len,int max_shift)1141 static int s_FindNumIdentical(Uint1* query_seq,
1142 const Uint8* query_hashes,
1143 int query_len,
1144 Uint1* subject_seq,
1145 int subject_len,
1146 int max_shift)
1147 {
1148 int word_size = 8; /* word size for k-mer matching */
1149 Uint8 hash = 0;
1150 Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
1151 values */
1152 int query_from = 0;
1153 int subject_from = 0;
1154
1155 int s_pos; /* position in the subject sequence */
1156 int num_identical = 0; /* number of identical residues found */
1157 Boolean match = FALSE;
1158
1159 /* if query or subject length is smaller than word size, exit */
1160 if (!query_seq || !query_hashes || !subject_seq
1161 || query_len < word_size || subject_len < word_size) {
1162
1163 return 0;
1164 }
1165
1166 /* for each subject position */
1167 for (s_pos = 0; s_pos < subject_len - word_size; s_pos++) {
1168 int q_pos;
1169
1170 /* find word hash */
1171 if (s_pos == 0 || match) {
1172 hash = s_GetHash(&subject_seq[s_pos], word_size);
1173 }
1174 else {
1175 hash <<= 5;
1176 hash &= mask;
1177 hash += subject_seq[s_pos + word_size - 1];
1178 }
1179
1180 /* find matching query word; index of hash is position of the word
1181 the query */
1182 for (q_pos = query_from;q_pos < query_len - word_size; q_pos++) {
1183 if (query_hashes[q_pos] == hash) {
1184 break;
1185 }
1186 }
1187
1188 /* if match */
1189 if (q_pos < query_len - word_size) {
1190 int query_start = q_pos;
1191 int subject_start = s_pos;
1192
1193 int query_left_len, query_right_len;
1194 int subject_left_len, subject_right_len;
1195 int align_len_left=0, align_len_right=0;
1196
1197 match = TRUE;
1198 num_identical += word_size;
1199
1200 /* extend left from word match */
1201 num_identical += s_ExtendLeft(query_seq + query_from,
1202 query_start - query_from,
1203 subject_seq + subject_from,
1204 subject_start - subject_from,
1205 max_shift,
1206 &query_left_len, &subject_left_len,
1207 &align_len_left);
1208
1209 /* extend right from word match */
1210 num_identical += s_ExtendRight(query_seq + query_start + word_size,
1211 query_len - query_start - word_size,
1212 subject_seq + subject_start + word_size,
1213 subject_len - subject_start - word_size,
1214 max_shift,
1215 &query_right_len, &subject_right_len,
1216 &align_len_right);
1217
1218
1219 /* disregard already matched and extended words when matching
1220 further positions */
1221
1222 query_from = query_start + word_size + query_right_len;
1223 subject_from = subject_start + word_size + subject_right_len;
1224 /* s_pos will be incremented in the loop */
1225 s_pos = subject_from - 1;
1226 }
1227 else {
1228 match = FALSE;
1229 }
1230 }
1231
1232 return num_identical;
1233 }
1234
1235 /**
1236 * Test whether the aligned parts of two sequences that
1237 * have a high-scoring gapless alignment are nearly identical.
1238 *
1239 * First extend from the left end of the query and subject ranges and stop if
1240 * there are too manu mismatches. Then extend from the right end. Then for the
1241 * remaining protion of ths sequences find matching words and extend left and
1242 * right from the word hit. Repeat the last steo until the whole alignment
1243 * ranges are processed.
1244 *
1245 * @params seqData Subject sequence [in]
1246 * @params seqOffse Starting offset of the subject sequence in alignment data
1247 * [in]
1248 * @params queryData Query sequence [in]
1249 * @params queryOffset Starting offset of the query sequence in alignment data
1250 * [in]
1251 * @param query_words Array of query words with word index corresponding to
1252 * word's position in the query [in]
1253 * @param align Alignment data [in]
1254 * @return True if sequence parts are nearly identical, false otherwise
1255 */
1256 static Boolean
s_TestNearIdentical(const BlastCompo_SequenceData * seqData,const int seqOffset,const BlastCompo_SequenceData * queryData,const int queryOffset,const Uint8 * query_words,const BlastCompo_Alignment * align)1257 s_TestNearIdentical(const BlastCompo_SequenceData* seqData,
1258 const int seqOffset,
1259 const BlastCompo_SequenceData* queryData,
1260 const int queryOffset,
1261 const Uint8* query_words,
1262 const BlastCompo_Alignment* align)
1263 {
1264 int qStart = align->queryStart - queryOffset;
1265 /* align->queryEnd points to one position past alignment end */
1266 int qEnd = align->queryEnd - queryOffset - 1;
1267 int sStart = align->matchStart - seqOffset;
1268 int sEnd = align->matchEnd - seqOffset - 1;
1269 const double kMinFractionNearIdentical = 0.96;
1270 int max_shift = 8;
1271
1272 int query_len = qEnd - qStart + 1;
1273 int subject_len = sEnd - sStart + 1;
1274 int align_len = MIN(query_len, subject_len);
1275
1276 int query_left_len = 0;
1277 int subject_left_len = 0;
1278 int query_right_len = 0;
1279 int subject_right_len = 0;
1280 int align_left_len = 0;
1281 int align_right_len = 0;
1282
1283 double fraction_identical;
1284
1285 /* first find number of identies going from the beginning of the query
1286 and subject ranges */
1287 int num_identical = s_ExtendRight(queryData->data + qStart, query_len,
1288 seqData->data + sStart, subject_len,
1289 max_shift,
1290 &query_right_len, &subject_right_len,
1291 &align_right_len);
1292
1293 /* if the whole query range was processed return near identical status */
1294 if (query_right_len >= query_len || subject_right_len >= subject_len) {
1295 fraction_identical = (double)num_identical / (double)align_len;
1296 ASSERT(fraction_identical - 1.0 < 1e-10);
1297 return fraction_identical > kMinFractionNearIdentical;
1298 }
1299
1300 /* find the number of identies going from the end of the query and subject
1301 ranges */
1302 num_identical += s_ExtendLeft(queryData->data + qStart + query_right_len,
1303 query_len - query_right_len,
1304 seqData->data + sStart + subject_right_len,
1305 subject_len - subject_right_len,
1306 max_shift,
1307 &query_left_len, &subject_left_len,
1308 &align_left_len);
1309
1310 /* if the whole alignment ranges where covered, return the near identical
1311 status */
1312 if (query_left_len + query_right_len >= query_len
1313 || subject_left_len + subject_right_len >= subject_len) {
1314
1315 fraction_identical = (double)num_identical / (double)(align_len);
1316 ASSERT(fraction_identical - 1.0 < 1e-10);
1317 return fraction_identical > kMinFractionNearIdentical;
1318 }
1319
1320 /* find the number of identical matches in the middle portion of the
1321 alignment ranges */
1322 num_identical += s_FindNumIdentical(queryData->data + qStart + query_right_len,
1323 query_words + qStart + query_right_len,
1324 query_len - query_left_len - query_right_len,
1325 seqData->data + sStart + subject_right_len,
1326 subject_len - subject_left_len - subject_right_len,
1327 max_shift);
1328
1329 fraction_identical = (double)num_identical / (double)align_len;
1330 ASSERT(fraction_identical - 1.0 < 1e-10);
1331 if (fraction_identical > kMinFractionNearIdentical) {
1332 return TRUE;
1333 }
1334 else {
1335 return FALSE;
1336 }
1337 }
1338
1339
1340 /**
1341 * Initialize a new matching sequence, obtaining information about the
1342 * sequence from the search.
1343 *
1344 * @param self object to be initialized
1345 * @param seqSrc A pointer to a source from which sequence data
1346 * may be obtained
1347 * @param program_number identifies the type of blast search being
1348 * performed.
1349 * @param default_db_genetic_code default genetic code to use when
1350 * subject sequences are translated and there is
1351 * no other guidance on what code to use
1352 * @param subject_index index of the matching sequence in the database
1353 */
1354 static int
s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,EBlastProgramType program_number,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,Int4 subject_index,BlastSeqSrcSetRangesArg * ranges)1355 s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,
1356 EBlastProgramType program_number,
1357 const BlastSeqSrc* seqSrc,
1358 Int4 default_db_genetic_code,
1359 Int4 subject_index,
1360 BlastSeqSrcSetRangesArg * ranges)
1361 {
1362 BlastKappa_SequenceInfo * seq_info; /* BLAST-specific sequence
1363 information */
1364 self->length = 0;
1365 self->local_data = NULL;
1366
1367 seq_info = malloc(sizeof(BlastKappa_SequenceInfo));
1368 if (seq_info != NULL) {
1369 self->local_data = seq_info;
1370
1371 seq_info->seq_src = seqSrc;
1372 seq_info->prog_number = program_number;
1373
1374 memset((void*) &seq_info->seq_arg, 0, sizeof(seq_info->seq_arg));
1375 seq_info->seq_arg.oid = self->index = subject_index;
1376 seq_info->seq_arg.check_oid_exclusion = TRUE;
1377 seq_info->seq_arg.ranges = ranges;
1378
1379 if( program_number == eBlastTypeTblastn ) {
1380 seq_info->seq_arg.encoding = eBlastEncodingNcbi4na;
1381 } else {
1382 seq_info->seq_arg.encoding = eBlastEncodingProtein;
1383 }
1384 if (BlastSeqSrcGetSequence(seqSrc, &seq_info->seq_arg) >= 0) {
1385 self->length =
1386 BlastSeqSrcGetSeqLen(seqSrc, (void*) &seq_info->seq_arg);
1387
1388 /* If the subject is translated and the BlastSeqSrc implementation
1389 * doesn't provide a genetic code string, use the default genetic
1390 * code for all subjects (as in the C toolkit) */
1391 if (Blast_SubjectIsTranslated(program_number) &&
1392 seq_info->seq_arg.seq->gen_code_string == NULL) {
1393 seq_info->seq_arg.seq->gen_code_string =
1394 GenCodeSingletonFind(default_db_genetic_code);
1395 ASSERT(seq_info->seq_arg.seq->gen_code_string);
1396 }
1397 } else {
1398 self->length = 0;
1399 }
1400 }
1401 if (self->length == 0) {
1402 /* Could not obtain the required data */
1403 s_MatchingSequenceRelease(self);
1404 return -1;
1405 } else {
1406 return 0;
1407 }
1408 }
1409
1410
1411 /** NCBIstdaa encoding for 'X' character */
1412 #define BLASTP_MASK_RESIDUE 21
1413 /** Default instructions and mask residue for SEG filtering */
1414 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
1415
1416
1417 /**
1418 * Filter low complexity regions from the sequence data; uses the SEG
1419 * algorithm.
1420 *
1421 * @param seqData data to be filtered
1422 * @param program_name type of search being performed
1423 * @return 0 for success; -1 for out-of-memory
1424 */
1425 static int
s_DoSegSequenceData(BlastCompo_SequenceData * seqData,EBlastProgramType program_name,Boolean * is_seq_biased)1426 s_DoSegSequenceData(BlastCompo_SequenceData * seqData,
1427 EBlastProgramType program_name,
1428 Boolean* is_seq_biased)
1429 {
1430 int status = 0;
1431 BlastSeqLoc* mask_seqloc = NULL;
1432 SBlastFilterOptions* filter_options = NULL;
1433
1434 status = BlastFilteringOptionsFromString(program_name,
1435 BLASTP_MASK_INSTRUCTIONS,
1436 &filter_options, NULL);
1437 if (status == 0) {
1438 status = BlastSetUp_Filter(program_name, seqData->data,
1439 seqData->length, 0, filter_options,
1440 &mask_seqloc, NULL);
1441 filter_options = SBlastFilterOptionsFree(filter_options);
1442 }
1443 if (is_seq_biased) {
1444 *is_seq_biased = (mask_seqloc != NULL);
1445 }
1446 if (status == 0) {
1447 Blast_MaskTheResidues(seqData->data, seqData->length,
1448 FALSE, mask_seqloc, FALSE, 0);
1449 }
1450 if (mask_seqloc != NULL) {
1451 mask_seqloc = BlastSeqLocFree(mask_seqloc);
1452 }
1453 return status;
1454 }
1455
1456
1457 /**
1458 * Obtain a string of translated data
1459 *
1460 * @param self the sequence from which to obtain the data [in]
1461 * @param range the range and translation frame to get [in]
1462 * @param seqData the resulting data [out]
1463 * @param queryData the query sequence [in]
1464 * @param queryOffset offset for align if there are multiple queries
1465 * @param align information about the alignment between query and subject
1466 * @param shouldTestIdentical did alignment pass a preliminary test in
1467 * redo_alignment.c that indicates the sequence
1468 * pieces may be near identical
1469 *
1470 * @return 0 on success; -1 on failure
1471 */
1472 static int
s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1473 s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
1474 const BlastCompo_SequenceRange * range,
1475 BlastCompo_SequenceData * seqData,
1476 const BlastCompo_SequenceRange * q_range,
1477 BlastCompo_SequenceData * queryData,
1478 const Uint8* query_words,
1479 const BlastCompo_Alignment *align,
1480 const Boolean shouldTestIdentical,
1481 const ECompoAdjustModes compo_adjust_mode,
1482 const Boolean isSmithWaterman,
1483 Boolean* subject_maybe_biased)
1484 {
1485 int status = 0;
1486 BlastKappa_SequenceInfo * local_data; /* BLAST-specific
1487 information associated
1488 with the sequence */
1489 Uint1 * translation_buffer; /* a buffer for the translated,
1490 amino-acid sequence */
1491 Int4 translated_length; /* length of the translated sequence */
1492 int translation_frame; /* frame in which to translate */
1493 Uint1 * na_sequence; /* the nucleotide sequence */
1494 int translation_start; /* location in na_sequence to start
1495 translating */
1496 int num_nucleotides; /* the number of nucleotides to be translated */
1497
1498 local_data = self->local_data;
1499 na_sequence = local_data->seq_arg.seq->sequence_start;
1500
1501 /* Initialize seqData to nil, in case this routine fails */
1502 seqData->buffer = NULL;
1503 seqData->data = NULL;
1504 seqData->length = 0;
1505
1506 translation_frame = range->context;
1507 if (translation_frame > 0) {
1508 translation_start = 3 * range->begin;
1509 } else {
1510 translation_start =
1511 self->length - 3 * range->end + translation_frame + 1;
1512 }
1513 num_nucleotides =
1514 3 * (range->end - range->begin) + ABS(translation_frame) - 1;
1515
1516 status = Blast_GetPartialTranslation(na_sequence + translation_start,
1517 num_nucleotides,
1518 (Int2) translation_frame,
1519 local_data->seq_arg.seq->gen_code_string,
1520 &translation_buffer,
1521 &translated_length,
1522 NULL);
1523 if (status == 0) {
1524 seqData->buffer = translation_buffer;
1525 seqData->data = translation_buffer + 1;
1526 seqData->length = translated_length;
1527
1528 if ( !(KAPPA_TBLASTN_NO_SEG_SEQUENCE) ) {
1529 if (compo_adjust_mode
1530 && (!subject_maybe_biased || *subject_maybe_biased)) {
1531
1532 if ( (!shouldTestIdentical)
1533 || (shouldTestIdentical
1534 && (!s_TestNearIdentical(seqData, range->begin,
1535 queryData, q_range->begin,
1536 query_words, align)))) {
1537
1538 status = s_DoSegSequenceData(seqData, eBlastTypeTblastn,
1539 subject_maybe_biased);
1540 if (status != 0) {
1541 free(seqData->buffer);
1542 seqData->buffer = NULL;
1543 seqData->data = NULL;
1544 seqData->length = 0;
1545 }
1546 }
1547 }
1548 }
1549 }
1550 return status;
1551 }
1552
1553
1554 /**
1555 * Get a string of protein data from a protein sequence.
1556 *
1557 * @param self a protein sequence [in]
1558 * @param range the range to get [in]
1559 * @param seqData the resulting data [out]
1560 * @param queryData the query sequence [in]
1561 * @param queryOffset offset for align if there are multiple queries
1562 * @param align information about the alignment
1563 * between query and subject [in]
1564 * @param shouldTestIdentical did alignment pass a preliminary test in
1565 * redo_alignment.c that indicates the sequence
1566 * pieces may be near identical [in]
1567 *
1568 * @return 0 on success; -1 on failure
1569 */
1570 static int
s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1571 s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,
1572 const BlastCompo_SequenceRange * range,
1573 BlastCompo_SequenceData * seqData,
1574 const BlastCompo_SequenceRange * q_range,
1575 BlastCompo_SequenceData * queryData,
1576 const Uint8* query_words,
1577 const BlastCompo_Alignment *align,
1578 const Boolean shouldTestIdentical,
1579 const ECompoAdjustModes compo_adjust_mode,
1580 const Boolean isSmithWaterman,
1581 Boolean* subject_maybe_biased)
1582
1583 {
1584 int status = 0; /* return status */
1585 Int4 idx; /* loop index */
1586 Uint1 *origData; /* the unfiltered data for the sequence */
1587 /* BLAST-specific sequence information */
1588 BlastKappa_SequenceInfo * local_data = self->local_data;
1589 BLAST_SequenceBlk * seq = self->local_data;
1590
1591 if (self->local_data == NULL)
1592 return -1;
1593
1594 seqData->data = NULL;
1595 seqData->length = 0;
1596 /* Copy the entire sequence (necessary for SEG filtering.) */
1597 seqData->buffer = calloc((self->length + 2), sizeof(Uint1));
1598 if (seqData->buffer == NULL) {
1599 return -1;
1600 }
1601 /* First and last characters of the buffer MUST be '\0', which is
1602 * true here because the buffer was allocated using calloc. */
1603 seqData->data = seqData->buffer + 1;
1604 seqData->length = self->length;
1605
1606 origData = (self->index >= 0) ? local_data->seq_arg.seq->sequence
1607 : seq->sequence;
1608 if((self->index < 0) && (align->frame != 0)) {
1609 int i=0, offsets =0;
1610 int f = GET_SEQ_FRAME(align->frame);
1611 int nucl_length = GET_NUCL_LENGTH(self->length);
1612 seqData->length = GET_TRANSLATED_LENGTH(nucl_length, f);
1613 for(; i < f; i++) {
1614 offsets = GET_TRANSLATED_LENGTH(nucl_length, i) +1;
1615 origData += offsets;
1616 }
1617 }
1618 /* Copy the sequence data */
1619 for (idx = 0; idx < seqData->length; idx++) {
1620 seqData->data[idx] = origData[idx];
1621 }
1622
1623 if ( !(KAPPA_BLASTP_NO_SEG_SEQUENCE) ) {
1624 if (compo_adjust_mode
1625 && (!subject_maybe_biased || *subject_maybe_biased)) {
1626
1627 if ( (!shouldTestIdentical)
1628 || (shouldTestIdentical
1629 && (!s_TestNearIdentical(seqData, 0, queryData,
1630 q_range->begin, query_words,
1631 align)))) {
1632
1633 status = s_DoSegSequenceData(seqData, eBlastTypeBlastp,
1634 subject_maybe_biased);
1635 }
1636 }
1637 }
1638 /* Fit the data to the range. */
1639 seqData ->data = &seqData->data[range->begin - 1];
1640 *seqData->data++ = '\0';
1641 seqData ->length = range->end - range->begin;
1642
1643 if (status != 0) {
1644 free(seqData->buffer);
1645 seqData->buffer = NULL;
1646 seqData->data = NULL;
1647 }
1648 return status;
1649 }
1650
1651
1652 /**
1653 * Obtain the sequence data that lies within the given range.
1654 *
1655 * @param self sequence information [in]
1656 * @param range range specifying the range of data [in]
1657 * @param seqData the sequence data obtained [out]
1658 * @param seqData the resulting data [out]
1659 * @param queryData the query sequence [in]
1660 * @param queryOffset offset for align if there are multiple queries
1661 * @param align information about the alignment between query and subject
1662 * @param shouldTestIdentical did alignment pass a preliminary test in
1663 * redo_alignment.c that indicates the sequence
1664 * pieces may be near identical
1665 *
1666 * @return 0 on success; -1 on failure
1667 */
1668 static int
s_SequenceGetRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * s_range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * query,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1669 s_SequenceGetRange(const BlastCompo_MatchingSequence * self,
1670 const BlastCompo_SequenceRange * s_range,
1671 BlastCompo_SequenceData * seqData,
1672 const BlastCompo_SequenceData * query,
1673 const BlastCompo_SequenceRange * q_range,
1674 BlastCompo_SequenceData * queryData,
1675 const Uint8* query_words,
1676 const BlastCompo_Alignment *align,
1677 const Boolean shouldTestIdentical,
1678 const ECompoAdjustModes compo_adjust_mode,
1679 const Boolean isSmithWaterman,
1680 Boolean* subject_maybe_biased)
1681 {
1682 Int4 idx;
1683 BlastKappa_SequenceInfo * seq_info = self->local_data;
1684 Uint1 *origData = query->data + q_range->begin;
1685 /* Copy the query sequence (necessary for SEG filtering.) */
1686 queryData->length = q_range->end - q_range->begin;
1687 queryData->buffer = calloc((queryData->length + 2), sizeof(Uint1));
1688 queryData->data = queryData->buffer + 1;
1689
1690 for (idx = 0; idx < queryData->length; idx++) {
1691 /* Copy the sequence data, replacing occurrences of amino acid
1692 * number 24 (Selenocysteine) with number 3 (Cysteine). */
1693 queryData->data[idx] = (origData[idx] != 24) ? origData[idx] : 3;
1694 }
1695 if (seq_info && seq_info->prog_number == eBlastTypeTblastn) {
1696 /* The sequence must be translated. */
1697 return s_SequenceGetTranslatedRange(self, s_range, seqData,
1698 q_range, queryData, query_words,
1699 align, shouldTestIdentical,
1700 compo_adjust_mode, isSmithWaterman,
1701 subject_maybe_biased);
1702 } else {
1703 return s_SequenceGetProteinRange(self, s_range, seqData,
1704 q_range, queryData, query_words,
1705 align, shouldTestIdentical,
1706 compo_adjust_mode, isSmithWaterman,
1707 subject_maybe_biased);
1708 }
1709 }
1710
1711
1712 /** Data and data-structures needed to perform a gapped alignment */
1713 typedef struct BlastKappa_GappingParamsContext {
1714 const BlastScoringParameters*
1715 scoringParams; /**< scoring parameters for a
1716 gapped alignment */
1717 BlastGapAlignStruct * gap_align; /**< additional parameters for a
1718 gapped alignment */
1719 BlastScoreBlk* sbp; /**< the score block for this search */
1720 double localScalingFactor; /**< the amount by which this
1721 search has been scaled */
1722 EBlastProgramType prog_number; /**< the type of search being
1723 performed */
1724 } BlastKappa_GappingParamsContext;
1725
1726
1727 /**
1728 * Reads a BlastGapAlignStruct that has been used to compute a
1729 * traceback, and return a BlastCompo_Alignment representing the
1730 * alignment. The BlastGapAlignStruct is in coordinates local to the
1731 * ranges being aligned; the resulting alignment is in coordinates w.r.t.
1732 * the whole query and subject.
1733 *
1734 * @param gap_align the BlastGapAlignStruct
1735 * @param *edit_script the edit script from the alignment; on exit
1736 * NULL. The edit_script is usually
1737 * gap_align->edit_script, but we don't want
1738 * an implicit side effect on the gap_align.
1739 * @param query_range the range of the query used in this alignment
1740 * @param subject_range the range of the subject used in this alignment
1741 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1742 *
1743 * @return the new alignment on success or NULL on error
1744 */
1745 static BlastCompo_Alignment *
s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,GapEditScript ** edit_script,BlastCompo_SequenceRange * query_range,BlastCompo_SequenceRange * subject_range,EMatrixAdjustRule matrix_adjust_rule)1746 s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,
1747 GapEditScript ** edit_script,
1748 BlastCompo_SequenceRange * query_range,
1749 BlastCompo_SequenceRange * subject_range,
1750 EMatrixAdjustRule matrix_adjust_rule)
1751 {
1752 /* parameters to BlastCompo_AlignmentNew */
1753 int queryStart, queryEnd, queryIndex, matchStart, matchEnd, frame;
1754 BlastCompo_Alignment * obj; /* the new alignment */
1755
1756 /* In the composition_adjustment library, the query start/end are
1757 indices into the concatenated query, and so must be shifted. */
1758 queryStart = gap_align->query_start + query_range->begin;
1759 queryEnd = gap_align->query_stop + query_range->begin;
1760 queryIndex = query_range->context;
1761 matchStart = gap_align->subject_start + subject_range->begin;
1762 matchEnd = gap_align->subject_stop + subject_range->begin;
1763 frame = subject_range->context;
1764
1765 obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
1766 queryStart, queryEnd, queryIndex,
1767 matchStart, matchEnd, frame,
1768 *edit_script);
1769 if (obj != NULL) {
1770 *edit_script = NULL;
1771 }
1772 return obj;
1773 }
1774
1775
1776 /** A callback used when performing SmithWaterman alignments:
1777 * Calculate the traceback for one alignment by performing an x-drop
1778 * alignment in the forward direction, possibly increasing the x-drop
1779 * parameter until the desired score is attained.
1780 *
1781 * The start, end and score of the alignment should be obtained
1782 * using the Smith-Waterman algorithm before this routine is called.
1783 *
1784 * @param *pnewAlign the new alignment
1785 * @param *pqueryEnd on entry, the end of the alignment in the
1786 * query, as computed by the Smith-Waterman
1787 * algorithm. On exit, the end as computed by
1788 * the x-drop algorithm
1789 * @param *pmatchEnd like as *pqueryEnd, but for the subject
1790 * sequence
1791 * @param queryStart the starting point in the query
1792 * @param matchStart the starting point in the subject
1793 * @param score the score of the alignment, as computed by
1794 * the Smith-Waterman algorithm
1795 * @param query query sequence data
1796 * @param query_range range of this query in the concatenated
1797 * query
1798 * @param ccat_query_length total length of the concatenated query
1799 * @param subject subject sequence data
1800 * @param subject_range range of subject_data in the translated
1801 * query, in amino acid coordinates
1802 * @param full_subject_length length of the full subject sequence
1803 * @param gapping_params parameters used to compute gapped
1804 * alignments
1805 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1806 *
1807 * @returns 0 (posts a fatal error if it fails)
1808 * @sa new_xdrop_align_type
1809 */
1810 static int
s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,Int4 * pqueryEnd,Int4 * pmatchEnd,Int4 queryStart,Int4 matchStart,Int4 score,BlastCompo_SequenceData * query,BlastCompo_SequenceRange * query_range,Int4 ccat_query_length,BlastCompo_SequenceData * subject,BlastCompo_SequenceRange * subject_range,Int4 full_subject_length,BlastCompo_GappingParams * gapping_params,EMatrixAdjustRule matrix_adjust_rule)1811 s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
1812 Int4 * pqueryEnd, Int4 *pmatchEnd,
1813 Int4 queryStart, Int4 matchStart, Int4 score,
1814 BlastCompo_SequenceData * query,
1815 BlastCompo_SequenceRange * query_range,
1816 Int4 ccat_query_length,
1817 BlastCompo_SequenceData * subject,
1818 BlastCompo_SequenceRange * subject_range,
1819 Int4 full_subject_length,
1820 BlastCompo_GappingParams * gapping_params,
1821 EMatrixAdjustRule matrix_adjust_rule)
1822 {
1823 Int4 newScore;
1824 /* Extent of the alignment as computed by an x-drop alignment
1825 * (usually the same as (queryEnd - queryStart) and (matchEnd -
1826 * matchStart)) */
1827 Int4 queryExtent, matchExtent;
1828 BlastCompo_Alignment * obj = NULL; /* the new object */
1829 /* BLAST-specific parameters needed compute an X-drop alignment */
1830 BlastKappa_GappingParamsContext * context = gapping_params->context;
1831 /* Auxiliarly structure for computing gapped alignments */
1832 BlastGapAlignStruct * gap_align = context->gap_align;
1833 /* Scoring parameters for gapped alignments */
1834 const BlastScoringParameters* scoringParams = context->scoringParams;
1835 /* A structure containing the traceback of a gapped alignment */
1836 GapEditScript* editScript = NULL;
1837
1838 /* suppress unused parameter warnings; this is a callback
1839 function, so these parameter cannot be deleted */
1840 (void) ccat_query_length;
1841 (void) full_subject_length;
1842
1843 gap_align->gap_x_dropoff = gapping_params->x_dropoff;
1844
1845 s_SWFindFinalEndsUsingXdrop(query, queryStart, *pqueryEnd,
1846 subject, matchStart, *pmatchEnd,
1847 gap_align, scoringParams,
1848 score, &queryExtent, &matchExtent,
1849 &newScore);
1850 *pqueryEnd = queryStart + queryExtent;
1851 *pmatchEnd = matchStart + matchExtent;
1852
1853 editScript =
1854 Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback,
1855 gap_align->fwd_prelim_tback);
1856 if (editScript != NULL) {
1857 /* Shifted values of the endpoints */
1858 Int4 aqueryStart = queryStart + query_range->begin;
1859 Int4 aqueryEnd = *pqueryEnd + query_range->begin;
1860 Int4 amatchStart = matchStart + subject_range->begin;
1861 Int4 amatchEnd = *pmatchEnd + subject_range->begin;
1862
1863 obj = BlastCompo_AlignmentNew(newScore, matrix_adjust_rule,
1864 aqueryStart, aqueryEnd,
1865 query_range->context,
1866 amatchStart, amatchEnd,
1867 subject_range->context, editScript);
1868 if (obj == NULL) {
1869 GapEditScriptDelete(editScript);
1870 }
1871 }
1872 *pnewAlign = obj;
1873
1874 return obj != NULL ? 0 : -1;
1875 }
1876
1877
1878 /**
1879 * A callback: calculate the traceback for one alignment by
1880 * performing an x-drop alignment in both directions
1881 *
1882 * @param in_align the existing alignment, without traceback
1883 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1884 * @param query_data query sequence data
1885 * @param query_range range of this query in the concatenated
1886 * query
1887 * @param ccat_query_length total length of the concatenated query
1888 * @param subject_data subject sequence data
1889 * @param subject_range range of subject_data in the translated
1890 * query, in amino acid coordinates
1891 * @param full_subject_length length of the full subject sequence
1892 * @param gapping_params parameters used to compute gapped
1893 * alignments
1894 * @sa redo_one_alignment_type
1895 */
1896 static BlastCompo_Alignment *
s_RedoOneAlignment(BlastCompo_Alignment * in_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceData * query_data,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceData * subject_data,BlastCompo_SequenceRange * subject_range,int full_subject_length,BlastCompo_GappingParams * gapping_params)1897 s_RedoOneAlignment(BlastCompo_Alignment * in_align,
1898 EMatrixAdjustRule matrix_adjust_rule,
1899 BlastCompo_SequenceData * query_data,
1900 BlastCompo_SequenceRange * query_range,
1901 int ccat_query_length,
1902 BlastCompo_SequenceData * subject_data,
1903 BlastCompo_SequenceRange * subject_range,
1904 int full_subject_length,
1905 BlastCompo_GappingParams * gapping_params)
1906 {
1907 int status; /* return code */
1908 Int4 q_start, s_start; /* starting point in query and subject */
1909 /* BLAST-specific parameters needed to compute a gapped alignment */
1910 BlastKappa_GappingParamsContext * context = gapping_params->context;
1911 /* Auxiliary structure for computing gapped alignments */
1912 BlastGapAlignStruct* gapAlign = context->gap_align;
1913 /* The preliminary gapped HSP that were are recomputing */
1914 BlastHSP * hsp = in_align->context;
1915 Boolean fence_hit = FALSE;
1916
1917 /* suppress unused parameter warnings; this is a callback
1918 function, so these parameter cannot be deleted */
1919 (void) ccat_query_length;
1920 (void) full_subject_length;
1921
1922 /* Use the starting point supplied by the HSP. */
1923 q_start = hsp->query.gapped_start - query_range->begin;
1924 s_start = hsp->subject.gapped_start - subject_range->begin;
1925
1926 gapAlign->gap_x_dropoff = gapping_params->x_dropoff;
1927
1928 /*
1929 * Previously, last argument was NULL which could cause problems for
1930 * tblastn.
1931 */
1932 status =
1933 BLAST_GappedAlignmentWithTraceback(context->prog_number,
1934 query_data->data,
1935 subject_data->data, gapAlign,
1936 context->scoringParams,
1937 q_start, s_start,
1938 query_data->length,
1939 subject_data->length,
1940 &fence_hit);
1941 if (status == 0) {
1942 return s_NewAlignmentFromGapAlign(gapAlign, &gapAlign->edit_script,
1943 query_range, subject_range,
1944 matrix_adjust_rule);
1945 } else {
1946 return NULL;
1947 }
1948 }
1949
1950
1951 /**
1952 * A BlastKappa_SavedParameters holds the value of certain search
1953 * parameters on entry to RedoAlignmentCore. These values are
1954 * restored on exit.
1955 */
1956 typedef struct BlastKappa_SavedParameters {
1957 Int4 gap_open; /**< a penalty for the existence of a gap */
1958 Int4 gapExtend; /**< a penalty for each residue in the
1959 gap */
1960 double scale_factor; /**< the original scale factor */
1961 Int4 **origMatrix; /**< The original matrix values */
1962 double original_expect_value; /**< expect value on entry */
1963 /** copy of the original gapped Karlin-Altschul block
1964 * corresponding to the first context */
1965 Blast_KarlinBlk** kbp_gap_orig;
1966 Int4 num_queries; /**< Number of queries in this search */
1967 } BlastKappa_SavedParameters;
1968
1969
1970 /**
1971 * Release the data associated with a BlastKappa_SavedParameters and
1972 * delete the object
1973 * @param searchParams the object to be deleted [in][out]
1974 */
1975 static void
s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)1976 s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)
1977 {
1978 /* for convenience, remove one level of indirection from searchParams */
1979 BlastKappa_SavedParameters *sp = *searchParams;
1980
1981 if (sp != NULL) {
1982 if (sp->kbp_gap_orig != NULL) {
1983 int i;
1984 for (i = 0; i < sp->num_queries; i++) {
1985 if (sp->kbp_gap_orig[i] != NULL)
1986 Blast_KarlinBlkFree(sp->kbp_gap_orig[i]);
1987 }
1988 free(sp->kbp_gap_orig);
1989 }
1990 if (sp->origMatrix != NULL)
1991 Nlm_Int4MatrixFree(&sp->origMatrix);
1992 }
1993 sfree(*searchParams);
1994 *searchParams = NULL;
1995 }
1996
1997
1998 /**
1999 * Create a new instance of BlastKappa_SavedParameters
2000 *
2001 * @param rows number of rows in the scoring matrix
2002 * @param numQueries number of queries in this search
2003 * @param compo_adjust_mode if >0, use composition-based statistics
2004 * @param positionBased if true, the search is position-based
2005 */
2006 static BlastKappa_SavedParameters *
s_SavedParametersNew(Int4 rows,Int4 numQueries,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)2007 s_SavedParametersNew(Int4 rows,
2008 Int4 numQueries,
2009 ECompoAdjustModes compo_adjust_mode,
2010 Boolean positionBased)
2011 {
2012 int i;
2013 BlastKappa_SavedParameters *sp; /* the new object */
2014 sp = malloc(sizeof(BlastKappa_SavedParameters));
2015
2016 if (sp == NULL) {
2017 goto error_return;
2018 }
2019 sp->kbp_gap_orig = NULL;
2020 sp->origMatrix = NULL;
2021
2022 sp->kbp_gap_orig = calloc(numQueries, sizeof(Blast_KarlinBlk*));
2023 if (sp->kbp_gap_orig == NULL) {
2024 goto error_return;
2025 }
2026 sp->num_queries = numQueries;
2027 for (i = 0; i < numQueries; i++) {
2028 sp->kbp_gap_orig[i] = NULL;
2029 }
2030 if (compo_adjust_mode != eNoCompositionBasedStats) {
2031 if (positionBased) {
2032 sp->origMatrix = Nlm_Int4MatrixNew(rows, BLASTAA_SIZE);
2033 } else {
2034 sp->origMatrix = Nlm_Int4MatrixNew(BLASTAA_SIZE, BLASTAA_SIZE);
2035 }
2036 if (sp->origMatrix == NULL)
2037 goto error_return;
2038 }
2039 return sp;
2040 error_return:
2041 s_SavedParametersFree(&sp);
2042 return NULL;
2043 }
2044
2045
2046 /**
2047 * Record the initial value of the search parameters that are to be
2048 * adjusted.
2049 *
2050 * @param searchParams holds the recorded values [out]
2051 * @param sbp a score block [in]
2052 * @param scoring gapped alignment parameters [in]
2053 * @param query_length length of the concatenated query [in]
2054 * @param compo_adjust_mode composition adjustment mode [in]
2055 * @param positionBased is this search position-based [in]
2056 */
2057 static int
s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,BlastScoreBlk * sbp,const BlastScoringParameters * scoring,int query_length,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)2058 s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,
2059 BlastScoreBlk* sbp,
2060 const BlastScoringParameters* scoring,
2061 int query_length,
2062 ECompoAdjustModes compo_adjust_mode,
2063 Boolean positionBased)
2064 {
2065 int i;
2066
2067 searchParams->gap_open = scoring->gap_open;
2068 searchParams->gapExtend = scoring->gap_extend;
2069 searchParams->scale_factor = scoring->scale_factor;
2070
2071 for (i = 0; i < searchParams->num_queries; i++) {
2072 if (sbp->kbp_gap[i] != NULL) {
2073 /* There is a kbp_gap for query i and it must be copied */
2074 searchParams->kbp_gap_orig[i] = Blast_KarlinBlkNew();
2075 if (searchParams->kbp_gap_orig[i] == NULL) {
2076 return -1;
2077 }
2078 Blast_KarlinBlkCopy(searchParams->kbp_gap_orig[i],
2079 sbp->kbp_gap[i]);
2080 }
2081 }
2082
2083 if (compo_adjust_mode != eNoCompositionBasedStats) {
2084 Int4 **matrix; /* scoring matrix */
2085 int j; /* iteration index */
2086 int rows; /* number of rows in matrix */
2087 if (positionBased) {
2088 matrix = sbp->psi_matrix->pssm->data;
2089 rows = query_length;
2090 } else {
2091 matrix = sbp->matrix->data;
2092 rows = BLASTAA_SIZE;
2093 }
2094
2095 for (i = 0; i < rows; i++) {
2096 for (j = 0; j < BLASTAA_SIZE; j++) {
2097 searchParams->origMatrix[i][j] = matrix[i][j];
2098 }
2099 }
2100 }
2101 return 0;
2102 }
2103
2104
2105 /**
2106 * Rescale the search parameters in the search object and options
2107 * object to obtain more precision.
2108 *
2109 * @param sbp score block to be rescaled
2110 * @param sp scoring parameters to be rescaled
2111 * @param num_queries number of queries in this search
2112 * @param scale_factor amount by which to scale this search
2113 */
2114 static void
s_RescaleSearch(BlastScoreBlk * sbp,BlastScoringParameters * sp,int num_queries,double scale_factor)2115 s_RescaleSearch(BlastScoreBlk* sbp,
2116 BlastScoringParameters* sp,
2117 int num_queries,
2118 double scale_factor)
2119 {
2120 int i;
2121 for (i = 0; i < num_queries; i++) {
2122 if (sbp->kbp_gap[i] != NULL) {
2123 Blast_KarlinBlk * kbp = sbp->kbp_gap[i];
2124 kbp->Lambda /= scale_factor;
2125 kbp->logK = log(kbp->K);
2126 }
2127 }
2128
2129 sp->gap_open = BLAST_Nint(sp->gap_open * scale_factor);
2130 sp->gap_extend = BLAST_Nint(sp->gap_extend * scale_factor);
2131 sp->scale_factor = scale_factor;
2132 }
2133
2134
2135 /**
2136 * Restore the parameters that were adjusted to their original values.
2137 *
2138 * @param sbp the score block to be restored
2139 * @param scoring the scoring parameters to be restored
2140 * @param searchParams the initial recorded values of the parameters
2141 * @param query_length the concatenated query length
2142 * @param positionBased is this search position-based
2143 * @param compo_adjust_mode mode of composition adjustment
2144 */
2145 static void
s_RestoreSearch(BlastScoreBlk * sbp,BlastScoringParameters * scoring,const BlastKappa_SavedParameters * searchParams,int query_length,Boolean positionBased,ECompoAdjustModes compo_adjust_mode)2146 s_RestoreSearch(BlastScoreBlk* sbp,
2147 BlastScoringParameters* scoring,
2148 const BlastKappa_SavedParameters * searchParams,
2149 int query_length,
2150 Boolean positionBased,
2151 ECompoAdjustModes compo_adjust_mode)
2152 {
2153 int i;
2154
2155 scoring->gap_open = searchParams->gap_open;
2156 scoring->gap_extend = searchParams->gapExtend;
2157 scoring->scale_factor = searchParams->scale_factor;
2158
2159 for (i = 0; i < searchParams->num_queries; i++) {
2160 if (sbp->kbp_gap[i] != NULL) {
2161 Blast_KarlinBlkCopy(sbp->kbp_gap[i],
2162 searchParams->kbp_gap_orig[i]);
2163 }
2164 }
2165 if(compo_adjust_mode != eNoCompositionBasedStats) {
2166 int j; /* iteration index */
2167 Int4 ** matrix; /* matrix to be restored */
2168 int rows; /* number of rows in the matrix */
2169
2170 if (positionBased) {
2171 matrix = sbp->psi_matrix->pssm->data;
2172 rows = query_length;
2173 } else {
2174 matrix = sbp->matrix->data;
2175 rows = BLASTAA_SIZE;
2176 }
2177 for (i = 0; i < rows; i++) {
2178 for (j = 0; j < BLASTAA_SIZE; j++) {
2179 matrix[i][j] = searchParams->origMatrix[i][j];
2180 }
2181 }
2182 }
2183 }
2184
2185
2186 /**
2187 * Initialize an object of type Blast_MatrixInfo.
2188 *
2189 * @param self object being initialized
2190 * @param queryBlk the query sequence data
2191 * @param sbp score block for this search
2192 * @param scale_factor amount by which ungapped parameters should be
2193 * scaled
2194 * @param matrixName name of the matrix
2195 */
2196 static int
s_MatrixInfoInit(Blast_MatrixInfo * self,BLAST_SequenceBlk * queryBlk,BlastScoreBlk * sbp,double scale_factor,const char * matrixName)2197 s_MatrixInfoInit(Blast_MatrixInfo * self,
2198 BLAST_SequenceBlk* queryBlk,
2199 BlastScoreBlk* sbp,
2200 double scale_factor,
2201 const char * matrixName)
2202 {
2203 int status = 0; /* return status */
2204 int lenName; /* length of matrixName as a string */
2205
2206 /* copy the matrix name (strdup is not standard C) */
2207 lenName = strlen(matrixName);
2208 if (NULL == (self->matrixName = malloc(lenName + 1))) {
2209 return -1;
2210 }
2211 memcpy(self->matrixName, matrixName, lenName + 1);
2212
2213 if (self->positionBased) {
2214 status = s_GetPosBasedStartFreqRatios(self->startFreqRatios,
2215 queryBlk->length,
2216 queryBlk->sequence,
2217 matrixName,
2218 sbp->psi_matrix->freq_ratios);
2219 if (status == 0) {
2220 status = s_ScalePosMatrix(self->startMatrix, matrixName,
2221 sbp->psi_matrix->freq_ratios,
2222 queryBlk->sequence,
2223 queryBlk->length, sbp, scale_factor);
2224 self->ungappedLambda = sbp->kbp_psi[0]->Lambda / scale_factor;
2225 }
2226 } else {
2227 self->ungappedLambda = sbp->kbp_ideal->Lambda / scale_factor;
2228 status = s_GetStartFreqRatios(self->startFreqRatios, matrixName);
2229 if (status == 0) {
2230 Blast_Int4MatrixFromFreq(self->startMatrix, self->cols,
2231 self->startFreqRatios,
2232 self->ungappedLambda);
2233 }
2234 }
2235 return status;
2236 }
2237
2238
2239 /* Create an array of 8-mers for a sequence, such that index of each 8-mer
2240 is the same as its position in the query */
2241 static int
s_CreateWordArray(const Uint1 * seq_data,Int4 seq_len,Uint8 ** words)2242 s_CreateWordArray(const Uint1* seq_data, Int4 seq_len, Uint8** words)
2243 {
2244 int word_size = 8; /* word size for k-mer matching */
2245 Uint8* query_hashes; /* list of hashes for query words */
2246 Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
2247 values */
2248 int i;
2249
2250 /* if query or subject length is smaller than word size, exit */
2251 if (!seq_data || !words || seq_len < word_size) {
2252 return -1;
2253 }
2254
2255 query_hashes = (Uint8*)calloc((seq_len - word_size + 1),
2256 sizeof(Uint8));
2257 *words = query_hashes;
2258
2259 if (!query_hashes) {
2260 return -1;
2261 }
2262
2263
2264 /* find query word hashes */
2265 query_hashes[0] = s_GetHash(&seq_data[0], word_size);
2266 for (i = 1; i < seq_len - word_size; i++) {
2267 query_hashes[i] = query_hashes[i - 1];
2268 query_hashes[i] <<= 5;
2269 query_hashes[i] &= mask;
2270 query_hashes[i] += (Uint8)seq_data[i + word_size - 1];
2271 }
2272
2273 return 0;
2274 }
2275
2276
s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo ** query_info,int num_queries)2277 static void s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo** query_info,
2278 int num_queries)
2279 {
2280 int i;
2281
2282 if (!query_info) {
2283 return;
2284 }
2285
2286 for (i = 0;i < num_queries;i++) {
2287 if ((*query_info)[i].words) {
2288 free((*query_info)[i].words);
2289 }
2290 }
2291
2292 free(*query_info);
2293 *query_info = NULL;
2294 }
2295
2296 /**
2297 * Save information about all queries in an array of objects of type
2298 * BlastCompo_QueryInfo.
2299 *
2300 * @param query_data query sequence data
2301 * @param blast_query_info information about all queries, as an
2302 * internal blast data structure
2303 *
2304 * @return the new array on success, or NULL on error
2305 */
2306 static BlastCompo_QueryInfo *
s_GetQueryInfo(Uint1 * query_data,const BlastQueryInfo * blast_query_info,Boolean skip)2307 s_GetQueryInfo(Uint1 * query_data, const BlastQueryInfo * blast_query_info, Boolean skip)
2308 {
2309 int i; /* loop index */
2310 BlastCompo_QueryInfo *
2311 compo_query_info; /* the new array */
2312 int num_queries; /* the number of queries/elements in
2313 compo_query_info */
2314
2315 num_queries = blast_query_info->last_context + 1;
2316 compo_query_info = calloc(num_queries, sizeof(BlastCompo_QueryInfo));
2317 if (compo_query_info != NULL) {
2318 for (i = 0; i < num_queries; i++) {
2319 BlastCompo_QueryInfo * query_info = &compo_query_info[i];
2320 const BlastContextInfo * query_context = &blast_query_info->contexts[i];
2321
2322 query_info->eff_search_space =
2323 (double) query_context->eff_searchsp;
2324 query_info->origin = query_context->query_offset;
2325 query_info->seq.data = &query_data[query_info->origin];
2326 query_info->seq.length = query_context->query_length;
2327 query_info->words = NULL;
2328
2329 s_CreateWordArray(query_info->seq.data, query_info->seq.length,
2330 &query_info->words);
2331 if (! skip) {
2332 Blast_ReadAaComposition(&query_info->composition, BLASTAA_SIZE,
2333 query_info->seq.data,
2334 query_info->seq.length);
2335 }
2336 }
2337 }
2338 return compo_query_info;
2339 }
2340
2341
2342 /**
2343 * Create a new object of type BlastCompo_GappingParams. The new
2344 * object contains the parameters needed by the composition adjustment
2345 * library to compute a gapped alignment.
2346 *
2347 * @param context the data structures needed by callback functions
2348 * that perform the gapped alignments.
2349 * @param extendParams parameters used for a gapped extension
2350 * @param num_queries the number of queries in the concatenated query
2351 */
2352 static BlastCompo_GappingParams *
s_GappingParamsNew(BlastKappa_GappingParamsContext * context,const BlastExtensionParameters * extendParams,int num_queries)2353 s_GappingParamsNew(BlastKappa_GappingParamsContext * context,
2354 const BlastExtensionParameters* extendParams,
2355 int num_queries)
2356 {
2357 int i;
2358 double min_lambda = DBL_MAX; /* smallest gapped Lambda */
2359 const BlastScoringParameters * scoring = context->scoringParams;
2360 const BlastExtensionOptions * options = extendParams->options;
2361 /* The new object */
2362 BlastCompo_GappingParams * gapping_params = NULL;
2363
2364 gapping_params = malloc(sizeof(BlastCompo_GappingParams));
2365 if (gapping_params == NULL)
2366 return NULL;
2367
2368 gapping_params->gap_open = scoring->gap_open;
2369 gapping_params->gap_extend = scoring->gap_extend;
2370 gapping_params->context = context;
2371
2372 for (i = 0; i < num_queries; i++) {
2373 if (context->sbp->kbp_gap[i] != NULL &&
2374 context->sbp->kbp_gap[i]->Lambda < min_lambda) {
2375 min_lambda = context->sbp->kbp_gap[i]->Lambda;
2376 }
2377 }
2378 gapping_params->x_dropoff = (Int4)
2379 MAX(options->gap_x_dropoff_final*NCBIMATH_LN2 / min_lambda,
2380 extendParams->gap_x_dropoff_final);
2381 context->gap_align->gap_x_dropoff = gapping_params->x_dropoff;
2382
2383 return gapping_params;
2384 }
2385
2386
2387 /** Callbacks used by the Blast_RedoOneMatch* routines */
2388 static const Blast_RedoAlignCallbacks
2389 redo_align_callbacks = {
2390 s_CalcLambda, s_SequenceGetRange, s_RedoOneAlignment,
2391 s_NewAlignmentUsingXdrop, s_FreeEditScript
2392 };
2393
2394
2395 /* Bit score per alignment position threshold for preliminaru near identical
2396 test */
2397 #define NEAR_IDENTICAL_BITS_PER_POSITION (1.74)
2398
2399 /**
2400 * Read the parameters required for the Blast_RedoOneMatch* functions from
2401 * the corresponding parameters in standard BLAST datatypes. Return a new
2402 * object representing these parameters.
2403 */
2404 static Blast_RedoAlignParams *
s_GetAlignParams(BlastKappa_GappingParamsContext * context,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,const BlastHitSavingParameters * hitParams,const BlastExtensionParameters * extendParams)2405 s_GetAlignParams(BlastKappa_GappingParamsContext * context,
2406 BLAST_SequenceBlk * queryBlk,
2407 const BlastQueryInfo* queryInfo,
2408 const BlastHitSavingParameters* hitParams,
2409 const BlastExtensionParameters* extendParams)
2410 {
2411 int status = 0; /* status code */
2412 int rows; /* number of rows in the scoring matrix */
2413 int cutoff_s; /* cutoff score for saving an alignment */
2414 double cutoff_e; /* cutoff evalue for saving an alignment */
2415 BlastCompo_GappingParams *
2416 gapping_params = NULL; /* parameters needed to compute a gapped
2417 alignment */
2418 Blast_MatrixInfo *
2419 scaledMatrixInfo; /* information about the scoring matrix */
2420 /* does this kind of search translate the database sequence */
2421 int subject_is_translated = (context->prog_number == eBlastTypeTblastn) || (context->prog_number == eBlastTypeRpsTblastn);
2422 int query_is_translated = context->prog_number == eBlastTypeBlastx;
2423 /* is this a positiion-based search */
2424 Boolean positionBased = (Boolean) (context->sbp->psi_matrix != NULL);
2425 /* will BLAST_LinkHsps be called to assign e-values */
2426 Boolean do_link_hsps = (hitParams->do_sum_stats);
2427 ECompoAdjustModes compo_adjust_mode =
2428 (ECompoAdjustModes) extendParams->options->compositionBasedStats;
2429
2430 /* per position bit score cutoff for testing whether sequences are
2431 near identical */
2432 double near_identical_cutoff_bits = NEAR_IDENTICAL_BITS_PER_POSITION;
2433
2434 /* score block is already scaled by context->localScalingFactor */
2435 double near_identical_cutoff=0;
2436 Int4 index;
2437 for (index = queryInfo->first_context;
2438 index <= queryInfo->last_context; ++index) {
2439
2440 if ((queryInfo->contexts[index].is_valid)) {
2441 near_identical_cutoff =
2442 (near_identical_cutoff_bits * NCBIMATH_LN2)
2443 / context->sbp->kbp_gap[index]->Lambda;
2444 break;
2445 }
2446 }
2447
2448 if (do_link_hsps) {
2449 ASSERT(hitParams->link_hsp_params != NULL);
2450 cutoff_s =
2451 (int) (hitParams->cutoff_score_min * context->localScalingFactor);
2452 } else {
2453 /* There is no cutoff score; we consider e-values instead */
2454 cutoff_s = 1;
2455 }
2456 cutoff_e = hitParams->options->expect_value;
2457 rows = positionBased ? queryInfo->max_length : BLASTAA_SIZE;
2458 scaledMatrixInfo = Blast_MatrixInfoNew(rows, BLASTAA_SIZE, positionBased);
2459 status = s_MatrixInfoInit(scaledMatrixInfo, queryBlk, context->sbp,
2460 context->localScalingFactor,
2461 context->scoringParams->options->matrix);
2462 if (status != 0) {
2463 return NULL;
2464 }
2465 gapping_params = s_GappingParamsNew(context, extendParams,
2466 queryInfo->last_context + 1);
2467 if (gapping_params == NULL) {
2468 return NULL;
2469 } else {
2470 return
2471 Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
2472 compo_adjust_mode, positionBased,
2473 query_is_translated,
2474 subject_is_translated,
2475 queryInfo->max_length, cutoff_s, cutoff_e,
2476 do_link_hsps, &redo_align_callbacks,
2477 near_identical_cutoff);
2478 }
2479 }
2480
2481
2482 /**
2483 * Convert an array of BlastCompo_Heap objects to a BlastHSPResults structure.
2484 *
2485 * @param results BLAST core external results structure (pre-SeqAlign)
2486 * [out]
2487 * @param heaps an array of BlastCompo_Heap objects
2488 * @param hitlist_size size of each list in the results structure above [in]
2489 */
2490 static void
s_FillResultsFromCompoHeaps(BlastHSPResults * results,BlastCompo_Heap heaps[],Int4 hitlist_size)2491 s_FillResultsFromCompoHeaps(BlastHSPResults * results,
2492 BlastCompo_Heap heaps[],
2493 Int4 hitlist_size)
2494 {
2495 int query_index; /* loop index */
2496 int num_queries; /* Number of queries in this search */
2497
2498 num_queries = results->num_queries;
2499 for (query_index = 0; query_index < num_queries; query_index++) {
2500 BlastHSPList* hsp_list;
2501 BlastHitList* hitlist;
2502 BlastCompo_Heap * heap = &heaps[query_index];
2503
2504 results->hitlist_array[query_index] = Blast_HitListNew(hitlist_size);
2505 hitlist = results->hitlist_array[query_index];
2506
2507 while (NULL != (hsp_list = BlastCompo_HeapPop(heap))) {
2508 Blast_HitListUpdate(hitlist, hsp_list);
2509 }
2510 }
2511 Blast_HSPResultsReverseOrder(results);
2512 }
2513
2514
2515 /** Remove all matches from a BlastCompo_Heap. */
s_ClearHeap(BlastCompo_Heap * self)2516 static void s_ClearHeap(BlastCompo_Heap * self)
2517 {
2518 BlastHSPList* hsp_list = NULL; /* an element of the heap */
2519
2520 while (NULL != (hsp_list = BlastCompo_HeapPop(self))) {
2521 hsp_list = Blast_HSPListFree(hsp_list);
2522 }
2523 }
2524
2525 /**
2526 * Free a BlastGapAlignStruct copy created by s_BlastGapAlignStruct_Copy
2527 *
2528 * @param copy Pointer to BlastGapAlignStruct to be freed
2529 */
s_BlastGapAlignStruct_Free(BlastGapAlignStruct * copy)2530 static void s_BlastGapAlignStruct_Free(BlastGapAlignStruct* copy)
2531 {
2532 {
2533 while (copy->state_struct != NULL) {
2534 GapStateArrayStruct* cur = copy->state_struct;
2535 copy->state_struct = copy->state_struct->next;
2536 if (cur->state_array) {
2537 sfree(cur->state_array);
2538 }
2539 if (cur) {
2540 sfree(cur);
2541 }
2542 }
2543 }
2544 {
2545 if (copy->edit_script != NULL) {
2546 if (copy->edit_script->op_type) {
2547 sfree(copy->edit_script->op_type);
2548 }
2549 if (copy->edit_script->num) {
2550 sfree(copy->edit_script->num);
2551 }
2552 sfree(copy->edit_script);
2553 }
2554 }
2555 {
2556 if (copy->fwd_prelim_tback != NULL) {
2557 if (copy->fwd_prelim_tback->edit_ops) {
2558 sfree(copy->fwd_prelim_tback->edit_ops);
2559 }
2560 sfree(copy->fwd_prelim_tback);
2561 }
2562 }
2563 {
2564 if (copy->rev_prelim_tback != NULL) {
2565 if (copy->rev_prelim_tback->edit_ops) {
2566 sfree(copy->rev_prelim_tback->edit_ops);
2567 }
2568 sfree(copy->rev_prelim_tback);
2569 }
2570 }
2571 {
2572 if (copy->greedy_align_mem != NULL) {
2573 sfree(copy->greedy_align_mem);
2574 }
2575 }
2576 {
2577 if (copy->dp_mem != NULL) {
2578 sfree(copy->dp_mem);
2579 }
2580 }
2581 {
2582 if (copy->sbp != NULL) {
2583 sfree(copy->sbp);
2584 }
2585 }
2586 sfree(copy);
2587 }
2588
2589 /**
2590 * Create a "deep" copy of a BlastGapAlignStruct structure.
2591 *
2592 * Non-pointer structure members are copied. Pointers to data which will
2593 * only be read are copied. For data which will be changing, memory for copies
2594 * will be allocated and new pointers will be assigned to them. The process
2595 * repeats down the structure hierarchy until all pointers are dealt with.
2596 *
2597 * @param orig Pointer to BlastGapAlignStruct structure to be copied
2598 * @param sbp Pointer to BlastScoreBlk structure, required to set copy->sbp
2599 *
2600 * @return Pointer to copy of original BlastGapAlignStruct structure
2601 */
s_BlastGapAlignStruct_Copy(BlastGapAlignStruct * orig,BlastScoreBlk * sbp)2602 static BlastGapAlignStruct* s_BlastGapAlignStruct_Copy(
2603 BlastGapAlignStruct* orig,
2604 BlastScoreBlk* sbp
2605 )
2606 {
2607 BlastGapAlignStruct* copy =
2608 (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
2609
2610 // Copy plain old data (ints, doubles, booleans, ...).
2611 // Any pointer members will be processed separately.
2612 memcpy(copy, orig, sizeof(BlastGapAlignStruct));
2613
2614 {
2615 GapStateArrayStruct* o = orig->state_struct;
2616 if (o != NULL) {
2617 GapStateArrayStruct* c = (GapStateArrayStruct*) calloc(
2618 1,
2619 sizeof(GapStateArrayStruct)
2620 );
2621 copy->state_struct = c;
2622 memcpy(c, o, sizeof(GapStateArrayStruct));
2623 c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2624 int i;
2625 for (i = 0; i < c->length; ++i) {
2626 c->state_array[i] = o->state_array[i];
2627 }
2628 while (o->next != NULL) {
2629 c->next = (GapStateArrayStruct*)
2630 calloc(1, sizeof(GapStateArrayStruct));
2631 c = c->next;
2632 o = o->next;
2633 memcpy(c, o, sizeof(GapStateArrayStruct));
2634 c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2635 int i;
2636 for (i = 0; i < c->length; ++i) {
2637 c->state_array[i] = o->state_array[i];
2638 }
2639 }
2640 }
2641 }
2642 {
2643 GapEditScript* o = orig->edit_script;
2644 if (o != NULL) {
2645 GapEditScript* c = (GapEditScript*) calloc(
2646 1,
2647 sizeof(GapEditScript)
2648 );
2649 copy->edit_script = c;
2650 memcpy(c, o, sizeof(GapEditScript));
2651 c->op_type = (EGapAlignOpType*) calloc(
2652 o->size,
2653 sizeof(EGapAlignOpType)
2654 );
2655 c->num = (Int4*) calloc(o->size, sizeof(Int4));
2656 int i;
2657 for (i = 0; i < o->size; ++i) {
2658 c->op_type[i] = o->op_type[i];
2659 c->num[i] = o->num[i];
2660 }
2661 }
2662 }
2663 {
2664 GapPrelimEditBlock* o = orig->fwd_prelim_tback;
2665 if (o != NULL) {
2666 GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2667 1,
2668 sizeof(GapPrelimEditBlock)
2669 );
2670 copy->fwd_prelim_tback = c;
2671 memcpy(c, o, sizeof(GapPrelimEditBlock));
2672 c->edit_ops = calloc(
2673 o->num_ops_allocated,
2674 sizeof(GapPrelimEditScript)
2675 );
2676 int i;
2677 for (i = 0; i < o->num_ops_allocated; ++i) {
2678 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2679 c->edit_ops[i].num = o->edit_ops[i].num;
2680 }
2681 }
2682 }
2683 {
2684 GapPrelimEditBlock* o = orig->rev_prelim_tback;
2685 if (o != NULL) {
2686 GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2687 1,
2688 sizeof(GapPrelimEditBlock)
2689 );
2690 copy->rev_prelim_tback = c;
2691 memcpy(c, o, sizeof(GapPrelimEditBlock));
2692 c->edit_ops = calloc(
2693 o->num_ops_allocated,
2694 sizeof(GapPrelimEditScript)
2695 );
2696 int i;
2697 for (i = 0; i < o->num_ops_allocated; ++i) {
2698 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2699 c->edit_ops[i].num = o->edit_ops[i].num;
2700 }
2701 }
2702 }
2703 {
2704 SGreedyAlignMem* o = orig->greedy_align_mem;
2705 if (o != NULL) {
2706 SGreedyAlignMem* c = (SGreedyAlignMem*) calloc(
2707 1,
2708 sizeof(SGreedyAlignMem)
2709 );
2710 copy->greedy_align_mem = c;
2711 memcpy(c, o, sizeof(SGreedyAlignMem));
2712 }
2713 }
2714 {
2715 BlastGapDP* o = orig->dp_mem;
2716 if (o != NULL) {
2717 BlastGapDP* c = (BlastGapDP*) calloc(
2718 orig->dp_mem_alloc,
2719 sizeof(BlastGapDP)
2720 );
2721 copy->dp_mem = c;
2722 memcpy(c, o, orig->dp_mem_alloc * sizeof(BlastGapDP));
2723 }
2724 }
2725 {
2726 copy->sbp = sbp;
2727 }
2728
2729 return copy;
2730 }
2731
2732 /**
2733 * Free a BlastScoreBlk copy created by s_BlastScoreBlk_Copy
2734 *
2735 * BlastScoreBlk* pointer "bsb_ptr" should be passed as (&bsb_ptr);
2736 * this function will set bsb_ptr to NULL before returning.
2737 *
2738 * @param copy Pointer to (pointer to BlastScoreBlk to be freed)
2739 */
2740 static
s_BlastScoreBlk_Free(BlastScoreBlk ** copy)2741 void s_BlastScoreBlk_Free(BlastScoreBlk** copy)
2742 {
2743 BlastScoreBlkFree(*copy);
2744 *copy = NULL;
2745 }
2746
2747 /**
2748 * Create a "deep" copy of a BlastScoreBlk structure.
2749 *
2750 * Non-pointer structure members are copied. Pointers to data which will
2751 * only be read are copied. For data which will be changing, memory for copies
2752 * will be allocated and new pointers will be assigned to them. The process
2753 * repeats down the structure hierarchy until all pointers are dealt with.
2754 *
2755 * @param program The program type
2756 * @param orig Pointer to BlastScoreBlk structure to be copied
2757 * @param alphabet_code Alphabet code
2758 * @param number_of_contexts Number of contexts
2759 *
2760 * @return Pointer to copy of original BlastScoreBlk structure
2761 */
2762 static
s_BlastScoreBlk_Copy(EBlastProgramType program,BlastScoreBlk * orig,Uint1 alphabet_code,Int4 number_of_contexts)2763 BlastScoreBlk* s_BlastScoreBlk_Copy(
2764 EBlastProgramType program,
2765 BlastScoreBlk* orig,
2766 Uint1 alphabet_code,
2767 Int4 number_of_contexts
2768 )
2769 {
2770 BlastScoreBlk* copy = BlastScoreBlkNew(
2771 orig->alphabet_code,
2772 orig->number_of_contexts
2773 );
2774 if (copy == NULL) {
2775 return NULL;
2776 }
2777
2778 copy->alphabet_start = orig->alphabet_start;
2779 copy->name = strdup(orig->name);
2780 copy->comments = orig->comments;
2781 /* Deep-copy orig->matrix */
2782 if (orig->matrix != NULL) {
2783 if (copy->matrix == NULL) {
2784 return BlastScoreBlkFree(copy);
2785 }
2786 SBlastScoreMatrix* m = copy->matrix;
2787 if (m->data != NULL && orig->matrix->data != NULL) {
2788 int i;
2789 for (i = 0; i < orig->matrix->ncols; ++i) {
2790 memcpy(
2791 m->data[i],
2792 orig->matrix->data[i],
2793 m->nrows * sizeof(int)
2794 );
2795 }
2796 }
2797 if (m->freqs != NULL && orig->matrix->freqs != NULL) {
2798 memcpy(
2799 m->freqs,
2800 orig->matrix->freqs,
2801 m->ncols * sizeof(double)
2802 );
2803 }
2804 m->lambda = orig->matrix->lambda;
2805 }
2806 /* Deep-copy orig->psi_matrix */
2807 if (orig->psi_matrix != NULL
2808 && orig->psi_matrix->pssm != NULL) {
2809 copy->psi_matrix = SPsiBlastScoreMatrixNew(orig->psi_matrix->pssm->ncols);
2810 if (copy->psi_matrix == NULL) {
2811 return BlastScoreBlkFree(copy);
2812 }
2813 SPsiBlastScoreMatrix* pm = copy->psi_matrix;
2814 SBlastScoreMatrix* m = pm->pssm;
2815 if (m->data != NULL && orig->psi_matrix->pssm->data != NULL) {
2816 int i;
2817 for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2818 memcpy(
2819 m->data[i],
2820 orig->psi_matrix->pssm->data[i],
2821 m->nrows * sizeof(int)
2822 );
2823 }
2824 }
2825 if (m->freqs != NULL
2826 && orig->psi_matrix->pssm->freqs != NULL) {
2827 memcpy(
2828 m->freqs,
2829 orig->psi_matrix->pssm->freqs,
2830 m->ncols * sizeof(double)
2831 );
2832 }
2833 m->lambda = orig->psi_matrix->pssm->lambda;
2834 if (pm->freq_ratios != NULL
2835 && orig->psi_matrix->freq_ratios != NULL) {
2836 int i;
2837 for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2838 memcpy(
2839 pm->freq_ratios[i],
2840 orig->psi_matrix->freq_ratios[i],
2841 orig->psi_matrix->pssm->nrows * sizeof(double)
2842 );
2843 }
2844 }
2845 if (orig->psi_matrix->kbp != NULL) {
2846 memcpy(pm->kbp, orig->psi_matrix->kbp, sizeof(Blast_KarlinBlk));
2847 }
2848 }
2849 copy->matrix_only_scoring = orig->matrix_only_scoring;
2850 copy->complexity_adjusted_scoring = orig->complexity_adjusted_scoring;
2851 copy->loscore = orig->loscore;
2852 copy->hiscore = orig->hiscore;
2853 copy->penalty = orig->penalty;
2854 copy->reward = orig->reward;
2855 copy->read_in_matrix = orig->read_in_matrix;
2856 if (Blast_QueryIsPssm(program)) {
2857 copy->kbp = copy->kbp_psi;
2858 copy->kbp_gap = copy->kbp_gap_psi;
2859 } else {
2860 copy->kbp = copy->kbp_std;
2861 copy->kbp_gap = copy->kbp_gap_std;
2862 }
2863 if (orig->gbp != NULL) {
2864 memcpy(copy->gbp, orig->gbp, sizeof(Blast_GumbelBlk));
2865 }
2866 int ctx;
2867 for (ctx = 0; ctx < orig->number_of_contexts; ++ctx) {
2868 if (orig->sfp != NULL && orig->sfp[ctx] != NULL) {
2869 copy->sfp[ctx] = Blast_ScoreFreqNew(
2870 orig->sfp[ctx]->score_min,
2871 orig->sfp[ctx]->score_max
2872 );
2873 if (copy->sfp[ctx] == NULL) {
2874 return BlastScoreBlkFree(copy);
2875 }
2876 copy->sfp[ctx]->obs_min = orig->sfp[ctx]->obs_min;
2877 copy->sfp[ctx]->obs_max = orig->sfp[ctx]->obs_max;
2878 copy->sfp[ctx]->score_avg = orig->sfp[ctx]->score_avg;
2879 int r = orig->sfp[ctx]->score_max - orig->sfp[ctx]->score_min + 1;
2880 memcpy(
2881 copy->sfp[ctx]->sprob0,
2882 orig->sfp[ctx]->sprob0,
2883 r * sizeof(double)
2884 );
2885 }
2886 if (orig->kbp_std != NULL && orig->kbp_std[ctx] != NULL) {
2887 copy->kbp_std[ctx] = Blast_KarlinBlkNew();
2888 if (Blast_KarlinBlkCopy(copy->kbp_std[ctx], orig->kbp_std[ctx]) != 0) {
2889 return BlastScoreBlkFree(copy);
2890 }
2891 }
2892 if (orig->kbp_gap_std != NULL && orig->kbp_gap_std[ctx] != NULL) {
2893 copy->kbp_gap_std[ctx] = Blast_KarlinBlkNew();
2894 if (Blast_KarlinBlkCopy(copy->kbp_gap_std[ctx], orig->kbp_gap_std[ctx]) != 0) {
2895 return BlastScoreBlkFree(copy);
2896 }
2897 }
2898 if (orig->kbp_psi != NULL && orig->kbp_psi[ctx] != NULL) {
2899 copy->kbp_psi[ctx] = Blast_KarlinBlkNew();
2900 if (Blast_KarlinBlkCopy(copy->kbp_psi[ctx], orig->kbp_psi[ctx]) != 0) {
2901 return BlastScoreBlkFree(copy);
2902 }
2903 }
2904 if (orig->kbp_gap_psi != NULL && orig->kbp_gap_psi[ctx] != NULL) {
2905 copy->kbp_gap_psi[ctx] = Blast_KarlinBlkNew();
2906 if (Blast_KarlinBlkCopy(copy->kbp_gap_psi[ctx], orig->kbp_gap_psi[ctx]) != 0) {
2907 return BlastScoreBlkFree(copy);
2908 }
2909 }
2910 if (Blast_QueryIsPssm(program)) {
2911 copy->kbp[ctx] = copy->kbp_psi[ctx];
2912 copy->kbp_gap[ctx] = copy->kbp_gap_psi[ctx];
2913 } else {
2914 copy->kbp[ctx] = copy->kbp_std[ctx];
2915 copy->kbp_gap[ctx] = copy->kbp_gap_std[ctx];
2916 }
2917 }
2918 if (orig->kbp_ideal != NULL) {
2919 copy->kbp_ideal = Blast_KarlinBlkNew();
2920 if (Blast_KarlinBlkCopy(copy->kbp_ideal, orig->kbp_ideal) != 0) {
2921 return BlastScoreBlkFree(copy);
2922 }
2923 }
2924 copy->ambiguous_res = (Uint1*) calloc(orig->ambig_size, sizeof(Uint1));
2925 if (orig->ambiguous_res != NULL) {
2926 memcpy(copy->ambiguous_res, orig->ambiguous_res, orig->ambig_size);
2927 }
2928 copy->ambig_size = orig->ambig_size;
2929 copy->ambig_occupy = orig->ambig_occupy;
2930 copy->round_down = orig->round_down;
2931
2932 return copy;
2933 }
2934
2935 /**
2936 * Recompute alignments for each match found by the gapped BLAST
2937 * algorithm. Single-thread adapter to Blast_RedoAlignmentCore_MT.
2938 */
2939 Int2
Blast_RedoAlignmentCore(EBlastProgramType program_number,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2940 Blast_RedoAlignmentCore(EBlastProgramType program_number,
2941 BLAST_SequenceBlk * queryBlk,
2942 const BlastQueryInfo* queryInfo,
2943 BlastScoreBlk* sbp,
2944 BLAST_SequenceBlk * subjectBlk,
2945 const BlastSeqSrc* seqSrc,
2946 Int4 default_db_genetic_code,
2947 BlastHSPList * thisMatch,
2948 BlastHSPStream* hsp_stream,
2949 BlastScoringParameters* scoringParams,
2950 const BlastExtensionParameters* extendParams,
2951 const BlastHitSavingParameters* hitParams,
2952 const PSIBlastOptions* psiOptions,
2953 BlastHSPResults* results)
2954 {
2955 return Blast_RedoAlignmentCore_MT(
2956 program_number,
2957 1, /* number of threads */
2958 queryBlk,
2959 queryInfo,
2960 sbp,
2961 subjectBlk,
2962 seqSrc,
2963 default_db_genetic_code,
2964 thisMatch,
2965 hsp_stream,
2966 scoringParams,
2967 extendParams,
2968 hitParams,
2969 psiOptions,
2970 results
2971 );
2972 }
2973
2974 /**
2975 * Recompute alignments for each match found by the gapped BLAST
2976 * algorithm.
2977 */
2978 Int2
Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,Uint4 num_threads,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2979 Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,
2980 Uint4 num_threads,
2981 BLAST_SequenceBlk * queryBlk,
2982 const BlastQueryInfo* queryInfo,
2983 BlastScoreBlk* sbp,
2984 BLAST_SequenceBlk * subjectBlk,
2985 const BlastSeqSrc* seqSrc,
2986 Int4 default_db_genetic_code,
2987 BlastHSPList * thisMatch,
2988 BlastHSPStream* hsp_stream,
2989 BlastScoringParameters* scoringParams,
2990 const BlastExtensionParameters* extendParams,
2991 const BlastHitSavingParameters* hitParams,
2992 const PSIBlastOptions* psiOptions,
2993 BlastHSPResults* results)
2994 {
2995 int status_code = 0; /* return value code */
2996 /* the factor by which to scale the scoring system in order to
2997 * obtain greater precision */
2998 double localScalingFactor;
2999 /* forbidden ranges for each database position (used in
3000 * Smith-Waterman alignments) */
3001 Blast_ForbiddenRanges forbidden = {0,};
3002 /* a collection of alignments for each query sequence with
3003 * sequences from the database */
3004 BlastCompo_Heap* redoneMatches = NULL;
3005 /* stores all fields needed for computing a compositionally
3006 * adjusted score matrix using Newton's method */
3007 Blast_CompositionWorkspace** NRrecord_tld = NULL;
3008 /* loop index */
3009 int query_index;
3010 /* number of queries in the concatenated query */
3011 int numQueries = queryInfo->num_queries;
3012 /* number of contexts in the concatenated query */
3013 int numContexts = queryInfo->last_context + 1;
3014 /* number of contexts within a query */
3015 int numFrames = (program_number == eBlastTypeBlastx) ? 6:1;
3016 /* keeps track of gapped alignment params */
3017 BlastGapAlignStruct* gapAlign = NULL;
3018 /* the values of the search parameters that will be recorded, altered
3019 * in the search structure in this routine, and then restored before
3020 * the routine exits. */
3021 BlastKappa_SavedParameters *savedParams = NULL;
3022 /* All alignments above this value will be reported, no matter how many. */
3023 double inclusion_ethresh;
3024
3025 BlastHSPResults* local_results = NULL;
3026
3027 BlastCompo_QueryInfo** query_info_tld = NULL;
3028 int* numContexts_tld = NULL;
3029 int* compositionTestIndex_tld = NULL;
3030 Blast_RedoAlignParams** redo_align_params_tld = NULL;
3031 BLAST_SequenceBlk** subjectBlk_tld = NULL;
3032 Boolean positionBased = (Boolean) (sbp->psi_matrix != NULL);
3033 ECompoAdjustModes compo_adjust_mode =
3034 (ECompoAdjustModes) extendParams->options->compositionBasedStats;
3035 Boolean smithWaterman =
3036 (Boolean) (extendParams->options->eTbackExt == eSmithWatermanTbck);
3037
3038 /* which test function do we use to see if a composition-adjusted
3039 p-value is desired; value needs to be passed in eventually*/
3040 int compositionTestIndex = extendParams->options->unifiedP;
3041 Uint1* genetic_code_string = GenCodeSingletonFind(default_db_genetic_code);
3042
3043 ASSERT(program_number == eBlastTypeBlastp ||
3044 program_number == eBlastTypeTblastn ||
3045 program_number == eBlastTypeBlastx ||
3046 program_number == eBlastTypePsiBlast ||
3047 program_number == eBlastTypeRpsBlast ||
3048 program_number == eBlastTypeRpsTblastn);
3049
3050 if (0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") &&
3051 compo_adjust_mode == eNoCompositionBasedStats) {
3052 return -1; /* BLOSUM62_20 only makes sense if
3053 * compo_adjust_mode is on */
3054 }
3055 if (positionBased) {
3056 /* Position based searches can only use traditional
3057 * composition based stats */
3058 if ((int) compo_adjust_mode > 1) {
3059 compo_adjust_mode = eCompositionBasedStats;
3060 }
3061 /* A position-based search can only have one query */
3062 ASSERT(queryInfo->num_queries == 1);
3063 ASSERT(queryBlk->length == (Int4)sbp->psi_matrix->pssm->ncols);
3064 }
3065
3066 if ((int) compo_adjust_mode > 1 &&
3067 !Blast_FrequencyDataIsAvailable(scoringParams->options->matrix)) {
3068 return -1; /* Unsupported matrix */
3069 }
3070 /*****************/
3071 inclusion_ethresh = (psiOptions /* this can be NULL for CBl2Seq */
3072 ? psiOptions->inclusion_ethresh
3073 : PSI_INCLUSION_ETHRESH);
3074 ASSERT(inclusion_ethresh != 0.0);
3075
3076 int actual_num_threads = 1;
3077 #ifdef _OPENMP
3078 actual_num_threads = num_threads;
3079 #endif
3080
3081 /* Initialize savedParams */
3082 savedParams =
3083 s_SavedParametersNew(queryInfo->max_length, numContexts,
3084 compo_adjust_mode, positionBased);
3085 if (savedParams == NULL) {
3086 status_code = -1;
3087 goto function_cleanup;
3088 }
3089 status_code =
3090 s_RecordInitialSearch(savedParams, sbp, scoringParams,
3091 queryInfo->max_length, compo_adjust_mode,
3092 positionBased);
3093 if (status_code != 0) {
3094 goto function_cleanup;
3095 }
3096
3097 if (compo_adjust_mode != eNoCompositionBasedStats) {
3098 if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) {
3099 localScalingFactor = SCALING_FACTOR / 10;
3100 } else {
3101 localScalingFactor = SCALING_FACTOR;
3102 }
3103 } else {
3104 localScalingFactor = 1.0;
3105 }
3106 s_RescaleSearch(sbp, scoringParams, numContexts, localScalingFactor);
3107 status_code =
3108 BLAST_GapAlignStructNew(scoringParams, extendParams,
3109 (seqSrc) ? BlastSeqSrcGetMaxSeqLen(seqSrc)
3110 : subjectBlk->length,
3111 sbp, &gapAlign);
3112 if (status_code != 0) {
3113 return (Int2) status_code;
3114 }
3115
3116 if(smithWaterman) {
3117 status_code =
3118 Blast_ForbiddenRangesInitialize(&forbidden, queryInfo->max_length);
3119 if (status_code != 0) {
3120 goto function_cleanup;
3121 }
3122 }
3123 redoneMatches = calloc(numQueries, sizeof(BlastCompo_Heap));
3124 if (redoneMatches == NULL) {
3125 status_code = -1;
3126 goto function_cleanup;
3127 }
3128 for (query_index = 0; query_index < numQueries; query_index++) {
3129 status_code =
3130 BlastCompo_HeapInitialize(&redoneMatches[query_index],
3131 hitParams->options->hitlist_size,
3132 inclusion_ethresh);
3133 if (status_code != 0) {
3134 goto function_cleanup;
3135 }
3136 }
3137
3138 BlastCompo_Heap** redoneMatches_tld =
3139 (BlastCompo_Heap**) calloc(
3140 actual_num_threads,
3141 sizeof(BlastCompo_Heap*)
3142 );
3143 BlastCompo_Alignment*** alignments_tld =
3144 (BlastCompo_Alignment***) calloc(
3145 actual_num_threads,
3146 sizeof(BlastCompo_Alignment**)
3147 );
3148 BlastCompo_Alignment*** incoming_align_set_tld =
3149 (BlastCompo_Alignment***) calloc(
3150 actual_num_threads,
3151 sizeof(BlastCompo_Alignment**)
3152 );
3153 BlastKappa_SavedParameters** savedParams_tld =
3154 (BlastKappa_SavedParameters**) calloc(
3155 actual_num_threads,
3156 sizeof(BlastKappa_SavedParameters*)
3157 );
3158 BlastScoreBlk** sbp_tld =
3159 (BlastScoreBlk**) calloc(
3160 actual_num_threads,
3161 sizeof(BlastScoreBlk*)
3162 );
3163 BlastKappa_GappingParamsContext* gapping_params_context_tld =
3164 (BlastKappa_GappingParamsContext*) calloc(
3165 actual_num_threads,
3166 sizeof(BlastKappa_GappingParamsContext)
3167 );
3168 Int4*** matrix_tld =
3169 (Int4***) calloc(
3170 actual_num_threads,
3171 sizeof(Int4**)
3172 );
3173
3174 NRrecord_tld =
3175 (Blast_CompositionWorkspace**) calloc(
3176 actual_num_threads,
3177 sizeof(Blast_CompositionWorkspace*)
3178 );
3179
3180 subjectBlk_tld =
3181 (BLAST_SequenceBlk**) calloc(
3182 actual_num_threads,
3183 sizeof(BLAST_SequenceBlk*)
3184 );
3185 redo_align_params_tld =
3186 (Blast_RedoAlignParams**) calloc(
3187 actual_num_threads,
3188 sizeof(Blast_RedoAlignParams*)
3189 );
3190 int* status_code_tld =
3191 (int*) calloc(
3192 actual_num_threads,
3193 sizeof(int)
3194 );
3195 BlastSeqSrc** seqsrc_tld =
3196 (BlastSeqSrc**) calloc(
3197 actual_num_threads,
3198 sizeof(BlastSeqSrc*)
3199 );
3200 BlastGapAlignStruct** gap_align_tld =
3201 (BlastGapAlignStruct**) calloc(
3202 actual_num_threads,
3203 sizeof(BlastGapAlignStruct*)
3204 );
3205 BlastScoringParameters** score_params_tld =
3206 (BlastScoringParameters**) calloc(
3207 actual_num_threads,
3208 sizeof(BlastScoringParameters*)
3209 );
3210 BlastHitSavingParameters** hit_params_tld =
3211 (BlastHitSavingParameters**) calloc(
3212 actual_num_threads,
3213 sizeof(BlastHitSavingParameters*)
3214 );
3215 BlastHSPResults** results_tld =
3216 (BlastHSPResults**) calloc(
3217 actual_num_threads,
3218 sizeof(BlastHSPResults*)
3219 );
3220 query_info_tld =
3221 (BlastCompo_QueryInfo**) calloc(
3222 actual_num_threads,
3223 sizeof(BlastCompo_QueryInfo*)
3224 );
3225 numContexts_tld =
3226 (int*) calloc(
3227 actual_num_threads,
3228 sizeof(int)
3229 );
3230 compositionTestIndex_tld =
3231 (int*) calloc(
3232 actual_num_threads,
3233 sizeof(int)
3234 );
3235
3236 int i;
3237 for (i = 0; i < actual_num_threads; ++i) {
3238 query_info_tld[i] = s_GetQueryInfo(
3239 queryBlk->sequence,
3240 queryInfo,
3241 (program_number == eBlastTypeBlastx)
3242 );
3243 if (query_info_tld[i] == NULL) {
3244 status_code = -1;
3245 goto function_cleanup;
3246 }
3247
3248 sbp_tld[i] = s_BlastScoreBlk_Copy(
3249 program_number,
3250 sbp,
3251 sbp->alphabet_code,
3252 sbp->number_of_contexts
3253 );
3254
3255 numContexts_tld[i] = numContexts;
3256 compositionTestIndex_tld[i] = compositionTestIndex;
3257 seqsrc_tld[i] = BlastSeqSrcCopy(seqSrc);
3258 gap_align_tld[i] =
3259 s_BlastGapAlignStruct_Copy(gapAlign, sbp_tld[i]);
3260 score_params_tld[i] = scoringParams;
3261 hit_params_tld[i] = (BlastHitSavingParameters*) hitParams;
3262 results_tld[i] =
3263 Blast_HSPResultsNew(queryInfo->num_queries);
3264 subjectBlk_tld[i] = subjectBlk;
3265
3266 redoneMatches_tld[i] =
3267 (BlastCompo_Heap*) calloc(numQueries, sizeof(BlastCompo_Heap));
3268 if (redoneMatches_tld[i] == NULL) {
3269 status_code = -1;
3270 goto function_cleanup;
3271 }
3272 for (query_index = 0; query_index < numQueries; query_index++) {
3273 status_code =
3274 BlastCompo_HeapInitialize(&redoneMatches_tld[i][query_index],
3275 hitParams->options->hitlist_size,
3276 inclusion_ethresh);
3277 if (status_code != 0) {
3278 goto function_cleanup;
3279 }
3280 }
3281
3282 alignments_tld[i] = (BlastCompo_Alignment**) calloc(
3283 numContexts,
3284 sizeof(BlastCompo_Alignment*)
3285 );
3286 incoming_align_set_tld[i] = (BlastCompo_Alignment**) calloc(
3287 numFrames,
3288 sizeof(BlastCompo_Alignment*)
3289 );
3290
3291 savedParams_tld[i] = s_SavedParametersNew(
3292 queryInfo->max_length,
3293 numContexts,
3294 compo_adjust_mode,
3295 positionBased
3296 );
3297 if (savedParams_tld[i] == NULL) {
3298 status_code = -1;
3299 goto function_cleanup;
3300 }
3301 status_code = s_RecordInitialSearch(
3302 savedParams_tld[i],
3303 sbp,
3304 scoringParams,
3305 queryInfo->max_length,
3306 compo_adjust_mode,
3307 positionBased
3308 );
3309 if (status_code != 0) {
3310 goto function_cleanup;
3311 }
3312
3313 if ((int) compo_adjust_mode > 1 && !positionBased) {
3314 NRrecord_tld[i] = Blast_CompositionWorkspaceNew();
3315 status_code = Blast_CompositionWorkspaceInit(
3316 NRrecord_tld[i],
3317 scoringParams->options->matrix
3318 );
3319 if (status_code != 0) {
3320 goto function_cleanup;
3321 }
3322 }
3323
3324 gapping_params_context_tld[i].gap_align = gap_align_tld[i];
3325 gapping_params_context_tld[i].scoringParams = score_params_tld[i];
3326 gapping_params_context_tld[i].sbp = sbp_tld[i];
3327 gapping_params_context_tld[i].localScalingFactor = localScalingFactor;
3328 gapping_params_context_tld[i].prog_number = program_number;
3329
3330 redo_align_params_tld[i] =
3331 s_GetAlignParams(
3332 &gapping_params_context_tld[i],
3333 queryBlk,
3334 queryInfo,
3335 hitParams,
3336 extendParams
3337 );
3338 if (redo_align_params_tld[i] == NULL) {
3339 status_code = -1;
3340 goto function_cleanup;
3341 }
3342
3343 if (positionBased) {
3344 matrix_tld[i] = sbp_tld[i]->psi_matrix->pssm->data;
3345 } else {
3346 matrix_tld[i] = sbp_tld[i]->matrix->data;
3347 }
3348 /**** Validate parameters *************/
3349 if (matrix_tld[i] == NULL) {
3350 goto function_cleanup;
3351 }
3352 }
3353
3354 /*
3355 * There are two use cases here.
3356 * (1) hsp_stream == NULL, so single match is passed in thisMatch.
3357 * Also, seqSrc == NULL and subjectBlk are != NULL.
3358 * (2) hsp_stream != NULL, so one or more matches are taken from
3359 * hsp_stream, and thisMatch is (probably) NULL.
3360 * Also, seqSrc != NULL, subjectBlk and thisMatch are == NULL.
3361 */
3362 struct BlastHSPListLinkedList {
3363 BlastHSPList* match;
3364 struct BlastHSPListLinkedList* next;
3365 };
3366 typedef struct BlastHSPListLinkedList BlastHSPListLinkedList;
3367
3368 BlastHSPList** theseMatches = NULL;
3369 int numMatches = 0;
3370 if (hsp_stream == NULL) {
3371 theseMatches = (BlastHSPList**) calloc(1, sizeof(BlastHSPList*));
3372 *theseMatches = thisMatch;
3373 numMatches = 1;
3374 } else {
3375 BlastHSPList* localMatch = NULL;
3376 BlastHSPListLinkedList* head = NULL;
3377 BlastHSPListLinkedList* tail = NULL;
3378 /*
3379 * Collect matches from stream into linked list, counting them
3380 * along the way.
3381 */
3382 while (BlastHSPStreamRead(hsp_stream, &localMatch)
3383 != kBlastHSPStream_Eof) {
3384 BlastHSPListLinkedList* entry =
3385 (BlastHSPListLinkedList*) calloc(
3386 1,
3387 sizeof(BlastHSPListLinkedList)
3388 );
3389 entry->match = localMatch;
3390 if (head == NULL) {
3391 head = entry;
3392 } else {
3393 tail->next = entry;
3394 }
3395 tail = entry;
3396 ++numMatches;
3397 }
3398 /*
3399 * Convert linked list of matches into array.
3400 */
3401 theseMatches =
3402 (BlastHSPList**) calloc(numMatches, sizeof(BlastHSPList*));
3403 int i;
3404 for (i = 0; i < numMatches; ++i) {
3405 theseMatches[i] = head->match;
3406 BlastHSPListLinkedList* here = head;
3407 head = head->next;
3408 sfree(here);
3409 }
3410 }
3411
3412 Boolean interrupt = FALSE;
3413 #pragma omp parallel \
3414 default(none) num_threads(actual_num_threads) \
3415 if(actual_num_threads>1) \
3416 shared(interrupt, seqsrc_tld, score_params_tld, hit_params_tld, \
3417 gap_align_tld, results_tld, \
3418 redoneMatches_tld, \
3419 STDERR_COMMA \
3420 numQueries, numMatches, theseMatches, \
3421 numFrames, program_number, subjectBlk_tld, positionBased, \
3422 default_db_genetic_code, localScalingFactor, queryInfo, \
3423 sbp, smithWaterman, compositionTestIndex_tld, forbidden, \
3424 NRrecord_tld, actual_num_threads, sbp_tld, \
3425 matrix_tld, query_info_tld, numContexts_tld, \
3426 genetic_code_string, queryBlk, compo_adjust_mode, \
3427 alignments_tld, incoming_align_set_tld, savedParams_tld, \
3428 scoringParams, redo_align_params_tld, \
3429 status_code_tld)
3430 {
3431 int b;
3432 #pragma omp for schedule(static)
3433 for (b = 0; b < numMatches; ++b) {
3434 #pragma omp flush(interrupt)
3435 if (!interrupt) {
3436 BlastCompo_Alignment** alignments = NULL;
3437 BlastCompo_Alignment** incoming_align_set = NULL;
3438 Blast_CompositionWorkspace* NRrecord = NULL;
3439 BlastCompo_QueryInfo* query_info = NULL;
3440
3441 int numAligns[6];
3442 Blast_KarlinBlk* kbp = NULL;
3443 BlastCompo_MatchingSequence matchingSeq = {0,};
3444 BlastHSPList* hsp_list = NULL;
3445 BlastCompo_Alignment* incoming_aligns = NULL;
3446 Blast_RedoAlignParams* redo_align_params;
3447 double best_evalue;
3448 Int4 best_score;
3449 int query_index;
3450 int context_index;
3451 int frame_index;
3452 void* discarded_aligns = NULL;
3453 BlastSeqSrc* seqSrc;
3454 BlastScoringParameters* scoringParams;
3455 BlastHitSavingParameters* hitParams;
3456 BlastCompo_Heap* redoneMatches;
3457 BlastScoreBlk* sbp;
3458 BLAST_SequenceBlk* subjectBlk;
3459 int numContexts;
3460 int compositionTestIndex;
3461 /* existing alignments for a match */
3462 Int4** matrix; /* score matrix */
3463 int* pStatusCode;
3464
3465 double pvalueForThisPair = (-1); /* p-value for this match
3466 for composition; -1 == no adjustment*/
3467 double LambdaRatio; /*lambda ratio*/
3468
3469 int tid = 0;
3470 #ifdef _OPENMP
3471 if(actual_num_threads > 1) {
3472 tid = omp_get_thread_num();
3473 }
3474 #endif
3475 seqSrc = seqsrc_tld[tid];
3476 scoringParams = score_params_tld[tid];
3477 hitParams = hit_params_tld[tid];
3478 redoneMatches = redoneMatches_tld[tid];
3479 alignments = alignments_tld[tid];
3480 incoming_align_set = incoming_align_set_tld[tid];
3481 NRrecord = NRrecord_tld[tid];
3482 sbp = sbp_tld[tid];
3483 redo_align_params = redo_align_params_tld[tid];
3484 matrix = matrix_tld[tid];
3485 pStatusCode = &status_code_tld[tid];
3486 query_info = query_info_tld[tid];
3487 numContexts = numContexts_tld[tid];
3488 compositionTestIndex = compositionTestIndex_tld[tid];
3489 subjectBlk = subjectBlk_tld[tid];
3490
3491 BlastHSPList* localMatch = theseMatches[b];
3492
3493 if (localMatch->hsp_array == NULL) {
3494 if (seqSrc) {
3495 continue;
3496 }
3497 if(actual_num_threads > 1) {
3498 #pragma omp critical(intrpt)
3499 interrupt = TRUE;
3500 #pragma omp flush(interrupt)
3501 continue;
3502 }
3503 }
3504
3505 if (BlastCompo_EarlyTermination(
3506 localMatch->best_evalue,
3507 redoneMatches,
3508 numQueries
3509 )) {
3510 Blast_HSPListFree(localMatch);
3511 if (seqSrc) {
3512 continue;
3513 }
3514 if(actual_num_threads > 1) {
3515 #pragma omp critical(intrpt)
3516 interrupt = TRUE;
3517 #pragma omp flush(interrupt)
3518 continue;
3519 }
3520 }
3521
3522 query_index = localMatch->query_index;
3523 context_index = query_index * numFrames;
3524 BlastSeqSrcSetRangesArg * ranges = NULL;
3525 /* Get the sequence for this match */
3526 if (seqSrc && BlastSeqSrcGetSupportsPartialFetching(seqSrc)) {
3527 ranges = BLAST_SetupPartialFetching(
3528 program_number,
3529 (BlastSeqSrc*) seqSrc,
3530 (const BlastHSPList**)&localMatch,
3531 1
3532 );
3533 }
3534
3535 if (subjectBlk) {
3536 matchingSeq.length = subjectBlk->length;
3537 matchingSeq.index = -1;
3538 matchingSeq.local_data = subjectBlk;
3539 } else {
3540 *pStatusCode = s_MatchingSequenceInitialize(
3541 &matchingSeq,
3542 program_number,
3543 seqSrc,
3544 default_db_genetic_code,
3545 localMatch->oid,
3546 ranges
3547 );
3548 if (*pStatusCode != 0) {
3549 /*
3550 * some sequences may have been excluded by membit filtering
3551 * so this is not really an exception
3552 */
3553 *pStatusCode = 0;
3554 goto match_loop_cleanup;
3555 }
3556 }
3557 *pStatusCode = s_ResultHspToDistinctAlign(
3558 incoming_align_set, /* o */
3559 numAligns, /* o */
3560 localMatch->hsp_array, /* i */
3561 localMatch->hspcnt, /* i */
3562 context_index, /* i */
3563 queryInfo, /* i */
3564 localScalingFactor /* i */
3565 );
3566 if (*pStatusCode != 0) {
3567 goto match_loop_cleanup;
3568 }
3569
3570 hsp_list = Blast_HSPListNew(0);
3571 for (frame_index = 0;
3572 frame_index < numFrames;
3573 frame_index++, context_index++) {
3574 incoming_aligns = incoming_align_set[frame_index];
3575 if (!incoming_aligns) {
3576 continue;
3577 }
3578 /*
3579 * All alignments in thisMatch should be to the same query
3580 */
3581 kbp = sbp->kbp_gap[context_index];
3582 if (smithWaterman) {
3583 *pStatusCode =
3584 Blast_RedoOneMatchSmithWaterman(
3585 alignments,
3586 redo_align_params,
3587 incoming_aligns,
3588 numAligns[frame_index],
3589 kbp->Lambda,
3590 kbp->logK,
3591 &matchingSeq,
3592 query_info,
3593 numQueries,
3594 matrix,
3595 BLASTAA_SIZE,
3596 NRrecord,
3597 &forbidden,
3598 redoneMatches,
3599 &pvalueForThisPair,
3600 compositionTestIndex,
3601 &LambdaRatio
3602 );
3603 } else {
3604 *pStatusCode =
3605 Blast_RedoOneMatch(
3606 alignments, // thread-local
3607 redo_align_params, // thread-local
3608 incoming_aligns, // thread-local
3609 numAligns[frame_index], // local
3610 kbp->Lambda, // thread-local
3611 &matchingSeq, // thread-local
3612 -1, // const
3613 query_info, // thread-local
3614 numContexts, // thread-local
3615 matrix, // thread-local
3616 BLASTAA_SIZE, // const
3617 NRrecord, // thread-local
3618 &pvalueForThisPair, // local
3619 compositionTestIndex, // thread-local
3620 &LambdaRatio // local
3621 );
3622 }
3623
3624 if (*pStatusCode != 0) {
3625 goto match_loop_cleanup;
3626 }
3627
3628 if (alignments[context_index] != NULL) {
3629 Int2 qframe = frame_index;
3630 if (program_number == eBlastTypeBlastx) {
3631 if (qframe < 3) {
3632 qframe++;
3633 } else {
3634 qframe = 2 - qframe;
3635 }
3636 }
3637 *pStatusCode =
3638 s_HSPListFromDistinctAlignments(hsp_list,
3639 &alignments[context_index],
3640 matchingSeq.index,
3641 queryInfo, qframe);
3642 if (*pStatusCode) {
3643 goto match_loop_cleanup;
3644 }
3645 }
3646 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3647 incoming_align_set[frame_index] = NULL;
3648 }
3649
3650 if (hsp_list->hspcnt > 1) {
3651 s_HitlistReapContained(hsp_list->hsp_array,
3652 &hsp_list->hspcnt);
3653 }
3654 *pStatusCode =
3655 s_HitlistEvaluateAndPurge(&best_score, &best_evalue,
3656 hsp_list,
3657 seqSrc,
3658 matchingSeq.length,
3659 program_number,
3660 queryInfo, context_index,
3661 sbp, hitParams,
3662 pvalueForThisPair, LambdaRatio,
3663 matchingSeq.index);
3664 if (*pStatusCode != 0) {
3665 goto query_loop_cleanup;
3666 }
3667 if (best_evalue <= hitParams->options->expect_value) {
3668 /* The best alignment is significant */
3669 s_HSPListNormalizeScores(hsp_list, kbp->Lambda, kbp->logK,
3670 localScalingFactor);
3671 s_ComputeNumIdentities(
3672 queryBlk,
3673 queryInfo,
3674 subjectBlk,
3675 seqSrc,
3676 hsp_list,
3677 scoringParams->options,
3678 genetic_code_string,
3679 sbp,
3680 ranges
3681 );
3682 if (!seqSrc) {
3683 goto query_loop_cleanup;
3684 }
3685 if (BlastCompo_HeapWouldInsert(
3686 &redoneMatches[query_index],
3687 best_evalue,
3688 best_score,
3689 localMatch->oid
3690 )) {
3691 *pStatusCode =
3692 BlastCompo_HeapInsert(
3693 &redoneMatches[query_index],
3694 hsp_list,
3695 best_evalue,
3696 best_score,
3697 localMatch->oid,
3698 &discarded_aligns
3699 );
3700 if (*pStatusCode == 0) {
3701 hsp_list = NULL;
3702 }
3703 } else {
3704 hsp_list = Blast_HSPListFree(hsp_list);
3705 }
3706
3707 if (*pStatusCode) {
3708 goto query_loop_cleanup;
3709 }
3710 if (discarded_aligns != NULL) {
3711 Blast_HSPListFree(discarded_aligns);
3712 }
3713 }
3714 query_loop_cleanup:
3715 match_loop_cleanup:
3716 if (seqSrc) {
3717 localMatch = Blast_HSPListFree(localMatch);
3718 } else {
3719 Blast_HSPListSwap(localMatch, hsp_list);
3720 localMatch->oid = hsp_list->oid;
3721 }
3722 hsp_list = Blast_HSPListFree(hsp_list);
3723 ranges = BlastSeqSrcSetRangesArgFree(ranges);
3724
3725 if (*pStatusCode != 0) {
3726 for (context_index = 0;
3727 context_index < numContexts;
3728 context_index++) {
3729 BlastCompo_AlignmentsFree(
3730 &alignments[context_index],
3731 s_FreeEditScript
3732 );
3733 }
3734 }
3735 s_MatchingSequenceRelease(&matchingSeq);
3736 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3737 if ((actual_num_threads > 1) &&
3738 (*pStatusCode != 0 || !seqSrc)) {
3739 #pragma omp critical(intrpt)
3740 interrupt = TRUE;
3741 #pragma omp flush(interrupt)
3742 continue;
3743 }
3744
3745 } /* end of if(!interrupt) */
3746 }
3747 #pragma omp barrier
3748
3749 /*
3750 * end of omp parallel section
3751 */
3752 }
3753
3754 function_cleanup:
3755
3756 for (i = 0; i < actual_num_threads; ++i) {
3757 if (status_code_tld[i] != 0) {
3758 status_code = status_code_tld[i];
3759 }
3760 }
3761 for (i = 0; i < actual_num_threads; ++i) {
3762 if (seqSrc && status_code == 0) {
3763 s_FillResultsFromCompoHeaps(
3764 results_tld[i],
3765 redoneMatches_tld[i],
3766 hitParams->options->hitlist_size
3767 );
3768 if (redoneMatches_tld[i] != NULL) {
3769 int qi;
3770 for (qi = 0; qi < numQueries; ++qi) {
3771 sfree(redoneMatches_tld[i][qi].array);
3772 sfree(redoneMatches_tld[i][qi].heapArray);
3773 }
3774 s_ClearHeap(redoneMatches_tld[i]);
3775 }
3776 } else {
3777 if (redoneMatches_tld[i] != NULL) {
3778 int qi;
3779 for (qi = 0; qi < numQueries; ++qi) {
3780 sfree(redoneMatches_tld[i][qi].array);
3781 sfree(redoneMatches_tld[i][qi].heapArray);
3782 }
3783 s_ClearHeap(redoneMatches_tld[i]);
3784 }
3785 }
3786 sfree(redoneMatches_tld[i]);
3787 }
3788 if (redoneMatches != NULL) {
3789 int qi;
3790 for (qi = 0; qi < numQueries; ++qi) {
3791 sfree(redoneMatches[qi].array);
3792 sfree(redoneMatches[qi].heapArray);
3793 }
3794 s_ClearHeap(redoneMatches);
3795 }
3796
3797 if (hsp_stream != NULL) {
3798 /* Reduce results from all threads and continue with business as usual */
3799 SThreadLocalDataArray* thread_data =
3800 SThreadLocalDataArrayNew(actual_num_threads);
3801 int i;
3802 for (i = 0; i < actual_num_threads; ++i) {
3803 SThreadLocalData* tdi = thread_data->tld[i];
3804 BlastHSPResults* rdi = results_tld[i];
3805 tdi->hit_params = hit_params_tld[i];
3806 hit_params_tld[i] = NULL;
3807 tdi->results =
3808 (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3809 tdi->results->num_queries = rdi->num_queries;
3810 tdi->results->hitlist_array =
3811 (BlastHitList**) calloc(
3812 tdi->results->num_queries,
3813 sizeof(BlastHitList*)
3814 );
3815 int j;
3816 for (j = 0; j < tdi->results->num_queries; ++j) {
3817 tdi->results->hitlist_array[j] = rdi->hitlist_array[j];
3818 rdi->hitlist_array[j] = NULL;
3819 }
3820 }
3821 local_results = SThreadLocalDataArrayConsolidateResults(thread_data);
3822 ASSERT(local_results);
3823
3824 /* post-traceback pipes */
3825 BlastHSPStreamTBackClose(hsp_stream, local_results);
3826
3827 for (i = 0; i < local_results->num_queries; ++i) {
3828 results->hitlist_array[i] = local_results->hitlist_array[i];
3829 local_results->hitlist_array[i] = NULL;
3830 }
3831 for (i = 0; i < actual_num_threads; ++i) {
3832 thread_data->tld[i]->hit_params = NULL;
3833 int j;
3834 for (j = 0; j < local_results->num_queries; ++j) {
3835 thread_data->tld[i]->results->hitlist_array[j] =
3836 Blast_HitListFree(
3837 thread_data->tld[i]->results->hitlist_array[j]
3838 );
3839 }
3840 sfree(thread_data->tld[i]->results->hitlist_array);
3841 sfree(thread_data->tld[i]->results);
3842 thread_data->tld[i] = SThreadLocalDataFree(thread_data->tld[i]);
3843 }
3844 sfree(thread_data->tld);
3845 sfree(thread_data);
3846 Blast_HSPResultsFree(local_results);
3847 }
3848
3849 if (redoneMatches != NULL) {
3850 for (query_index = 0; query_index < numQueries; query_index++) {
3851 BlastCompo_HeapRelease(&redoneMatches[query_index]);
3852 }
3853 sfree(redoneMatches);
3854 redoneMatches = NULL;
3855 }
3856 if (smithWaterman) {
3857 Blast_ForbiddenRangesRelease(&forbidden);
3858 }
3859 if (gapAlign != NULL) {
3860 gapAlign = BLAST_GapAlignStructFree(gapAlign);
3861 }
3862 s_RestoreSearch(sbp, scoringParams, savedParams, queryBlk->length,
3863 positionBased, compo_adjust_mode);
3864 s_SavedParametersFree(&savedParams);
3865
3866 for (i = 0; i < actual_num_threads; ++i) {
3867 s_BlastScoreBlk_Free(&sbp_tld[i]);
3868 gap_align_tld[i]->sbp = NULL;
3869 s_BlastGapAlignStruct_Free(gap_align_tld[i]);
3870 Blast_RedoAlignParamsFree(&redo_align_params_tld[i]);
3871 sfree(alignments_tld[i]);
3872 sfree(incoming_align_set_tld[i]);
3873 Blast_CompositionWorkspaceFree(&NRrecord_tld[i]);
3874 s_SavedParametersFree(&savedParams_tld[i]);
3875 BlastSeqSrcFree(seqsrc_tld[i]);
3876 results_tld[i] = Blast_HSPResultsFree(results_tld[i]);
3877 s_FreeBlastCompo_QueryInfoArray(&query_info_tld[i], numContexts);
3878 }
3879 sfree(alignments_tld);
3880 sfree(compositionTestIndex_tld);
3881 sfree(gap_align_tld);
3882 sfree(gapping_params_context_tld);
3883 sfree(hit_params_tld);
3884 sfree(incoming_align_set_tld);
3885 sfree(matrix_tld);
3886 sfree(NRrecord_tld);
3887 sfree(numContexts_tld);
3888 sfree(query_info_tld);
3889 sfree(redo_align_params_tld);
3890 sfree(redoneMatches_tld);
3891 sfree(results_tld);
3892 sfree(savedParams_tld);
3893 sfree(sbp_tld);
3894 sfree(score_params_tld);
3895 sfree(seqsrc_tld);
3896 sfree(status_code_tld);
3897 sfree(subjectBlk_tld);
3898 sfree(theseMatches);
3899
3900 return (Int2) status_code;
3901 }
3902
3903
3904
3905