1 /* $Id: blast_kappa.c,v 1.156 2016/09/27 14:59:15 fukanchi Exp $
2 * ==========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alejandro Schaffer, Mike Gertz (ported to algo/blast by Tom Madden)
27 *
28 */
29
30 /** @file blast_kappa.c
31 * Utilities for doing Smith-Waterman alignments and adjusting the scoring
32 * system for each match in blastpgp
33 */
34
35 #include <float.h>
36 #include <algo/blast/core/ncbi_math.h>
37 #include <algo/blast/core/blast_hits.h>
38 #include <algo/blast/core/blast_kappa.h>
39 #include <algo/blast/core/blast_util.h>
40 #include <algo/blast/core/blast_gapalign.h>
41 #include <algo/blast/core/blast_filter.h>
42 #include <algo/blast/core/blast_traceback.h>
43 #include <algo/blast/core/link_hsps.h>
44 #include <algo/blast/core/gencode_singleton.h>
45 #include "blast_psi_priv.h"
46 #include "blast_gapalign_priv.h"
47 #include "blast_hits_priv.h"
48 #include "blast_posit.h"
49 #include "blast_hspstream_mt_utils.h"
50 #include "blast_traceback_mt_priv.h"
51
52 #ifdef _OPENMP
53 #include <omp.h>
54 #endif
55
56 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
57 #include <algo/blast/composition_adjustment/compo_heap.h>
58 #include <algo/blast/composition_adjustment/redo_alignment.h>
59 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
60 #include <algo/blast/composition_adjustment/unified_pvalues.h>
61
62 /* Define KAPPA_PRINT_DIAGNOSTICS to turn on printing of
63 * diagnostic information from some routines. */
64
65 /** Compile-time option; if set to a true value, then blastp runs
66 that use Blast_RedoAlignmentCore to compute the traceback will not
67 SEG the subject sequence */
68 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
69 #define KAPPA_BLASTP_NO_SEG_SEQUENCE 0
70 #endif
71
72
73 /** Compile-time option; if set to a true value, then blastp runs
74 that use Blast_RedoAlignmentCore to compute the traceback will not
75 SEG the subject sequence */
76 #ifndef KAPPA_TBLASTN_NO_SEG_SEQUENCE
77 #define KAPPA_TBLASTN_NO_SEG_SEQUENCE 0
78 #endif
79
80
81 /**
82 * Given a list of HSPs with (possibly) high-precision scores, rescale
83 * the scores to have standard precision and set the scale-independent
84 * bit scores. This routine does *not* resort the list; it is assumed
85 * that the list is already sorted according to e-values that have been
86 * computed using the initial, higher-precision scores.
87 *
88 * @param hsp_list the HSP list
89 * @param logK Karlin-Altschul statistical parameter [in]
90 * @param lambda Karlin-Altschul statistical parameter [in]
91 * @param scoreDivisor the value by which reported scores are to be
92 */
93 static void
s_HSPListNormalizeScores(BlastHSPList * hsp_list,double lambda,double logK,double scoreDivisor)94 s_HSPListNormalizeScores(BlastHSPList * hsp_list,
95 double lambda,
96 double logK,
97 double scoreDivisor)
98 {
99 int hsp_index;
100 for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
101 BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
102
103 hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor);
104 /* Compute the bit score using the newly computed scaled score. */
105 hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
106 }
107 }
108
109
110 /**
111 * Adjusts the E-values in a BLAST_HitList to be composites of
112 * a composition-based P-value and a score/alignment-based P-value
113 *
114 * @param hsp_list the hitlist whose E-values need to be adjusted
115 * @param comp_p_value P-value from sequence composition
116 * @param seqSrc a source of sequence data
117 * @param subject_length length of database sequence
118 * @param query_context info about this query context; needed when
119 * multiple queries are being used
120 * @param LambdaRatio the ratio between the observed value of Lambda
121 * and the predicted value of lambda (used to print
122 * diagnostics)
123 * @param subject_id the subject id of this sequence (used to print
124 * diagnostics)
125 **/
126 static void
s_AdjustEvaluesForComposition(BlastHSPList * hsp_list,double comp_p_value,const BlastSeqSrc * seqSrc,Int4 subject_length,const BlastContextInfo * query_context,double LambdaRatio,int subject_id)127 s_AdjustEvaluesForComposition(
128 BlastHSPList *hsp_list,
129 double comp_p_value,
130 const BlastSeqSrc* seqSrc,
131 Int4 subject_length,
132 const BlastContextInfo * query_context,
133 double LambdaRatio,
134 int subject_id)
135 {
136 /* Smallest observed evalue after adjustment */
137 double best_evalue = DBL_MAX;
138
139 /* True length of the query */
140 int query_length = query_context->query_length;
141 /* Length adjustment to compensate for edge effects */
142 int length_adjustment = query_context->length_adjustment;
143
144 /* Effective lengths of the query, subject, and database */
145 double query_eff = MAX((query_length - length_adjustment), 1);
146 double subject_eff = MAX((subject_length - length_adjustment), 1.0);
147 double dblen_eff = (double) query_context->eff_searchsp / query_eff;
148
149 /* Scale factor to convert the database E-value to the sequence E-value */
150 double db_to_sequence_scale = subject_eff / dblen_eff;
151
152 int hsp_index;
153 for (hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
154 /* for all HSPs */
155 double align_p_value; /* P-value for the alignment score */
156 double combined_p_value; /* combination of two P-values */
157
158 /* HSP for this iteration */
159 BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
160 #ifdef KAPPA_PRINT_DIAGNOSTICS
161 /* Original E-value, saved if diagnostics are printed. */
162 double old_e_value = hsp->evalue;
163 #endif
164 hsp->evalue *= db_to_sequence_scale;
165
166 align_p_value = BLAST_KarlinEtoP(hsp->evalue);
167 combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
168 hsp->evalue = BLAST_KarlinPtoE(combined_p_value);
169 hsp->evalue /= db_to_sequence_scale;
170
171 if (hsp->evalue < best_evalue) {
172 best_evalue = hsp->evalue;
173 }
174
175 #ifdef KAPPA_PRINT_DIAGNOSTICS
176 if (seqSrc){
177 int sequence_gi; /*GI of a sequence*/
178 Blast_GiList* gi_list; /*list of GI's for a sequence*/
179 gi_list = BlastSeqSrcGetGis(seqSrc, (void *) (&subject_id));
180 if ((gi_list) && (gi_list->num_used > 0)) {
181 sequence_gi = gi_list->data[0];
182 } else {
183 sequence_gi = (-1);
184 }
185 printf("GI %d Lambda ratio %e comp. p-value %e; "
186 "adjust E-value of query length %d match length "
187 "%d from %e to %e\n",
188 sequence_gi, LambdaRatio, comp_p_value,
189 query_length, subject_length, old_e_value, hsp->evalue);
190 Blast_GiListFree(gi_list);
191 }
192 #endif
193 } /* end for all HSPs */
194
195 hsp_list->best_evalue = best_evalue;
196
197 /* suppress unused parameter warnings if diagnostics are not printed */
198 (void) seqSrc;
199 (void) query_length;
200 (void) LambdaRatio;
201 (void) subject_id;
202 }
203
204
205 /**
206 * Remove from a hitlist all HSPs that are completely contained in an
207 * HSP that occurs earlier in the list and that:
208 * - is on the same strand; and
209 * - has equal or greater score. T
210 * The hitlist should be sorted by some measure of significance before
211 * this routine is called.
212 * @param hsp_array array to be reaped
213 * @param hspcnt length of hsp_array
214 */
215 static void
s_HitlistReapContained(BlastHSP * hsp_array[],Int4 * hspcnt)216 s_HitlistReapContained(BlastHSP * hsp_array[], Int4 * hspcnt)
217 {
218 Int4 iread; /* iteration index used to read the hitlist */
219 Int4 iwrite; /* iteration index used to write to the hitlist */
220 Int4 old_hspcnt; /* number of HSPs in the hitlist on entry */
221
222 old_hspcnt = *hspcnt;
223
224 for (iread = 1; iread < *hspcnt; iread++) {
225 /* for all HSPs in the hitlist */
226 Int4 ireadBack; /* iterator over indices less than iread */
227 BlastHSP *hsp1; /* an HSP that is a candidate for deletion */
228
229 hsp1 = hsp_array[iread];
230 for (ireadBack = 0; ireadBack < iread && hsp1 != NULL; ireadBack++) {
231 /* for all HSPs before hsp1 in the hitlist and while hsp1
232 * has not been deleted */
233 BlastHSP *hsp2; /* an HSP that occurs earlier in hsp_array
234 * than hsp1 */
235 hsp2 = hsp_array[ireadBack];
236
237 if( hsp2 == NULL ) { /* hsp2 was deleted in a prior iteration. */
238 continue;
239 }
240 if (hsp2->query.frame == hsp1->query.frame &&
241 hsp2->subject.frame == hsp1->subject.frame) {
242 /* hsp1 and hsp2 are in the same query/subject frame. */
243 if (CONTAINED_IN_HSP
244 (hsp2->query.offset, hsp2->query.end, hsp1->query.offset,
245 hsp2->subject.offset, hsp2->subject.end,
246 hsp1->subject.offset) &&
247 CONTAINED_IN_HSP
248 (hsp2->query.offset, hsp2->query.end, hsp1->query.end,
249 hsp2->subject.offset, hsp2->subject.end,
250 hsp1->subject.end) &&
251 hsp1->score <= hsp2->score) {
252 hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]);
253 }
254 } /* end if hsp1 and hsp2 are in the same query/subject frame */
255 } /* end for all HSPs before hsp1 in the hitlist */
256 } /* end for all HSPs in the hitlist */
257
258 /* Condense the hsp_array, removing any NULL items. */
259 iwrite = 0;
260 for (iread = 0; iread < *hspcnt; iread++) {
261 if (hsp_array[iread] != NULL) {
262 hsp_array[iwrite++] = hsp_array[iread];
263 }
264 }
265 *hspcnt = iwrite;
266 /* Fill the remaining memory in hsp_array with NULL pointers. */
267 for ( ; iwrite < old_hspcnt; iwrite++) {
268 hsp_array[iwrite] = NULL;
269 }
270 }
271
272
273 /** A callback used to free an EditScript that has been stored in a
274 * BlastCompo_Alignment. */
s_FreeEditScript(void * edit_script)275 static void s_FreeEditScript(void * edit_script)
276 {
277 if (edit_script != NULL)
278 GapEditScriptDelete(edit_script);
279 }
280
281
282 /**
283 * Converts a list of objects of type BlastCompo_Alignment to an
284 * new object of type BlastHSPList and returns the result. Conversion
285 * in this direction is lossless. The list passed to this routine is
286 * freed to ensure that there is no aliasing of fields between the
287 * list of BlastCompo_Alignments and the new hitlist.
288 *
289 * @param hsp_list The hsp_list to populate
290 * @param alignments A list of distinct alignments; freed before return [in]
291 * @param oid Ordinal id of a database sequence [in]
292 * @param queryInfo information about all queries in this search [in]
293 * @param frame query frame
294 * @return Allocated and filled BlastHSPList structure.
295 */
296 static int
s_HSPListFromDistinctAlignments(BlastHSPList * hsp_list,BlastCompo_Alignment ** alignments,int oid,const BlastQueryInfo * queryInfo,int frame)297 s_HSPListFromDistinctAlignments(BlastHSPList *hsp_list,
298 BlastCompo_Alignment ** alignments,
299 int oid,
300 const BlastQueryInfo* queryInfo,
301 int frame)
302 {
303 int status = 0; /* return code for any routine called */
304 static const int unknown_value = 0; /* dummy constant to use when a
305 parameter value is not known */
306 BlastCompo_Alignment * align; /* an alignment in the list */
307
308 if (hsp_list == NULL) {
309 return -1;
310 }
311 hsp_list->oid = oid;
312
313 for (align = *alignments; NULL != align; align = align->next) {
314 BlastHSP * new_hsp = NULL;
315 GapEditScript * editScript = align->context;
316 align->context = NULL;
317
318 status = Blast_HSPInit(align->queryStart, align->queryEnd,
319 align->matchStart, align->matchEnd,
320 unknown_value, unknown_value,
321 align->queryIndex,
322 frame, (Int2) align->frame, align->score,
323 &editScript, &new_hsp);
324 switch (align->matrix_adjust_rule) {
325 case eDontAdjustMatrix:
326 new_hsp->comp_adjustment_method = eNoCompositionBasedStats;
327 break;
328 case eCompoScaleOldMatrix:
329 new_hsp->comp_adjustment_method = eCompositionBasedStats;
330 break;
331 default:
332 new_hsp->comp_adjustment_method = eCompositionMatrixAdjust;
333 break;
334 }
335 if (status != 0)
336 break;
337 /* At this point, the subject and possibly the query sequence have
338 * been filtered; since it is not clear that num_ident of the
339 * filtered sequences, rather than the original, is desired,
340 * explicitly leave num_ident blank. */
341 new_hsp->num_ident = 0;
342
343 status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
344 if (status != 0)
345 break;
346 }
347 if (status == 0) {
348 BlastCompo_AlignmentsFree(alignments, s_FreeEditScript);
349 Blast_HSPListSortByScore(hsp_list);
350 } else {
351 hsp_list = Blast_HSPListFree(hsp_list);
352 }
353 return 0;
354 }
355
s_GetSubjectLength(Int4 total_subj_length,EBlastProgramType program_number)356 Int4 s_GetSubjectLength(Int4 total_subj_length, EBlastProgramType program_number)
357 {
358 return ((program_number == eBlastTypeRpsTblastn) ?
359 (GET_NUCL_LENGTH(total_subj_length) - 1 ) /3 : total_subj_length);
360 }
361
362
363 /**
364 * Adding evalues to a list of HSPs and remove those that do not have
365 * sufficiently good (low) evalue.
366 *
367 * @param *pbestScore best (highest) score in the list
368 * @param *pbestEvalue best (lowest) evalue in the list
369 * @param hsp_list the list
370 * @param seqSrc a source of sequence data
371 * @param subject_length length of the subject sequence
372 * @param program_number the type of BLAST search being performed
373 * @param queryInfo information about the queries
374 * @param context_index the index of the query corresponding to
375 * the HSPs in hsp_list
376 * @param sbp the score block for this search
377 * @param hitParams parameters used to assign evalues and
378 * decide whether to save hits.
379 * @param pvalueForThisPair composition p-value
380 * @param LambdaRatio lambda ratio, if available
381 * @param subject_id index of subject
382 *
383 * @return 0 on success; -1 on failure (can fail because some methods
384 * of generating evalues use auxiliary structures)
385 */
386 static int
s_HitlistEvaluateAndPurge(int * pbestScore,double * pbestEvalue,BlastHSPList * hsp_list,const BlastSeqSrc * seqSrc,int subject_length,EBlastProgramType program_number,const BlastQueryInfo * queryInfo,int context_index,BlastScoreBlk * sbp,const BlastHitSavingParameters * hitParams,double pvalueForThisPair,double LambdaRatio,int subject_id)387 s_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
388 BlastHSPList * hsp_list,
389 const BlastSeqSrc* seqSrc,
390 int subject_length,
391 EBlastProgramType program_number,
392 const BlastQueryInfo* queryInfo,
393 int context_index,
394 BlastScoreBlk* sbp,
395 const BlastHitSavingParameters* hitParams,
396 double pvalueForThisPair,
397 double LambdaRatio,
398 int subject_id)
399 {
400 int status = 0;
401 *pbestEvalue = DBL_MAX;
402 *pbestScore = 0;
403 if (hitParams->do_sum_stats) {
404 status = BLAST_LinkHsps(program_number, hsp_list, queryInfo,
405 subject_length, sbp,
406 hitParams->link_hsp_params, TRUE);
407 } else {
408
409
410 status =
411 Blast_HSPListGetEvalues(program_number, queryInfo,
412 s_GetSubjectLength(subject_length, program_number),
413 hsp_list, TRUE, FALSE, sbp,
414 0.0, /* use a non-zero gap decay
415 only when linking HSPs */
416 1.0); /* Use scaling factor equal to
417 1, because both scores and
418 Lambda are scaled, so they
419 will cancel each other. */
420 }
421 if (eBlastTypeBlastp == program_number ||
422 eBlastTypeBlastx == program_number) {
423 if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1)) {
424 s_AdjustEvaluesForComposition(hsp_list, pvalueForThisPair, seqSrc,
425 subject_length,
426 &queryInfo->contexts[context_index],
427 LambdaRatio, subject_id);
428 }
429 }
430 if (status == 0) {
431 Blast_HSPListReapByEvalue(hsp_list, hitParams->options);
432 if (hsp_list->hspcnt > 0) {
433 *pbestEvalue = hsp_list->best_evalue;
434 *pbestScore = hsp_list->hsp_array[0]->score;
435 }
436 }
437 return status == 0 ? 0 : -1;
438 }
439
440 /** Compute the number of identities for the HSPs in the hsp_list
441 * @note Should work for blastp and tblastn now.
442 *
443 * @param query_blk the query sequence data [in]
444 * @param query_info structure describing the query_blk structure [in]
445 * @param seq_src source of subject sequence data [in]
446 * @param hsp_list list of HSPs to be processed [in|out]
447 * @param scoring_options scoring options [in]
448 * @gen_code_string Genetic code for tblastn [in]
449 */
450 static void
s_ComputeNumIdentities(const BLAST_SequenceBlk * query_blk,const BlastQueryInfo * query_info,BLAST_SequenceBlk * subject_blk,const BlastSeqSrc * seq_src,BlastHSPList * hsp_list,const BlastScoringOptions * scoring_options,const Uint1 * gen_code_string,const BlastScoreBlk * sbp)451 s_ComputeNumIdentities(const BLAST_SequenceBlk* query_blk,
452 const BlastQueryInfo* query_info,
453 BLAST_SequenceBlk* subject_blk,
454 const BlastSeqSrc* seq_src,
455 BlastHSPList* hsp_list,
456 const BlastScoringOptions* scoring_options,
457 const Uint1* gen_code_string,
458 const BlastScoreBlk* sbp)
459 {
460 Uint1* query = NULL;
461 Uint1* query_nomask = NULL;
462 Uint1* subject = NULL;
463 const EBlastProgramType program_number = scoring_options->program_number;
464 const Boolean kIsOutOfFrame = scoring_options->is_ooframe;
465 const EBlastEncoding encoding = Blast_TracebackGetEncoding(program_number);
466 BlastSeqSrcGetSeqArg seq_arg;
467 Int2 status = 0;
468 int i;
469 SBlastTargetTranslation* target_t = NULL;
470
471 if ( !hsp_list) return;
472
473 /* Initialize the subject */
474 if (seq_src){
475 memset((void*) &seq_arg, 0, sizeof(seq_arg));
476 seq_arg.oid = hsp_list->oid;
477 seq_arg.encoding = encoding;
478 seq_arg.check_oid_exclusion = TRUE;
479 status = BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg);
480 ASSERT(status == 0);
481 (void)status; /* to pacify compiler warning */
482
483 if (program_number == eBlastTypeTblastn) {
484 subject_blk = seq_arg.seq;
485 BlastTargetTranslationNew(
486 subject_blk,
487 gen_code_string,
488 eBlastTypeTblastn,
489 kIsOutOfFrame,
490 &target_t
491 );
492 } else {
493 subject = seq_arg.seq->sequence;
494 }
495 } else {
496 subject = subject_blk->sequence;
497 }
498
499 for (i = 0; i < hsp_list->hspcnt; i++) {
500 BlastHSP* hsp = hsp_list->hsp_array[i];
501
502 /* Initialize the query */
503 if (program_number == eBlastTypeBlastx && kIsOutOfFrame) {
504 Int4 context = hsp->context - hsp->context % CODON_LENGTH;
505 Int4 context_offset = query_info->contexts[context].query_offset;
506 query = query_blk->oof_sequence + CODON_LENGTH + context_offset;
507 query_nomask = query_blk->oof_sequence + CODON_LENGTH + context_offset;
508 } else {
509 query = query_blk->sequence +
510 query_info->contexts[hsp->context].query_offset;
511 query_nomask = query_blk->sequence_nomask +
512 query_info->contexts[hsp->context].query_offset;
513 }
514
515 /* Translate subject if needed. */
516 if (program_number == eBlastTypeTblastn) {
517 const Uint1* target_sequence = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
518 status = Blast_HSPGetNumIdentitiesAndPositives(query, target_sequence, hsp, scoring_options, 0, sbp);
519 }
520 else
521 status = Blast_HSPGetNumIdentitiesAndPositives(query_nomask, subject, hsp, scoring_options, 0, sbp);
522
523 ASSERT(status == 0);
524 }
525 target_t = BlastTargetTranslationFree(target_t);
526 if (seq_src) {
527 BlastSeqSrcReleaseSequence(seq_src, (void*) &seq_arg);
528 BlastSequenceBlkFree(seq_arg.seq);
529 }
530 }
531
532
533 /**
534 * A callback routine: compute lambda for the given score
535 * probabilities.
536 * (@sa calc_lambda_type).
537 */
538 static double
s_CalcLambda(double probs[],int min_score,int max_score,double lambda0)539 s_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
540 {
541
542 int i; /* loop index */
543 int score_range; /* range of possible scores */
544 double avg; /* expected score of aligning two characters */
545 Blast_ScoreFreq freq; /* score frequency data */
546
547 score_range = max_score - min_score + 1;
548 avg = 0.0;
549 for (i = 0; i < score_range; i++) {
550 avg += (min_score + i) * probs[i];
551 }
552 freq.score_min = min_score;
553 freq.score_max = max_score;
554 freq.obs_min = min_score;
555 freq.obs_max = max_score;
556 freq.sprob0 = probs;
557 freq.sprob = &probs[-min_score];
558 freq.score_avg = avg;
559
560 return Blast_KarlinLambdaNR(&freq, lambda0);
561 }
562
563
564 /** Fill a two-dimensional array with the frequency ratios that
565 * underlie a position specific score matrix (PSSM).
566 *
567 * @param returnRatios a two-dimensional array with BLASTAA_SIZE
568 * columns
569 * @param numPositions the number of rows in returnRatios
570 * @param query query sequence data, of length numPositions
571 * @param matrixName the name of the position independent matrix
572 * corresponding to this PSSM
573 * @param startNumerator position-specific data used to generate the
574 * PSSM
575 * @return 0 on success; -1 if the named matrix isn't known, or if
576 * there was a memory error
577 * @todo find out what start numerator is.
578 */
579 static int
s_GetPosBasedStartFreqRatios(double ** returnRatios,Int4 numPositions,Uint1 * query,const char * matrixName,double ** startNumerator)580 s_GetPosBasedStartFreqRatios(double ** returnRatios,
581 Int4 numPositions,
582 Uint1 * query,
583 const char *matrixName,
584 double **startNumerator)
585 {
586 Int4 i,j; /* loop indices */
587 SFreqRatios * stdFreqRatios = NULL; /* frequency ratios for the
588 named matrix. */
589 double *standardProb; /* probabilities of each
590 letter*/
591 const double kPosEpsilon = 0.0001; /* values below this cutoff
592 are treated specially */
593
594 stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
595 if (stdFreqRatios == NULL) {
596 return -1;
597 }
598 for (i = 0; i < numPositions; i++) {
599 for (j = 0; j < BLASTAA_SIZE; j++) {
600 returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
601 }
602 }
603 stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
604
605 standardProb = BLAST_GetStandardAaProbabilities();
606 if(standardProb == NULL) {
607 return -1;
608 }
609 /*reverse multiplication done in posit.c*/
610 for (i = 0; i < numPositions; i++) {
611 for (j = 0; j < BLASTAA_SIZE; j++) {
612 if ((standardProb[query[i]] > kPosEpsilon) &&
613 (standardProb[j] > kPosEpsilon) &&
614 (j != eStopChar) && (j != eXchar) &&
615 (startNumerator[i][j] > kPosEpsilon)) {
616 returnRatios[i][j] = startNumerator[i][j] / standardProb[j];
617 }
618 }
619 }
620 sfree(standardProb);
621
622 return 0;
623 }
624
625
626 /**
627 * Fill a two-dimensional array with the frequency ratios that underlie the
628 * named score matrix.
629 *
630 * @param returnRatios a two-dimensional array of size
631 * BLASTAA_SIZE x BLASTAA_SIZE
632 * @param matrixName the name of a matrix
633 * @return 0 on success; -1 if the named matrix isn't known, or if
634 * there was a memory error
635 */
636 static int
s_GetStartFreqRatios(double ** returnRatios,const char * matrixName)637 s_GetStartFreqRatios(double ** returnRatios,
638 const char *matrixName)
639 {
640 /* Loop indices */
641 int i,j;
642 /* Frequency ratios for the matrix */
643 SFreqRatios * stdFreqRatios = NULL;
644
645 stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
646 if (stdFreqRatios == NULL) {
647 return -1;
648 }
649 for (i = 0; i < BLASTAA_SIZE; i++) {
650 for (j = 0; j < BLASTAA_SIZE; j++) {
651 returnRatios[i][j] = stdFreqRatios->data[i][j];
652 }
653 }
654 stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
655
656 return 0;
657 }
658
659
660 /** SCALING_FACTOR is a multiplicative factor used to get more bits of
661 * precision in the integer matrix scores. It cannot be arbitrarily
662 * large because we do not want total alignment scores to exceed
663 * -(BLAST_SCORE_MIN) */
664 #define SCALING_FACTOR 32
665
666
667 /**
668 * Produce a scaled-up version of the position-specific matrix
669 * with a given set of position-specific residue frequencies.
670 *
671 * @param fillPosMatrix is the matrix to be filled
672 * @param matrixName name of the standard substitution matrix [in]
673 * @param posFreqs PSSM's frequency ratios [in]
674 * @param query Query sequence data [in]
675 * @param queryLength Length of the query sequence above [in]
676 * @param sbp stores various parameters of the search
677 * @param scale_factor amount by which ungapped parameters should be
678 * scaled.
679 * @return 0 on success; -1 on failure
680 */
681 static int
s_ScalePosMatrix(int ** fillPosMatrix,const char * matrixName,double ** posFreqs,Uint1 * query,int queryLength,BlastScoreBlk * sbp,double scale_factor)682 s_ScalePosMatrix(int ** fillPosMatrix,
683 const char * matrixName,
684 double ** posFreqs,
685 Uint1 * query,
686 int queryLength,
687 BlastScoreBlk* sbp,
688 double scale_factor)
689 {
690 /* Data used by scaling routines */
691 Kappa_posSearchItems *posSearch = NULL;
692 /* A reduced collection of search parameters used by PSI-blast */
693 Kappa_compactSearchItems *compactSearch = NULL;
694 /* Representation of a PSSM internal to PSI-blast */
695 _PSIInternalPssmData* internal_pssm = NULL;
696 /* return code */
697 int status = 0;
698
699 posSearch = Kappa_posSearchItemsNew(queryLength, matrixName,
700 fillPosMatrix, posFreqs);
701 compactSearch = Kappa_compactSearchItemsNew(query, queryLength, sbp);
702 /* Copy data into new structures */
703 internal_pssm = _PSIInternalPssmDataNew(queryLength, BLASTAA_SIZE);
704 if (posSearch == NULL || compactSearch == NULL || internal_pssm == NULL) {
705 status = -1;
706 goto cleanup;
707 }
708 _PSICopyMatrix_int(internal_pssm->pssm, posSearch->posMatrix,
709 internal_pssm->ncols, internal_pssm->nrows);
710 _PSICopyMatrix_int(internal_pssm->scaled_pssm,
711 posSearch->posPrivateMatrix,
712 internal_pssm->ncols, internal_pssm->nrows);
713 _PSICopyMatrix_double(internal_pssm->freq_ratios,
714 posSearch->posFreqs, internal_pssm->ncols,
715 internal_pssm->nrows);
716 status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp,
717 compactSearch->standardProb);
718 if (status != 0) {
719 goto cleanup;
720 }
721 /* Copy data from new structures to posSearchItems */
722 _PSICopyMatrix_int(posSearch->posMatrix, internal_pssm->pssm,
723 internal_pssm->ncols, internal_pssm->nrows);
724 _PSICopyMatrix_int(posSearch->posPrivateMatrix,
725 internal_pssm->scaled_pssm,
726 internal_pssm->ncols, internal_pssm->nrows);
727 _PSICopyMatrix_double(posSearch->posFreqs,
728 internal_pssm->freq_ratios,
729 internal_pssm->ncols, internal_pssm->nrows);
730 status = Kappa_impalaScaling(posSearch, compactSearch, (double)
731 scale_factor, FALSE, sbp);
732 cleanup:
733 internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
734 posSearch = Kappa_posSearchItemsFree(posSearch);
735 compactSearch = Kappa_compactSearchItemsFree(compactSearch);
736
737 return status;
738 }
739
740
741 /**
742 * Convert an array of HSPs to a list of BlastCompo_Alignment objects.
743 * The context field of each BlastCompo_Alignment is set to point to the
744 * corresponding HSP.
745 *
746 * @param self the array of alignment to be filled
747 * @param numAligns number of alignments
748 * @param hsp_array an array of HSPs
749 * @param hspcnt the length of hsp_array
750 * @param init_context the initial context to process
751 * @param queryInfo information about the concatenated query
752 * @param localScalingFactor the amount by which this search is scaled
753 *
754 * @return the new list of alignments; or NULL if there is an out-of-memory
755 * error (or if the original array is empty)
756 */
757 static int
s_ResultHspToDistinctAlign(BlastCompo_Alignment ** self,int * numAligns,BlastHSP * hsp_array[],Int4 hspcnt,int init_context,const BlastQueryInfo * queryInfo,double localScalingFactor)758 s_ResultHspToDistinctAlign(BlastCompo_Alignment **self,
759 int *numAligns,
760 BlastHSP * hsp_array[], Int4 hspcnt,
761 int init_context,
762 const BlastQueryInfo* queryInfo,
763 double localScalingFactor)
764 {
765 BlastCompo_Alignment * tail[6]; /* last element in aligns */
766 int hsp_index; /* loop index */
767 int frame_index;
768
769 for (frame_index = 0; frame_index < 6; frame_index++) {
770 tail[frame_index] = NULL;
771 numAligns[frame_index] = 0;
772 }
773
774 for (hsp_index = 0; hsp_index < hspcnt; hsp_index++) {
775 BlastHSP * hsp = hsp_array[hsp_index]; /* current HSP */
776 BlastCompo_Alignment * new_align; /* newly-created alignment */
777 frame_index = hsp->context - init_context;
778 ASSERT(frame_index < 6 && frame_index >= 0);
779 /* Incoming alignments will have coordinates of the query
780 portion relative to a particular query context; they must
781 be shifted for used in the composition_adjustment library.
782 */
783 new_align =
784 BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
785 eDontAdjustMatrix,
786 hsp->query.offset, hsp->query.end, hsp->context,
787 hsp->subject.offset, hsp->subject.end,
788 hsp->subject.frame, hsp);
789 if (new_align == NULL) /* out of memory */
790 return -1;
791 if (tail[frame_index] == NULL) { /* if the list aligns is empty; */
792 /* make new_align the first element in the list */
793 self[frame_index] = new_align;
794 } else {
795 /* otherwise add new_align to the end of the list */
796 tail[frame_index]->next = new_align;
797 }
798 tail[frame_index] = new_align;
799 numAligns[frame_index]++;
800 }
801 return 0;
802 }
803
804
805 /**
806 * Redo a S-W alignment using an x-drop alignment. The result will
807 * usually be the same as the S-W alignment. The call to ALIGN_EX
808 * attempts to force the endpoints of the alignment to match the
809 * optimal endpoints determined by the Smith-Waterman algorithm.
810 * ALIGN_EX is used, so that if the data structures for storing BLAST
811 * alignments are changed, the code will not break
812 *
813 * @param query the query data
814 * @param queryStart start of the alignment in the query sequence
815 * @param queryEnd end of the alignment in the query sequence,
816 * as computed by the Smith-Waterman algorithm
817 * @param subject the subject (database) sequence
818 * @param matchStart start of the alignment in the subject sequence
819 * @param matchEnd end of the alignment in the query sequence,
820 * as computed by the Smith-Waterman algorithm
821 * @param gap_align parameters for a gapped alignment
822 * @param scoringParams Settings for gapped alignment.[in]
823 * @param score score computed by the Smith-Waterman algorithm
824 * @param queryAlignmentExtent length of the alignment in the query sequence,
825 * as computed by the x-drop algorithm
826 * @param matchAlignmentExtent length of the alignment in the subject
827 * sequence, as computed by the x-drop algorithm
828 * @param newScore alignment score computed by the x-drop
829 * algorithm
830 */
831 static void
s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,Int4 queryStart,Int4 queryEnd,BlastCompo_SequenceData * subject,Int4 matchStart,Int4 matchEnd,BlastGapAlignStruct * gap_align,const BlastScoringParameters * scoringParams,Int4 score,Int4 * queryAlignmentExtent,Int4 * matchAlignmentExtent,Int4 * newScore)832 s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,
833 Int4 queryStart,
834 Int4 queryEnd,
835 BlastCompo_SequenceData * subject,
836 Int4 matchStart,
837 Int4 matchEnd,
838 BlastGapAlignStruct* gap_align,
839 const BlastScoringParameters* scoringParams,
840 Int4 score,
841 Int4 * queryAlignmentExtent,
842 Int4 * matchAlignmentExtent,
843 Int4 * newScore)
844 {
845 Int4 XdropAlignScore; /* alignment score obtained using X-dropoff
846 * method rather than Smith-Waterman */
847 Int4 doublingCount = 0; /* number of times X-dropoff had to be
848 * doubled */
849 Int4 gap_x_dropoff_orig = gap_align->gap_x_dropoff;
850
851 GapPrelimEditBlockReset(gap_align->rev_prelim_tback);
852 GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
853 do {
854 XdropAlignScore =
855 ALIGN_EX(&(query->data[queryStart]) - 1,
856 &(subject->data[matchStart]) - 1,
857 queryEnd - queryStart + 1, matchEnd - matchStart + 1,
858 queryAlignmentExtent,
859 matchAlignmentExtent, gap_align->fwd_prelim_tback,
860 gap_align, scoringParams, queryStart - 1, FALSE, FALSE,
861 NULL);
862
863 gap_align->gap_x_dropoff *= 2;
864 doublingCount++;
865 if((XdropAlignScore < score) && (doublingCount < 3)) {
866 GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
867 }
868 } while((XdropAlignScore < score) && (doublingCount < 3));
869
870 gap_align->gap_x_dropoff = gap_x_dropoff_orig;
871 *newScore = XdropAlignScore;
872 }
873
874
875 /**
876 * BLAST-specific information that is associated with a
877 * BlastCompo_MatchingSequence.
878 */
879 typedef struct
880 BlastKappa_SequenceInfo {
881 EBlastProgramType prog_number; /**< identifies the type of blast
882 search being performed. The type
883 of search determines how sequence
884 data should be obtained. */
885 const BlastSeqSrc* seq_src; /**< BLAST sequence data source */
886 BlastSeqSrcGetSeqArg seq_arg; /**< argument to GetSequence method
887 of the BlastSeqSrc (@todo this
888 structure was designed to be
889 allocated on the stack, i.e.: in
890 Kappa_MatchingSequenceInitialize) */
891 } BlastKappa_SequenceInfo;
892
893
894 /** Release the resources associated with a matching sequence. */
895 static void
s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)896 s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
897 {
898 if (self != NULL) {
899 if (self->index >=0) {
900 BlastKappa_SequenceInfo * local_data = self->local_data;
901 if (self->length > 0) {
902 BlastSeqSrcReleaseSequence(local_data->seq_src,
903 &local_data->seq_arg);
904 BlastSequenceBlkFree(local_data->seq_arg.seq);
905 }
906 free(self->local_data);
907 }
908 self->local_data = NULL;
909 }
910 }
911
912 /**
913 * Do a simple gapped extension to the right from the beginning of query and
914 * subject ranges examining only matches and mismatches. The extension stops
915 * when there are more than max_shift mismatches or mismatches or gaps are not
916 * followed by two identical matches. This is a simplified version of the
917 * Danielle and Jean Thierry-Miegs' jumper
918 * alignment implemented in NCBI Magic
919 * http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/Download/Downloads.html
920 *
921 * @param query_seq Query sequence [in]
922 * @param query_len Query length [in]
923 * @param subject_seq Subject sequence [in]
924 * @param subject_len Subject length [in]
925 * @param max_shift Maximum number of mismatches or gaps, extension stops if
926 * this number is reached [in]
927 * @param query_ext_len Extension length on the query [out]
928 * @param subject_ext_len Extension length on the subject [out]
929 * @param align_len Alignment length [out]
930 * @return Number of identical residues
931 */
s_ExtendRight(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)932 static int s_ExtendRight(Uint1* query_seq, int query_len,
933 Uint1* subject_seq, int subject_len,
934 int max_shift,
935 int* query_ext_len, int* subject_ext_len,
936 int* align_len)
937 {
938 int num_identical = 0;
939 int q_pos, s_pos;
940 int gaps_in_query = 0;
941 int gaps_in_subject = 0;
942 q_pos = 0;
943 s_pos = 0;
944 while (q_pos < query_len && s_pos < subject_len) {
945 int n;
946 int match = 0;
947
948 while (q_pos < query_len && s_pos < subject_len
949 && query_seq[q_pos] == subject_seq[s_pos]) {
950
951 num_identical++;
952 q_pos++;
953 s_pos++;
954 }
955
956 /* try to skip mismatches or gaps */
957 for (n=1; n < max_shift && q_pos + n + 1 < query_len
958 && s_pos + n + 1 < subject_len && !match; n++) {
959
960 /* mismatches */
961 if (query_seq[q_pos + n] == subject_seq[s_pos + n]
962 && query_seq[q_pos + n + 1] == subject_seq[s_pos + n + 1]) {
963
964 /* we have already checked that two positions behind mismatches
965 match so we can advance further */
966 q_pos += n + 2;
967 s_pos += n + 2;
968 num_identical += 2;
969 match = 1;
970 }
971
972 /* gap in subject */
973 if (!match && query_seq[q_pos + n] == subject_seq[s_pos]
974 && query_seq[q_pos + n + 1] == subject_seq[s_pos + 1]) {
975
976 q_pos += n + 2;
977 s_pos += 2;
978 num_identical += 2;
979 gaps_in_subject += n;
980 match = 1;
981 }
982
983 /* gap in query */
984 if (!match && query_seq[q_pos] == subject_seq[s_pos + n]
985 && query_seq[q_pos + 1] == subject_seq[s_pos + n + 1]) {
986
987 q_pos += 2;
988 s_pos += n + 2;
989 num_identical += 2;
990 gaps_in_query += n;
991 match = 1;
992 }
993 }
994
995 if (match) {
996 continue;
997 }
998
999 /* exit the loop */
1000 break;
1001 }
1002 *query_ext_len = q_pos;
1003 *subject_ext_len = s_pos;
1004 *align_len = q_pos > s_pos ? q_pos + gaps_in_query : s_pos + gaps_in_subject;
1005
1006 return num_identical;
1007 }
1008
1009
1010 /**
1011 * Extend left from the end of the sequence and subject ranges and count
1012 * identities. The extension stops when there are more than max_shift
1013 * mismatches or mismatches or gaps are not followed by two identical matches.
1014 * See description for s_ExtendRight for more details.
1015 *
1016 * @param query_seq Query sequence [in]
1017 * @param query_len Query length [in]
1018 * @param subject_seq Subject sequence [in]
1019 * @param subject_len Subject length [in]
1020 * @param max_shift Maximum number of mismatches or gaps, extension stops if
1021 * this number is reached [in]
1022 * @param query_ext_len Extension length on the query [out]
1023 * @param subject_ext_len Extension length on the subject [out]
1024 * @param align_len Alignment length [out]
1025 * @return Number of identical residues
1026 */
s_ExtendLeft(Uint1 * query_seq,int query_len,Uint1 * subject_seq,int subject_len,int max_shift,int * query_ext_len,int * subject_ext_len,int * align_len)1027 static int s_ExtendLeft(Uint1* query_seq, int query_len,
1028 Uint1* subject_seq, int subject_len,
1029 int max_shift,
1030 int* query_ext_len, int* subject_ext_len,
1031 int* align_len)
1032 {
1033 int q_pos = query_len - 1;
1034 int s_pos = subject_len - 1;
1035 int num_identical = 0;
1036 int gaps_in_query = 0;
1037 int gaps_in_subject = 0;
1038 while (q_pos >= 0 && s_pos >= 0) {
1039 int n;
1040 int match = 0;
1041
1042 /* process identies */
1043 while (q_pos > 0 && s_pos > 0 && query_seq[q_pos] == subject_seq[s_pos]) {
1044 num_identical++;
1045 q_pos--;
1046 s_pos--;
1047 }
1048
1049 /* try to skip mismatches or gaps */
1050 for (n=1;n < max_shift && q_pos - n - 1 > 0 && s_pos - n - 1 > 0
1051 && !match; n++) {
1052
1053 /* mismatch */
1054 if (query_seq[q_pos - n] == subject_seq[s_pos - n]
1055 && query_seq[q_pos - n - 1] == subject_seq[s_pos - n - 1]) {
1056 q_pos -= n + 2;
1057 s_pos -= n + 2;
1058 num_identical += 2;
1059 match = 1;
1060 }
1061
1062 /* gap in subject */
1063 if (!match && query_seq[q_pos - n] == subject_seq[s_pos]
1064 && query_seq[q_pos - n - 1] == subject_seq[s_pos - 1]) {
1065 q_pos -= n + 2;
1066 s_pos -= 2;
1067 num_identical += 2;
1068 gaps_in_subject += n;
1069 match = 1;
1070 }
1071
1072 /* gap in query */
1073 if (!match && query_seq[q_pos] == subject_seq[s_pos - n]
1074 && query_seq[q_pos - 1] == subject_seq[s_pos - n - 1]) {
1075 q_pos -= 2;
1076 s_pos -= n + 2;
1077 num_identical += 2;
1078 gaps_in_query += n;
1079 match = 1;
1080 }
1081 }
1082
1083 if (match) {
1084 continue;
1085 }
1086
1087 break;
1088 }
1089 *query_ext_len = query_len - q_pos - 1;
1090 *subject_ext_len = subject_len - s_pos - 1;
1091 *align_len += *query_ext_len > *subject_ext_len ?
1092 *query_ext_len + gaps_in_query : *subject_ext_len + gaps_in_subject;
1093
1094 return num_identical;
1095 }
1096
1097
1098 /**
1099 * Get hash for a word of word_size residues assuming 28-letter alphabet
1100 *
1101 * @param data Sequence [in]
1102 * @param word_size Word size [in]
1103 * @return Hash value
1104 */
s_GetHash(const Uint1 * data,int word_size)1105 static Uint8 s_GetHash(const Uint1* data, int word_size)
1106 {
1107 Uint8 hash = 0;
1108 int k;
1109 for (k=0;k < word_size;k++) {
1110 hash <<= 5;
1111 hash += (Int8)data[k];
1112 }
1113 return hash;
1114 }
1115
1116
1117 /**
1118 * Find a local number of identical residues in two aligned sequences by
1119 * finding word matches and doing a simple gapped extensions from the word hits
1120 *
1121 * @param query_seq Query sequence [in]
1122 * @param query_hashes Array of query words with index of each word
1123 * corresponding to word position in the query [in]
1124 * @param query_len Query length [in]
1125 * @param subject_seq Subject sequence [in]
1126 * @param subject_len Subject length [in]
1127 * @param max_shift Maximum number of local mismatches or gaps for extensions
1128 * [in]
1129 * @return Number of identical residues
1130 */
s_FindNumIdentical(Uint1 * query_seq,const Uint8 * query_hashes,int query_len,Uint1 * subject_seq,int subject_len,int max_shift)1131 static int s_FindNumIdentical(Uint1* query_seq,
1132 const Uint8* query_hashes,
1133 int query_len,
1134 Uint1* subject_seq,
1135 int subject_len,
1136 int max_shift)
1137 {
1138 int word_size = 8; /* word size for k-mer matching */
1139 Uint8 hash = 0;
1140 Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
1141 values */
1142 int query_from = 0;
1143 int subject_from = 0;
1144
1145 int s_pos; /* position in the subject sequence */
1146 int num_identical = 0; /* number of identical residues found */
1147 Boolean match = FALSE;
1148
1149 /* if query or subject length is smaller than word size, exit */
1150 if (!query_seq || !query_hashes || !subject_seq
1151 || query_len < word_size || subject_len < word_size) {
1152
1153 return 0;
1154 }
1155
1156 /* for each subject position */
1157 for (s_pos = 0; s_pos < subject_len - word_size; s_pos++) {
1158 int q_pos;
1159
1160 /* find word hash */
1161 if (s_pos == 0 || match) {
1162 hash = s_GetHash(&subject_seq[s_pos], word_size);
1163 }
1164 else {
1165 hash <<= 5;
1166 hash &= mask;
1167 hash += subject_seq[s_pos + word_size - 1];
1168 }
1169
1170 /* find matching query word; index of hash is position of the word
1171 the query */
1172 for (q_pos = query_from;q_pos < query_len - word_size; q_pos++) {
1173 if (query_hashes[q_pos] == hash) {
1174 break;
1175 }
1176 }
1177
1178 /* if match */
1179 if (q_pos < query_len - word_size) {
1180 int query_start = q_pos;
1181 int subject_start = s_pos;
1182
1183 int query_left_len, query_right_len;
1184 int subject_left_len, subject_right_len;
1185 int align_len_left=0, align_len_right=0;
1186
1187 match = TRUE;
1188 num_identical += word_size;
1189
1190 /* extend left from word match */
1191 num_identical += s_ExtendLeft(query_seq + query_start - 1,
1192 query_start - query_from,
1193 subject_seq + subject_start - 1,
1194 subject_start - subject_from,
1195 max_shift,
1196 &query_left_len, &subject_left_len,
1197 &align_len_left);
1198
1199 /* extend right from word match */
1200 num_identical += s_ExtendRight(query_seq + query_start + word_size,
1201 query_len - query_start - word_size,
1202 subject_seq + subject_start + word_size,
1203 subject_len - subject_start - word_size,
1204 max_shift,
1205 &query_right_len, &subject_right_len,
1206 &align_len_right);
1207
1208
1209 /* disregard already matched and extended words when matching
1210 further positions */
1211
1212 query_from = query_start + word_size + query_right_len;
1213 subject_from = subject_start + word_size + subject_right_len;
1214 /* s_pos will be incremented in the loop */
1215 s_pos = subject_from - 1;
1216 }
1217 else {
1218 match = FALSE;
1219 }
1220 }
1221
1222 return num_identical;
1223 }
1224
1225 /**
1226 * Test whether the aligned parts of two sequences that
1227 * have a high-scoring gapless alignment are nearly identical.
1228 *
1229 * First extend from the left end of the query and subject ranges and stop if
1230 * there are too manu mismatches. Then extend from the right end. Then for the
1231 * remaining protion of ths sequences find matching words and extend left and
1232 * right from the word hit. Repeat the last steo until the whole alignment
1233 * ranges are processed.
1234 *
1235 * @params seqData Subject sequence [in]
1236 * @params seqOffse Starting offset of the subject sequence in alignment data
1237 * [in]
1238 * @params queryData Query sequence [in]
1239 * @params queryOffset Starting offset of the query sequence in alignment data
1240 * [in]
1241 * @param query_words Array of query words with word index corresponding to
1242 * word's position in the query [in]
1243 * @param align Alignment data [in]
1244 * @return True if sequence parts are nearly identical, false otherwise
1245 */
1246 static Boolean
s_TestNearIdentical(const BlastCompo_SequenceData * seqData,const int seqOffset,const BlastCompo_SequenceData * queryData,const int queryOffset,const Uint8 * query_words,const BlastCompo_Alignment * align)1247 s_TestNearIdentical(const BlastCompo_SequenceData* seqData,
1248 const int seqOffset,
1249 const BlastCompo_SequenceData* queryData,
1250 const int queryOffset,
1251 const Uint8* query_words,
1252 const BlastCompo_Alignment* align)
1253 {
1254 int qStart = align->queryStart - queryOffset;
1255 /* align->queryEnd points to one position past alignment end */
1256 int qEnd = align->queryEnd - queryOffset - 1;
1257 int sStart = align->matchStart - seqOffset;
1258 int sEnd = align->matchEnd - seqOffset - 1;
1259 const double kMinFractionNearIdentical = 0.96;
1260 int max_shift = 8;
1261
1262 int query_len = qEnd - qStart + 1;
1263 int subject_len = sEnd - sStart + 1;
1264 int align_len = MIN(query_len, subject_len);
1265
1266 int query_left_len = 0;
1267 int subject_left_len = 0;
1268 int query_right_len = 0;
1269 int subject_right_len = 0;
1270 int align_left_len = 0;
1271 int align_right_len = 0;
1272
1273 double fraction_identical;
1274
1275 /* first find number of identies going from the beginning of the query
1276 and subject ranges */
1277 int num_identical = s_ExtendRight(queryData->data + qStart, query_len,
1278 seqData->data + sStart, subject_len,
1279 max_shift,
1280 &query_right_len, &subject_right_len,
1281 &align_right_len);
1282
1283 /* if the whole query range was processed return near identical status */
1284 if (query_right_len >= query_len || subject_right_len >= subject_len) {
1285 fraction_identical = (double)num_identical / (double)align_len;
1286 ASSERT(fraction_identical - 1.0 < 1e-10);
1287 return fraction_identical > kMinFractionNearIdentical;
1288 }
1289
1290 /* find the number of identies going from the end of the query and subject
1291 ranges */
1292 num_identical += s_ExtendLeft(queryData->data + qStart + query_right_len,
1293 query_len - query_right_len,
1294 seqData->data + sStart + subject_right_len,
1295 subject_len - subject_right_len,
1296 max_shift,
1297 &query_left_len, &subject_left_len,
1298 &align_left_len);
1299
1300 /* if the whole alignment ranges where covered, return the near identical
1301 status */
1302 if (query_left_len + query_right_len >= query_len
1303 || subject_left_len + subject_right_len >= subject_len) {
1304
1305 fraction_identical = (double)num_identical / (double)(align_len);
1306 ASSERT(fraction_identical - 1.0 < 1e-10);
1307 return fraction_identical > kMinFractionNearIdentical;
1308 }
1309
1310 /* find the number of identical matches in the middle portion of the
1311 alignment ranges */
1312 num_identical += s_FindNumIdentical(queryData->data + qStart + query_right_len,
1313 query_words + qStart + query_right_len,
1314 query_len - query_left_len - query_right_len,
1315 seqData->data + sStart + subject_right_len,
1316 subject_len - subject_left_len - subject_right_len,
1317 max_shift);
1318
1319 fraction_identical = (double)num_identical / (double)align_len;
1320 ASSERT(fraction_identical - 1.0 < 1e-10);
1321 if (fraction_identical > kMinFractionNearIdentical) {
1322 return TRUE;
1323 }
1324 else {
1325 return FALSE;
1326 }
1327 }
1328
1329
1330 /**
1331 * Initialize a new matching sequence, obtaining information about the
1332 * sequence from the search.
1333 *
1334 * @param self object to be initialized
1335 * @param seqSrc A pointer to a source from which sequence data
1336 * may be obtained
1337 * @param program_number identifies the type of blast search being
1338 * performed.
1339 * @param default_db_genetic_code default genetic code to use when
1340 * subject sequences are translated and there is
1341 * no other guidance on what code to use
1342 * @param subject_index index of the matching sequence in the database
1343 */
1344 static int
s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,EBlastProgramType program_number,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,Int4 subject_index)1345 s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,
1346 EBlastProgramType program_number,
1347 const BlastSeqSrc* seqSrc,
1348 Int4 default_db_genetic_code,
1349 Int4 subject_index)
1350 {
1351 BlastKappa_SequenceInfo * seq_info; /* BLAST-specific sequence
1352 information */
1353 self->length = 0;
1354 self->local_data = NULL;
1355
1356 seq_info = malloc(sizeof(BlastKappa_SequenceInfo));
1357 if (seq_info != NULL) {
1358 self->local_data = seq_info;
1359
1360 seq_info->seq_src = seqSrc;
1361 seq_info->prog_number = program_number;
1362
1363 memset((void*) &seq_info->seq_arg, 0, sizeof(seq_info->seq_arg));
1364 seq_info->seq_arg.oid = self->index = subject_index;
1365 seq_info->seq_arg.check_oid_exclusion = TRUE;
1366
1367 if( program_number == eBlastTypeTblastn ) {
1368 seq_info->seq_arg.encoding = eBlastEncodingNcbi4na;
1369 } else {
1370 seq_info->seq_arg.encoding = eBlastEncodingProtein;
1371 }
1372 if (BlastSeqSrcGetSequence(seqSrc, &seq_info->seq_arg) >= 0) {
1373 self->length =
1374 BlastSeqSrcGetSeqLen(seqSrc, (void*) &seq_info->seq_arg);
1375
1376 /* If the subject is translated and the BlastSeqSrc implementation
1377 * doesn't provide a genetic code string, use the default genetic
1378 * code for all subjects (as in the C toolkit) */
1379 if (Blast_SubjectIsTranslated(program_number) &&
1380 seq_info->seq_arg.seq->gen_code_string == NULL) {
1381 seq_info->seq_arg.seq->gen_code_string =
1382 GenCodeSingletonFind(default_db_genetic_code);
1383 ASSERT(seq_info->seq_arg.seq->gen_code_string);
1384 }
1385 } else {
1386 self->length = 0;
1387 }
1388 }
1389 if (self->length == 0) {
1390 /* Could not obtain the required data */
1391 s_MatchingSequenceRelease(self);
1392 return -1;
1393 } else {
1394 return 0;
1395 }
1396 }
1397
1398
1399 /** NCBIstdaa encoding for 'X' character */
1400 #define BLASTP_MASK_RESIDUE 21
1401 /** Default instructions and mask residue for SEG filtering */
1402 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
1403
1404
1405 /**
1406 * Filter low complexity regions from the sequence data; uses the SEG
1407 * algorithm.
1408 *
1409 * @param seqData data to be filtered
1410 * @param program_name type of search being performed
1411 * @return 0 for success; -1 for out-of-memory
1412 */
1413 static int
s_DoSegSequenceData(BlastCompo_SequenceData * seqData,EBlastProgramType program_name,Boolean * is_seq_biased)1414 s_DoSegSequenceData(BlastCompo_SequenceData * seqData,
1415 EBlastProgramType program_name,
1416 Boolean* is_seq_biased)
1417 {
1418 int status = 0;
1419 BlastSeqLoc* mask_seqloc = NULL;
1420 SBlastFilterOptions* filter_options = NULL;
1421
1422 status = BlastFilteringOptionsFromString(program_name,
1423 BLASTP_MASK_INSTRUCTIONS,
1424 &filter_options, NULL);
1425 if (status == 0) {
1426 status = BlastSetUp_Filter(program_name, seqData->data,
1427 seqData->length, 0, filter_options,
1428 &mask_seqloc, NULL);
1429 filter_options = SBlastFilterOptionsFree(filter_options);
1430 }
1431 if (is_seq_biased) {
1432 *is_seq_biased = (mask_seqloc != NULL);
1433 }
1434 if (status == 0) {
1435 Blast_MaskTheResidues(seqData->data, seqData->length,
1436 FALSE, mask_seqloc, FALSE, 0);
1437 }
1438 if (mask_seqloc != NULL) {
1439 mask_seqloc = BlastSeqLocFree(mask_seqloc);
1440 }
1441 return status;
1442 }
1443
1444
1445 /**
1446 * Obtain a string of translated data
1447 *
1448 * @param self the sequence from which to obtain the data [in]
1449 * @param range the range and translation frame to get [in]
1450 * @param seqData the resulting data [out]
1451 * @param queryData the query sequence [in]
1452 * @param queryOffset offset for align if there are multiple queries
1453 * @param align information about the alignment between query and subject
1454 * @param shouldTestIdentical did alignment pass a preliminary test in
1455 * redo_alignment.c that indicates the sequence
1456 * pieces may be near identical
1457 *
1458 * @return 0 on success; -1 on failure
1459 */
1460 static int
s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1461 s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
1462 const BlastCompo_SequenceRange * range,
1463 BlastCompo_SequenceData * seqData,
1464 const BlastCompo_SequenceRange * q_range,
1465 BlastCompo_SequenceData * queryData,
1466 const Uint8* query_words,
1467 const BlastCompo_Alignment *align,
1468 const Boolean shouldTestIdentical,
1469 const ECompoAdjustModes compo_adjust_mode,
1470 const Boolean isSmithWaterman,
1471 Boolean* subject_maybe_biased)
1472 {
1473 int status = 0;
1474 BlastKappa_SequenceInfo * local_data; /* BLAST-specific
1475 information associated
1476 with the sequence */
1477 Uint1 * translation_buffer; /* a buffer for the translated,
1478 amino-acid sequence */
1479 Int4 translated_length; /* length of the translated sequence */
1480 int translation_frame; /* frame in which to translate */
1481 Uint1 * na_sequence; /* the nucleotide sequence */
1482 int translation_start; /* location in na_sequence to start
1483 translating */
1484 int num_nucleotides; /* the number of nucleotides to be translated */
1485
1486 local_data = self->local_data;
1487 na_sequence = local_data->seq_arg.seq->sequence_start;
1488
1489 /* Initialize seqData to nil, in case this routine fails */
1490 seqData->buffer = NULL;
1491 seqData->data = NULL;
1492 seqData->length = 0;
1493
1494 translation_frame = range->context;
1495 if (translation_frame > 0) {
1496 translation_start = 3 * range->begin;
1497 } else {
1498 translation_start =
1499 self->length - 3 * range->end + translation_frame + 1;
1500 }
1501 num_nucleotides =
1502 3 * (range->end - range->begin) + ABS(translation_frame) - 1;
1503
1504 status = Blast_GetPartialTranslation(na_sequence + translation_start,
1505 num_nucleotides,
1506 (Int2) translation_frame,
1507 local_data->seq_arg.seq->gen_code_string,
1508 &translation_buffer,
1509 &translated_length,
1510 NULL);
1511 if (status == 0) {
1512 seqData->buffer = translation_buffer;
1513 seqData->data = translation_buffer + 1;
1514 seqData->length = translated_length;
1515
1516 if ( !(KAPPA_TBLASTN_NO_SEG_SEQUENCE) ) {
1517 if (compo_adjust_mode
1518 && (!subject_maybe_biased || *subject_maybe_biased)) {
1519
1520 if ( (!shouldTestIdentical)
1521 || (shouldTestIdentical
1522 && (!s_TestNearIdentical(seqData, range->begin,
1523 queryData, q_range->begin,
1524 query_words, align)))) {
1525
1526 status = s_DoSegSequenceData(seqData, eBlastTypeTblastn,
1527 subject_maybe_biased);
1528 if (status != 0) {
1529 free(seqData->buffer);
1530 seqData->buffer = NULL;
1531 seqData->data = NULL;
1532 seqData->length = 0;
1533 }
1534 }
1535 }
1536 }
1537 }
1538 return status;
1539 }
1540
1541
1542 /**
1543 * Get a string of protein data from a protein sequence.
1544 *
1545 * @param self a protein sequence [in]
1546 * @param range the range to get [in]
1547 * @param seqData the resulting data [out]
1548 * @param queryData the query sequence [in]
1549 * @param queryOffset offset for align if there are multiple queries
1550 * @param align information about the alignment
1551 * between query and subject [in]
1552 * @param shouldTestIdentical did alignment pass a preliminary test in
1553 * redo_alignment.c that indicates the sequence
1554 * pieces may be near identical [in]
1555 *
1556 * @return 0 on success; -1 on failure
1557 */
1558 static int
s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1559 s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,
1560 const BlastCompo_SequenceRange * range,
1561 BlastCompo_SequenceData * seqData,
1562 const BlastCompo_SequenceRange * q_range,
1563 BlastCompo_SequenceData * queryData,
1564 const Uint8* query_words,
1565 const BlastCompo_Alignment *align,
1566 const Boolean shouldTestIdentical,
1567 const ECompoAdjustModes compo_adjust_mode,
1568 const Boolean isSmithWaterman,
1569 Boolean* subject_maybe_biased)
1570
1571 {
1572 int status = 0; /* return status */
1573 Int4 idx; /* loop index */
1574 Uint1 *origData; /* the unfiltered data for the sequence */
1575 /* BLAST-specific sequence information */
1576 BlastKappa_SequenceInfo * local_data = self->local_data;
1577 BLAST_SequenceBlk * seq = self->local_data;
1578
1579 if (self->local_data == NULL)
1580 return -1;
1581
1582 seqData->data = NULL;
1583 seqData->length = 0;
1584 /* Copy the entire sequence (necessary for SEG filtering.) */
1585 seqData->buffer = calloc((self->length + 2), sizeof(Uint1));
1586 if (seqData->buffer == NULL) {
1587 return -1;
1588 }
1589 /* First and last characters of the buffer MUST be '\0', which is
1590 * true here because the buffer was allocated using calloc. */
1591 seqData->data = seqData->buffer + 1;
1592 seqData->length = self->length;
1593
1594 origData = (self->index >= 0) ? local_data->seq_arg.seq->sequence
1595 : seq->sequence;
1596 if((self->index < 0) && (align->frame != 0)) {
1597 int i=0, offsets =0;
1598 int f = GET_SEQ_FRAME(align->frame);
1599 int nucl_length = GET_NUCL_LENGTH(self->length);
1600 seqData->length = GET_TRANSLATED_LENGTH(nucl_length, f);
1601 for(; i < f; i++) {
1602 offsets = GET_TRANSLATED_LENGTH(nucl_length, i) +1;
1603 origData += offsets;
1604 }
1605 }
1606 /* Copy the sequence data */
1607 for (idx = 0; idx < seqData->length; idx++) {
1608 seqData->data[idx] = origData[idx];
1609 }
1610
1611 if ( !(KAPPA_BLASTP_NO_SEG_SEQUENCE) ) {
1612 if (compo_adjust_mode
1613 && (!subject_maybe_biased || *subject_maybe_biased)) {
1614
1615 if ( (!shouldTestIdentical)
1616 || (shouldTestIdentical
1617 && (!s_TestNearIdentical(seqData, 0, queryData,
1618 q_range->begin, query_words,
1619 align)))) {
1620
1621 status = s_DoSegSequenceData(seqData, eBlastTypeBlastp,
1622 subject_maybe_biased);
1623 }
1624 }
1625 }
1626 /* Fit the data to the range. */
1627 seqData ->data = &seqData->data[range->begin - 1];
1628 *seqData->data++ = '\0';
1629 seqData ->length = range->end - range->begin;
1630
1631 if (status != 0) {
1632 free(seqData->buffer);
1633 seqData->buffer = NULL;
1634 seqData->data = NULL;
1635 }
1636 return status;
1637 }
1638
1639
1640 /**
1641 * Obtain the sequence data that lies within the given range.
1642 *
1643 * @param self sequence information [in]
1644 * @param range range specifying the range of data [in]
1645 * @param seqData the sequence data obtained [out]
1646 * @param seqData the resulting data [out]
1647 * @param queryData the query sequence [in]
1648 * @param queryOffset offset for align if there are multiple queries
1649 * @param align information about the alignment between query and subject
1650 * @param shouldTestIdentical did alignment pass a preliminary test in
1651 * redo_alignment.c that indicates the sequence
1652 * pieces may be near identical
1653 *
1654 * @return 0 on success; -1 on failure
1655 */
1656 static int
s_SequenceGetRange(const BlastCompo_MatchingSequence * self,const BlastCompo_SequenceRange * s_range,BlastCompo_SequenceData * seqData,const BlastCompo_SequenceData * query,const BlastCompo_SequenceRange * q_range,BlastCompo_SequenceData * queryData,const Uint8 * query_words,const BlastCompo_Alignment * align,const Boolean shouldTestIdentical,const ECompoAdjustModes compo_adjust_mode,const Boolean isSmithWaterman,Boolean * subject_maybe_biased)1657 s_SequenceGetRange(const BlastCompo_MatchingSequence * self,
1658 const BlastCompo_SequenceRange * s_range,
1659 BlastCompo_SequenceData * seqData,
1660 const BlastCompo_SequenceData * query,
1661 const BlastCompo_SequenceRange * q_range,
1662 BlastCompo_SequenceData * queryData,
1663 const Uint8* query_words,
1664 const BlastCompo_Alignment *align,
1665 const Boolean shouldTestIdentical,
1666 const ECompoAdjustModes compo_adjust_mode,
1667 const Boolean isSmithWaterman,
1668 Boolean* subject_maybe_biased)
1669 {
1670 Int4 idx;
1671 BlastKappa_SequenceInfo * seq_info = self->local_data;
1672 Uint1 *origData = query->data + q_range->begin;
1673 /* Copy the query sequence (necessary for SEG filtering.) */
1674 queryData->length = q_range->end - q_range->begin;
1675 queryData->buffer = calloc((queryData->length + 2), sizeof(Uint1));
1676 queryData->data = queryData->buffer + 1;
1677
1678 for (idx = 0; idx < queryData->length; idx++) {
1679 /* Copy the sequence data, replacing occurrences of amino acid
1680 * number 24 (Selenocysteine) with number 3 (Cysteine). */
1681 queryData->data[idx] = (origData[idx] != 24) ? origData[idx] : 3;
1682 }
1683 if (seq_info && seq_info->prog_number == eBlastTypeTblastn) {
1684 /* The sequence must be translated. */
1685 return s_SequenceGetTranslatedRange(self, s_range, seqData,
1686 q_range, queryData, query_words,
1687 align, shouldTestIdentical,
1688 compo_adjust_mode, isSmithWaterman,
1689 subject_maybe_biased);
1690 } else {
1691 return s_SequenceGetProteinRange(self, s_range, seqData,
1692 q_range, queryData, query_words,
1693 align, shouldTestIdentical,
1694 compo_adjust_mode, isSmithWaterman,
1695 subject_maybe_biased);
1696 }
1697 }
1698
1699
1700 /** Data and data-structures needed to perform a gapped alignment */
1701 typedef struct BlastKappa_GappingParamsContext {
1702 const BlastScoringParameters*
1703 scoringParams; /**< scoring parameters for a
1704 gapped alignment */
1705 BlastGapAlignStruct * gap_align; /**< additional parameters for a
1706 gapped alignment */
1707 BlastScoreBlk* sbp; /**< the score block for this search */
1708 double localScalingFactor; /**< the amount by which this
1709 search has been scaled */
1710 EBlastProgramType prog_number; /**< the type of search being
1711 performed */
1712 } BlastKappa_GappingParamsContext;
1713
1714
1715 /**
1716 * Reads a BlastGapAlignStruct that has been used to compute a
1717 * traceback, and return a BlastCompo_Alignment representing the
1718 * alignment. The BlastGapAlignStruct is in coordinates local to the
1719 * ranges being aligned; the resulting alignment is in coordinates w.r.t.
1720 * the whole query and subject.
1721 *
1722 * @param gap_align the BlastGapAlignStruct
1723 * @param *edit_script the edit script from the alignment; on exit
1724 * NULL. The edit_script is usually
1725 * gap_align->edit_script, but we don't want
1726 * an implicit side effect on the gap_align.
1727 * @param query_range the range of the query used in this alignment
1728 * @param subject_range the range of the subject used in this alignment
1729 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1730 *
1731 * @return the new alignment on success or NULL on error
1732 */
1733 static BlastCompo_Alignment *
s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,GapEditScript ** edit_script,BlastCompo_SequenceRange * query_range,BlastCompo_SequenceRange * subject_range,EMatrixAdjustRule matrix_adjust_rule)1734 s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,
1735 GapEditScript ** edit_script,
1736 BlastCompo_SequenceRange * query_range,
1737 BlastCompo_SequenceRange * subject_range,
1738 EMatrixAdjustRule matrix_adjust_rule)
1739 {
1740 /* parameters to BlastCompo_AlignmentNew */
1741 int queryStart, queryEnd, queryIndex, matchStart, matchEnd, frame;
1742 BlastCompo_Alignment * obj; /* the new alignment */
1743
1744 /* In the composition_adjustment library, the query start/end are
1745 indices into the concatenated query, and so must be shifted. */
1746 queryStart = gap_align->query_start + query_range->begin;
1747 queryEnd = gap_align->query_stop + query_range->begin;
1748 queryIndex = query_range->context;
1749 matchStart = gap_align->subject_start + subject_range->begin;
1750 matchEnd = gap_align->subject_stop + subject_range->begin;
1751 frame = subject_range->context;
1752
1753 obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
1754 queryStart, queryEnd, queryIndex,
1755 matchStart, matchEnd, frame,
1756 *edit_script);
1757 if (obj != NULL) {
1758 *edit_script = NULL;
1759 }
1760 return obj;
1761 }
1762
1763
1764 /** A callback used when performing SmithWaterman alignments:
1765 * Calculate the traceback for one alignment by performing an x-drop
1766 * alignment in the forward direction, possibly increasing the x-drop
1767 * parameter until the desired score is attained.
1768 *
1769 * The start, end and score of the alignment should be obtained
1770 * using the Smith-Waterman algorithm before this routine is called.
1771 *
1772 * @param *pnewAlign the new alignment
1773 * @param *pqueryEnd on entry, the end of the alignment in the
1774 * query, as computed by the Smith-Waterman
1775 * algorithm. On exit, the end as computed by
1776 * the x-drop algorithm
1777 * @param *pmatchEnd like as *pqueryEnd, but for the subject
1778 * sequence
1779 * @param queryStart the starting point in the query
1780 * @param matchStart the starting point in the subject
1781 * @param score the score of the alignment, as computed by
1782 * the Smith-Waterman algorithm
1783 * @param query query sequence data
1784 * @param query_range range of this query in the concatenated
1785 * query
1786 * @param ccat_query_length total length of the concatenated query
1787 * @param subject subject sequence data
1788 * @param subject_range range of subject_data in the translated
1789 * query, in amino acid coordinates
1790 * @param full_subject_length length of the full subject sequence
1791 * @param gapping_params parameters used to compute gapped
1792 * alignments
1793 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1794 *
1795 * @returns 0 (posts a fatal error if it fails)
1796 * @sa new_xdrop_align_type
1797 */
1798 static int
s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,Int4 * pqueryEnd,Int4 * pmatchEnd,Int4 queryStart,Int4 matchStart,Int4 score,BlastCompo_SequenceData * query,BlastCompo_SequenceRange * query_range,Int4 ccat_query_length,BlastCompo_SequenceData * subject,BlastCompo_SequenceRange * subject_range,Int4 full_subject_length,BlastCompo_GappingParams * gapping_params,EMatrixAdjustRule matrix_adjust_rule)1799 s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
1800 Int4 * pqueryEnd, Int4 *pmatchEnd,
1801 Int4 queryStart, Int4 matchStart, Int4 score,
1802 BlastCompo_SequenceData * query,
1803 BlastCompo_SequenceRange * query_range,
1804 Int4 ccat_query_length,
1805 BlastCompo_SequenceData * subject,
1806 BlastCompo_SequenceRange * subject_range,
1807 Int4 full_subject_length,
1808 BlastCompo_GappingParams * gapping_params,
1809 EMatrixAdjustRule matrix_adjust_rule)
1810 {
1811 Int4 newScore;
1812 /* Extent of the alignment as computed by an x-drop alignment
1813 * (usually the same as (queryEnd - queryStart) and (matchEnd -
1814 * matchStart)) */
1815 Int4 queryExtent, matchExtent;
1816 BlastCompo_Alignment * obj = NULL; /* the new object */
1817 /* BLAST-specific parameters needed compute an X-drop alignment */
1818 BlastKappa_GappingParamsContext * context = gapping_params->context;
1819 /* Auxiliarly structure for computing gapped alignments */
1820 BlastGapAlignStruct * gap_align = context->gap_align;
1821 /* Scoring parameters for gapped alignments */
1822 const BlastScoringParameters* scoringParams = context->scoringParams;
1823 /* A structure containing the traceback of a gapped alignment */
1824 GapEditScript* editScript = NULL;
1825
1826 /* suppress unused parameter warnings; this is a callback
1827 function, so these parameter cannot be deleted */
1828 (void) ccat_query_length;
1829 (void) full_subject_length;
1830
1831 gap_align->gap_x_dropoff = gapping_params->x_dropoff;
1832
1833 s_SWFindFinalEndsUsingXdrop(query, queryStart, *pqueryEnd,
1834 subject, matchStart, *pmatchEnd,
1835 gap_align, scoringParams,
1836 score, &queryExtent, &matchExtent,
1837 &newScore);
1838 *pqueryEnd = queryStart + queryExtent;
1839 *pmatchEnd = matchStart + matchExtent;
1840
1841 editScript =
1842 Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback,
1843 gap_align->fwd_prelim_tback);
1844 if (editScript != NULL) {
1845 /* Shifted values of the endpoints */
1846 Int4 aqueryStart = queryStart + query_range->begin;
1847 Int4 aqueryEnd = *pqueryEnd + query_range->begin;
1848 Int4 amatchStart = matchStart + subject_range->begin;
1849 Int4 amatchEnd = *pmatchEnd + subject_range->begin;
1850
1851 obj = BlastCompo_AlignmentNew(newScore, matrix_adjust_rule,
1852 aqueryStart, aqueryEnd,
1853 query_range->context,
1854 amatchStart, amatchEnd,
1855 subject_range->context, editScript);
1856 if (obj == NULL) {
1857 GapEditScriptDelete(editScript);
1858 }
1859 }
1860 *pnewAlign = obj;
1861
1862 return obj != NULL ? 0 : -1;
1863 }
1864
1865
1866 /**
1867 * A callback: calculate the traceback for one alignment by
1868 * performing an x-drop alignment in both directions
1869 *
1870 * @param in_align the existing alignment, without traceback
1871 * @param matrix_adjust_rule the rule used to compute the scoring matrix
1872 * @param query_data query sequence data
1873 * @param query_range range of this query in the concatenated
1874 * query
1875 * @param ccat_query_length total length of the concatenated query
1876 * @param subject_data subject sequence data
1877 * @param subject_range range of subject_data in the translated
1878 * query, in amino acid coordinates
1879 * @param full_subject_length length of the full subject sequence
1880 * @param gapping_params parameters used to compute gapped
1881 * alignments
1882 * @sa redo_one_alignment_type
1883 */
1884 static BlastCompo_Alignment *
s_RedoOneAlignment(BlastCompo_Alignment * in_align,EMatrixAdjustRule matrix_adjust_rule,BlastCompo_SequenceData * query_data,BlastCompo_SequenceRange * query_range,int ccat_query_length,BlastCompo_SequenceData * subject_data,BlastCompo_SequenceRange * subject_range,int full_subject_length,BlastCompo_GappingParams * gapping_params)1885 s_RedoOneAlignment(BlastCompo_Alignment * in_align,
1886 EMatrixAdjustRule matrix_adjust_rule,
1887 BlastCompo_SequenceData * query_data,
1888 BlastCompo_SequenceRange * query_range,
1889 int ccat_query_length,
1890 BlastCompo_SequenceData * subject_data,
1891 BlastCompo_SequenceRange * subject_range,
1892 int full_subject_length,
1893 BlastCompo_GappingParams * gapping_params)
1894 {
1895 int status; /* return code */
1896 Int4 q_start, s_start; /* starting point in query and subject */
1897 /* BLAST-specific parameters needed to compute a gapped alignment */
1898 BlastKappa_GappingParamsContext * context = gapping_params->context;
1899 /* Auxiliary structure for computing gapped alignments */
1900 BlastGapAlignStruct* gapAlign = context->gap_align;
1901 /* The preliminary gapped HSP that were are recomputing */
1902 BlastHSP * hsp = in_align->context;
1903 Boolean fence_hit = FALSE;
1904
1905 /* suppress unused parameter warnings; this is a callback
1906 function, so these parameter cannot be deleted */
1907 (void) ccat_query_length;
1908 (void) full_subject_length;
1909
1910 /* Use the starting point supplied by the HSP. */
1911 q_start = hsp->query.gapped_start - query_range->begin;
1912 s_start = hsp->subject.gapped_start - subject_range->begin;
1913
1914 gapAlign->gap_x_dropoff = gapping_params->x_dropoff;
1915
1916 /*
1917 * Previously, last argument was NULL which could cause problems for
1918 * tblastn.
1919 */
1920 status =
1921 BLAST_GappedAlignmentWithTraceback(context->prog_number,
1922 query_data->data,
1923 subject_data->data, gapAlign,
1924 context->scoringParams,
1925 q_start, s_start,
1926 query_data->length,
1927 subject_data->length,
1928 &fence_hit);
1929 if (status == 0) {
1930 return s_NewAlignmentFromGapAlign(gapAlign, &gapAlign->edit_script,
1931 query_range, subject_range,
1932 matrix_adjust_rule);
1933 } else {
1934 return NULL;
1935 }
1936 }
1937
1938
1939 /**
1940 * A BlastKappa_SavedParameters holds the value of certain search
1941 * parameters on entry to RedoAlignmentCore. These values are
1942 * restored on exit.
1943 */
1944 typedef struct BlastKappa_SavedParameters {
1945 Int4 gap_open; /**< a penalty for the existence of a gap */
1946 Int4 gapExtend; /**< a penalty for each residue in the
1947 gap */
1948 double scale_factor; /**< the original scale factor */
1949 Int4 **origMatrix; /**< The original matrix values */
1950 double original_expect_value; /**< expect value on entry */
1951 /** copy of the original gapped Karlin-Altschul block
1952 * corresponding to the first context */
1953 Blast_KarlinBlk** kbp_gap_orig;
1954 Int4 num_queries; /**< Number of queries in this search */
1955 } BlastKappa_SavedParameters;
1956
1957
1958 /**
1959 * Release the data associated with a BlastKappa_SavedParameters and
1960 * delete the object
1961 * @param searchParams the object to be deleted [in][out]
1962 */
1963 static void
s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)1964 s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)
1965 {
1966 /* for convenience, remove one level of indirection from searchParams */
1967 BlastKappa_SavedParameters *sp = *searchParams;
1968
1969 if (sp != NULL) {
1970 if (sp->kbp_gap_orig != NULL) {
1971 int i;
1972 for (i = 0; i < sp->num_queries; i++) {
1973 if (sp->kbp_gap_orig[i] != NULL)
1974 Blast_KarlinBlkFree(sp->kbp_gap_orig[i]);
1975 }
1976 free(sp->kbp_gap_orig);
1977 }
1978 if (sp->origMatrix != NULL)
1979 Nlm_Int4MatrixFree(&sp->origMatrix);
1980 }
1981 sfree(*searchParams);
1982 *searchParams = NULL;
1983 }
1984
1985
1986 /**
1987 * Create a new instance of BlastKappa_SavedParameters
1988 *
1989 * @param rows number of rows in the scoring matrix
1990 * @param numQueries number of queries in this search
1991 * @param compo_adjust_mode if >0, use composition-based statistics
1992 * @param positionBased if true, the search is position-based
1993 */
1994 static BlastKappa_SavedParameters *
s_SavedParametersNew(Int4 rows,Int4 numQueries,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)1995 s_SavedParametersNew(Int4 rows,
1996 Int4 numQueries,
1997 ECompoAdjustModes compo_adjust_mode,
1998 Boolean positionBased)
1999 {
2000 int i;
2001 BlastKappa_SavedParameters *sp; /* the new object */
2002 sp = malloc(sizeof(BlastKappa_SavedParameters));
2003
2004 if (sp == NULL) {
2005 goto error_return;
2006 }
2007 sp->kbp_gap_orig = NULL;
2008 sp->origMatrix = NULL;
2009
2010 sp->kbp_gap_orig = calloc(numQueries, sizeof(Blast_KarlinBlk*));
2011 if (sp->kbp_gap_orig == NULL) {
2012 goto error_return;
2013 }
2014 sp->num_queries = numQueries;
2015 for (i = 0; i < numQueries; i++) {
2016 sp->kbp_gap_orig[i] = NULL;
2017 }
2018 if (compo_adjust_mode != eNoCompositionBasedStats) {
2019 if (positionBased) {
2020 sp->origMatrix = Nlm_Int4MatrixNew(rows, BLASTAA_SIZE);
2021 } else {
2022 sp->origMatrix = Nlm_Int4MatrixNew(BLASTAA_SIZE, BLASTAA_SIZE);
2023 }
2024 if (sp->origMatrix == NULL)
2025 goto error_return;
2026 }
2027 return sp;
2028 error_return:
2029 s_SavedParametersFree(&sp);
2030 return NULL;
2031 }
2032
2033
2034 /**
2035 * Record the initial value of the search parameters that are to be
2036 * adjusted.
2037 *
2038 * @param searchParams holds the recorded values [out]
2039 * @param sbp a score block [in]
2040 * @param scoring gapped alignment parameters [in]
2041 * @param query_length length of the concatenated query [in]
2042 * @param compo_adjust_mode composition adjustment mode [in]
2043 * @param positionBased is this search position-based [in]
2044 */
2045 static int
s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,BlastScoreBlk * sbp,const BlastScoringParameters * scoring,int query_length,ECompoAdjustModes compo_adjust_mode,Boolean positionBased)2046 s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,
2047 BlastScoreBlk* sbp,
2048 const BlastScoringParameters* scoring,
2049 int query_length,
2050 ECompoAdjustModes compo_adjust_mode,
2051 Boolean positionBased)
2052 {
2053 int i;
2054
2055 searchParams->gap_open = scoring->gap_open;
2056 searchParams->gapExtend = scoring->gap_extend;
2057 searchParams->scale_factor = scoring->scale_factor;
2058
2059 for (i = 0; i < searchParams->num_queries; i++) {
2060 if (sbp->kbp_gap[i] != NULL) {
2061 /* There is a kbp_gap for query i and it must be copied */
2062 searchParams->kbp_gap_orig[i] = Blast_KarlinBlkNew();
2063 if (searchParams->kbp_gap_orig[i] == NULL) {
2064 return -1;
2065 }
2066 Blast_KarlinBlkCopy(searchParams->kbp_gap_orig[i],
2067 sbp->kbp_gap[i]);
2068 }
2069 }
2070
2071 if (compo_adjust_mode != eNoCompositionBasedStats) {
2072 Int4 **matrix; /* scoring matrix */
2073 int j; /* iteration index */
2074 int rows; /* number of rows in matrix */
2075 if (positionBased) {
2076 matrix = sbp->psi_matrix->pssm->data;
2077 rows = query_length;
2078 } else {
2079 matrix = sbp->matrix->data;
2080 rows = BLASTAA_SIZE;
2081 }
2082
2083 for (i = 0; i < rows; i++) {
2084 for (j = 0; j < BLASTAA_SIZE; j++) {
2085 searchParams->origMatrix[i][j] = matrix[i][j];
2086 }
2087 }
2088 }
2089 return 0;
2090 }
2091
2092
2093 /**
2094 * Rescale the search parameters in the search object and options
2095 * object to obtain more precision.
2096 *
2097 * @param sbp score block to be rescaled
2098 * @param sp scoring parameters to be rescaled
2099 * @param num_queries number of queries in this search
2100 * @param scale_factor amount by which to scale this search
2101 */
2102 static void
s_RescaleSearch(BlastScoreBlk * sbp,BlastScoringParameters * sp,int num_queries,double scale_factor)2103 s_RescaleSearch(BlastScoreBlk* sbp,
2104 BlastScoringParameters* sp,
2105 int num_queries,
2106 double scale_factor)
2107 {
2108 int i;
2109 for (i = 0; i < num_queries; i++) {
2110 if (sbp->kbp_gap[i] != NULL) {
2111 Blast_KarlinBlk * kbp = sbp->kbp_gap[i];
2112 kbp->Lambda /= scale_factor;
2113 kbp->logK = log(kbp->K);
2114 }
2115 }
2116
2117 sp->gap_open = BLAST_Nint(sp->gap_open * scale_factor);
2118 sp->gap_extend = BLAST_Nint(sp->gap_extend * scale_factor);
2119 sp->scale_factor = scale_factor;
2120 }
2121
2122
2123 /**
2124 * Restore the parameters that were adjusted to their original values.
2125 *
2126 * @param sbp the score block to be restored
2127 * @param scoring the scoring parameters to be restored
2128 * @param searchParams the initial recorded values of the parameters
2129 * @param query_length the concatenated query length
2130 * @param positionBased is this search position-based
2131 * @param compo_adjust_mode mode of composition adjustment
2132 */
2133 static void
s_RestoreSearch(BlastScoreBlk * sbp,BlastScoringParameters * scoring,const BlastKappa_SavedParameters * searchParams,int query_length,Boolean positionBased,ECompoAdjustModes compo_adjust_mode)2134 s_RestoreSearch(BlastScoreBlk* sbp,
2135 BlastScoringParameters* scoring,
2136 const BlastKappa_SavedParameters * searchParams,
2137 int query_length,
2138 Boolean positionBased,
2139 ECompoAdjustModes compo_adjust_mode)
2140 {
2141 int i;
2142
2143 scoring->gap_open = searchParams->gap_open;
2144 scoring->gap_extend = searchParams->gapExtend;
2145 scoring->scale_factor = searchParams->scale_factor;
2146
2147 for (i = 0; i < searchParams->num_queries; i++) {
2148 if (sbp->kbp_gap[i] != NULL) {
2149 Blast_KarlinBlkCopy(sbp->kbp_gap[i],
2150 searchParams->kbp_gap_orig[i]);
2151 }
2152 }
2153 if(compo_adjust_mode != eNoCompositionBasedStats) {
2154 int j; /* iteration index */
2155 Int4 ** matrix; /* matrix to be restored */
2156 int rows; /* number of rows in the matrix */
2157
2158 if (positionBased) {
2159 matrix = sbp->psi_matrix->pssm->data;
2160 rows = query_length;
2161 } else {
2162 matrix = sbp->matrix->data;
2163 rows = BLASTAA_SIZE;
2164 }
2165 for (i = 0; i < rows; i++) {
2166 for (j = 0; j < BLASTAA_SIZE; j++) {
2167 matrix[i][j] = searchParams->origMatrix[i][j];
2168 }
2169 }
2170 }
2171 }
2172
2173
2174 /**
2175 * Initialize an object of type Blast_MatrixInfo.
2176 *
2177 * @param self object being initialized
2178 * @param queryBlk the query sequence data
2179 * @param sbp score block for this search
2180 * @param scale_factor amount by which ungapped parameters should be
2181 * scaled
2182 * @param matrixName name of the matrix
2183 */
2184 static int
s_MatrixInfoInit(Blast_MatrixInfo * self,BLAST_SequenceBlk * queryBlk,BlastScoreBlk * sbp,double scale_factor,const char * matrixName)2185 s_MatrixInfoInit(Blast_MatrixInfo * self,
2186 BLAST_SequenceBlk* queryBlk,
2187 BlastScoreBlk* sbp,
2188 double scale_factor,
2189 const char * matrixName)
2190 {
2191 int status = 0; /* return status */
2192 int lenName; /* length of matrixName as a string */
2193
2194 /* copy the matrix name (strdup is not standard C) */
2195 lenName = strlen(matrixName);
2196 if (NULL == (self->matrixName = malloc(lenName + 1))) {
2197 return -1;
2198 }
2199 memcpy(self->matrixName, matrixName, lenName + 1);
2200
2201 if (self->positionBased) {
2202 status = s_GetPosBasedStartFreqRatios(self->startFreqRatios,
2203 queryBlk->length,
2204 queryBlk->sequence,
2205 matrixName,
2206 sbp->psi_matrix->freq_ratios);
2207 if (status == 0) {
2208 status = s_ScalePosMatrix(self->startMatrix, matrixName,
2209 sbp->psi_matrix->freq_ratios,
2210 queryBlk->sequence,
2211 queryBlk->length, sbp, scale_factor);
2212 self->ungappedLambda = sbp->kbp_psi[0]->Lambda / scale_factor;
2213 }
2214 } else {
2215 self->ungappedLambda = sbp->kbp_ideal->Lambda / scale_factor;
2216 status = s_GetStartFreqRatios(self->startFreqRatios, matrixName);
2217 if (status == 0) {
2218 Blast_Int4MatrixFromFreq(self->startMatrix, self->cols,
2219 self->startFreqRatios,
2220 self->ungappedLambda);
2221 }
2222 }
2223 return status;
2224 }
2225
2226
2227 /* Create an array of 8-mers for a sequence, such that index of each 8-mer
2228 is the same as its position in the query */
2229 static int
s_CreateWordArray(const Uint1 * seq_data,Int4 seq_len,Uint8 ** words)2230 s_CreateWordArray(const Uint1* seq_data, Int4 seq_len, Uint8** words)
2231 {
2232 int word_size = 8; /* word size for k-mer matching */
2233 Uint8* query_hashes; /* list of hashes for query words */
2234 Uint8 mask = NCBI_CONST_UINT8(0xFFFFFFFFFF); /* mask for computing hash
2235 values */
2236 int i;
2237
2238 /* if query or subject length is smaller than word size, exit */
2239 if (!seq_data || !words || seq_len < word_size) {
2240 return -1;
2241 }
2242
2243 query_hashes = (Uint8*)calloc((seq_len - word_size + 1),
2244 sizeof(Uint8));
2245 *words = query_hashes;
2246
2247 if (!query_hashes) {
2248 return -1;
2249 }
2250
2251
2252 /* find query word hashes */
2253 query_hashes[0] = s_GetHash(&seq_data[0], word_size);
2254 for (i = 1; i < seq_len - word_size; i++) {
2255 query_hashes[i] = query_hashes[i - 1];
2256 query_hashes[i] <<= 5;
2257 query_hashes[i] &= mask;
2258 query_hashes[i] += (Uint8)seq_data[i + word_size - 1];
2259 }
2260
2261 return 0;
2262 }
2263
2264
s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo ** query_info,int num_queries)2265 static void s_FreeBlastCompo_QueryInfoArray(BlastCompo_QueryInfo** query_info,
2266 int num_queries)
2267 {
2268 int i;
2269
2270 if (!query_info) {
2271 return;
2272 }
2273
2274 for (i = 0;i < num_queries;i++) {
2275 if ((*query_info)[i].words) {
2276 free((*query_info)[i].words);
2277 }
2278 }
2279
2280 free(*query_info);
2281 *query_info = NULL;
2282 }
2283
2284 /**
2285 * Save information about all queries in an array of objects of type
2286 * BlastCompo_QueryInfo.
2287 *
2288 * @param query_data query sequence data
2289 * @param blast_query_info information about all queries, as an
2290 * internal blast data structure
2291 *
2292 * @return the new array on success, or NULL on error
2293 */
2294 static BlastCompo_QueryInfo *
s_GetQueryInfo(Uint1 * query_data,const BlastQueryInfo * blast_query_info,Boolean skip)2295 s_GetQueryInfo(Uint1 * query_data, const BlastQueryInfo * blast_query_info, Boolean skip)
2296 {
2297 int i; /* loop index */
2298 BlastCompo_QueryInfo *
2299 compo_query_info; /* the new array */
2300 int num_queries; /* the number of queries/elements in
2301 compo_query_info */
2302
2303 num_queries = blast_query_info->last_context + 1;
2304 compo_query_info = calloc(num_queries, sizeof(BlastCompo_QueryInfo));
2305 if (compo_query_info != NULL) {
2306 for (i = 0; i < num_queries; i++) {
2307 BlastCompo_QueryInfo * query_info = &compo_query_info[i];
2308 const BlastContextInfo * query_context = &blast_query_info->contexts[i];
2309
2310 query_info->eff_search_space =
2311 (double) query_context->eff_searchsp;
2312 query_info->origin = query_context->query_offset;
2313 query_info->seq.data = &query_data[query_info->origin];
2314 query_info->seq.length = query_context->query_length;
2315 query_info->words = NULL;
2316
2317 s_CreateWordArray(query_info->seq.data, query_info->seq.length,
2318 &query_info->words);
2319 if (! skip) {
2320 Blast_ReadAaComposition(&query_info->composition, BLASTAA_SIZE,
2321 query_info->seq.data,
2322 query_info->seq.length);
2323 }
2324 }
2325 }
2326 return compo_query_info;
2327 }
2328
2329
2330 /**
2331 * Create a new object of type BlastCompo_GappingParams. The new
2332 * object contains the parameters needed by the composition adjustment
2333 * library to compute a gapped alignment.
2334 *
2335 * @param context the data structures needed by callback functions
2336 * that perform the gapped alignments.
2337 * @param extendParams parameters used for a gapped extension
2338 * @param num_queries the number of queries in the concatenated query
2339 */
2340 static BlastCompo_GappingParams *
s_GappingParamsNew(BlastKappa_GappingParamsContext * context,const BlastExtensionParameters * extendParams,int num_queries)2341 s_GappingParamsNew(BlastKappa_GappingParamsContext * context,
2342 const BlastExtensionParameters* extendParams,
2343 int num_queries)
2344 {
2345 int i;
2346 double min_lambda = DBL_MAX; /* smallest gapped Lambda */
2347 const BlastScoringParameters * scoring = context->scoringParams;
2348 const BlastExtensionOptions * options = extendParams->options;
2349 /* The new object */
2350 BlastCompo_GappingParams * gapping_params = NULL;
2351
2352 gapping_params = malloc(sizeof(BlastCompo_GappingParams));
2353 if (gapping_params == NULL)
2354 return NULL;
2355
2356 gapping_params->gap_open = scoring->gap_open;
2357 gapping_params->gap_extend = scoring->gap_extend;
2358 gapping_params->context = context;
2359
2360 for (i = 0; i < num_queries; i++) {
2361 if (context->sbp->kbp_gap[i] != NULL &&
2362 context->sbp->kbp_gap[i]->Lambda < min_lambda) {
2363 min_lambda = context->sbp->kbp_gap[i]->Lambda;
2364 }
2365 }
2366 gapping_params->x_dropoff = (Int4)
2367 MAX(options->gap_x_dropoff_final*NCBIMATH_LN2 / min_lambda,
2368 extendParams->gap_x_dropoff_final);
2369 context->gap_align->gap_x_dropoff = gapping_params->x_dropoff;
2370
2371 return gapping_params;
2372 }
2373
2374
2375 /** Callbacks used by the Blast_RedoOneMatch* routines */
2376 static const Blast_RedoAlignCallbacks
2377 redo_align_callbacks = {
2378 s_CalcLambda, s_SequenceGetRange, s_RedoOneAlignment,
2379 s_NewAlignmentUsingXdrop, s_FreeEditScript
2380 };
2381
2382
2383 /* Bit score per alignment position threshold for preliminaru near identical
2384 test */
2385 #define NEAR_IDENTICAL_BITS_PER_POSITION (1.74)
2386
2387 /**
2388 * Read the parameters required for the Blast_RedoOneMatch* functions from
2389 * the corresponding parameters in standard BLAST datatypes. Return a new
2390 * object representing these parameters.
2391 */
2392 static Blast_RedoAlignParams *
s_GetAlignParams(BlastKappa_GappingParamsContext * context,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,const BlastHitSavingParameters * hitParams,const BlastExtensionParameters * extendParams)2393 s_GetAlignParams(BlastKappa_GappingParamsContext * context,
2394 BLAST_SequenceBlk * queryBlk,
2395 const BlastQueryInfo* queryInfo,
2396 const BlastHitSavingParameters* hitParams,
2397 const BlastExtensionParameters* extendParams)
2398 {
2399 int status = 0; /* status code */
2400 int rows; /* number of rows in the scoring matrix */
2401 int cutoff_s; /* cutoff score for saving an alignment */
2402 double cutoff_e; /* cutoff evalue for saving an alignment */
2403 BlastCompo_GappingParams *
2404 gapping_params = NULL; /* parameters needed to compute a gapped
2405 alignment */
2406 Blast_MatrixInfo *
2407 scaledMatrixInfo; /* information about the scoring matrix */
2408 /* does this kind of search translate the database sequence */
2409 int subject_is_translated = (context->prog_number == eBlastTypeTblastn) || (context->prog_number == eBlastTypeRpsTblastn);
2410 int query_is_translated = context->prog_number == eBlastTypeBlastx;
2411 /* is this a positiion-based search */
2412 Boolean positionBased = (Boolean) (context->sbp->psi_matrix != NULL);
2413 /* will BLAST_LinkHsps be called to assign e-values */
2414 Boolean do_link_hsps = (hitParams->do_sum_stats);
2415 ECompoAdjustModes compo_adjust_mode =
2416 (ECompoAdjustModes) extendParams->options->compositionBasedStats;
2417
2418 /* per position bit score cutoff for testing whether sequences are
2419 near identical */
2420 double near_identical_cutoff_bits = NEAR_IDENTICAL_BITS_PER_POSITION;
2421
2422 /* score block is already scaled by context->localScalingFactor */
2423 double near_identical_cutoff=0;
2424 Int4 index;
2425 for (index = queryInfo->first_context;
2426 index <= queryInfo->last_context; ++index) {
2427
2428 if ((queryInfo->contexts[index].is_valid)) {
2429 near_identical_cutoff =
2430 (near_identical_cutoff_bits * NCBIMATH_LN2)
2431 / context->sbp->kbp_gap[index]->Lambda;
2432 break;
2433 }
2434 }
2435
2436 if (do_link_hsps) {
2437 ASSERT(hitParams->link_hsp_params != NULL);
2438 cutoff_s =
2439 (int) (hitParams->cutoff_score_min * context->localScalingFactor);
2440 } else {
2441 /* There is no cutoff score; we consider e-values instead */
2442 cutoff_s = 1;
2443 }
2444 cutoff_e = hitParams->options->expect_value;
2445 rows = positionBased ? queryInfo->max_length : BLASTAA_SIZE;
2446 scaledMatrixInfo = Blast_MatrixInfoNew(rows, BLASTAA_SIZE, positionBased);
2447 status = s_MatrixInfoInit(scaledMatrixInfo, queryBlk, context->sbp,
2448 context->localScalingFactor,
2449 context->scoringParams->options->matrix);
2450 if (status != 0) {
2451 return NULL;
2452 }
2453 gapping_params = s_GappingParamsNew(context, extendParams,
2454 queryInfo->last_context + 1);
2455 if (gapping_params == NULL) {
2456 return NULL;
2457 } else {
2458 return
2459 Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
2460 compo_adjust_mode, positionBased,
2461 query_is_translated,
2462 subject_is_translated,
2463 queryInfo->max_length, cutoff_s, cutoff_e,
2464 do_link_hsps, &redo_align_callbacks,
2465 near_identical_cutoff);
2466 }
2467 }
2468
2469
2470 /**
2471 * Convert an array of BlastCompo_Heap objects to a BlastHSPResults structure.
2472 *
2473 * @param results BLAST core external results structure (pre-SeqAlign)
2474 * [out]
2475 * @param heaps an array of BlastCompo_Heap objects
2476 * @param hitlist_size size of each list in the results structure above [in]
2477 */
2478 static void
s_FillResultsFromCompoHeaps(BlastHSPResults * results,BlastCompo_Heap heaps[],Int4 hitlist_size)2479 s_FillResultsFromCompoHeaps(BlastHSPResults * results,
2480 BlastCompo_Heap heaps[],
2481 Int4 hitlist_size)
2482 {
2483 int query_index; /* loop index */
2484 int num_queries; /* Number of queries in this search */
2485
2486 num_queries = results->num_queries;
2487 for (query_index = 0; query_index < num_queries; query_index++) {
2488 BlastHSPList* hsp_list;
2489 BlastHitList* hitlist;
2490 BlastCompo_Heap * heap = &heaps[query_index];
2491
2492 results->hitlist_array[query_index] = Blast_HitListNew(hitlist_size);
2493 hitlist = results->hitlist_array[query_index];
2494
2495 while (NULL != (hsp_list = BlastCompo_HeapPop(heap))) {
2496 Blast_HitListUpdate(hitlist, hsp_list);
2497 }
2498 }
2499 Blast_HSPResultsReverseOrder(results);
2500 }
2501
2502
2503 /** Remove all matches from a BlastCompo_Heap. */
s_ClearHeap(BlastCompo_Heap * self)2504 static void s_ClearHeap(BlastCompo_Heap * self)
2505 {
2506 BlastHSPList* hsp_list = NULL; /* an element of the heap */
2507
2508 while (NULL != (hsp_list = BlastCompo_HeapPop(self))) {
2509 hsp_list = Blast_HSPListFree(hsp_list);
2510 }
2511 }
2512
2513 extern void
2514 BLAST_SetupPartialFetching(EBlastProgramType program_number,
2515 BlastSeqSrc* seq_src,
2516 const BlastHSPList** hsp_list,
2517 Int4 num_hsplists);
2518
2519 /**
2520 * Free a BlastGapAlignStruct copy created by s_BlastGapAlignStruct_Copy
2521 *
2522 * @param copy Pointer to BlastGapAlignStruct to be freed
2523 */
s_BlastGapAlignStruct_Free(BlastGapAlignStruct * copy)2524 static void s_BlastGapAlignStruct_Free(BlastGapAlignStruct* copy)
2525 {
2526 {
2527 while (copy->state_struct != NULL) {
2528 GapStateArrayStruct* cur = copy->state_struct;
2529 copy->state_struct = copy->state_struct->next;
2530 if (cur->state_array) {
2531 sfree(cur->state_array);
2532 }
2533 if (cur) {
2534 sfree(cur);
2535 }
2536 }
2537 }
2538 {
2539 if (copy->edit_script != NULL) {
2540 if (copy->edit_script->op_type) {
2541 sfree(copy->edit_script->op_type);
2542 }
2543 if (copy->edit_script->num) {
2544 sfree(copy->edit_script->num);
2545 }
2546 sfree(copy->edit_script);
2547 }
2548 }
2549 {
2550 if (copy->fwd_prelim_tback != NULL) {
2551 if (copy->fwd_prelim_tback->edit_ops) {
2552 sfree(copy->fwd_prelim_tback->edit_ops);
2553 }
2554 sfree(copy->fwd_prelim_tback);
2555 }
2556 }
2557 {
2558 if (copy->rev_prelim_tback != NULL) {
2559 if (copy->rev_prelim_tback->edit_ops) {
2560 sfree(copy->rev_prelim_tback->edit_ops);
2561 }
2562 sfree(copy->rev_prelim_tback);
2563 }
2564 }
2565 {
2566 if (copy->greedy_align_mem != NULL) {
2567 sfree(copy->greedy_align_mem);
2568 }
2569 }
2570 {
2571 if (copy->dp_mem != NULL) {
2572 sfree(copy->dp_mem);
2573 }
2574 }
2575 {
2576 if (copy->sbp != NULL) {
2577 sfree(copy->sbp);
2578 }
2579 }
2580 sfree(copy);
2581 }
2582
2583 /**
2584 * Create a "deep" copy of a BlastGapAlignStruct structure.
2585 *
2586 * Non-pointer structure members are copied. Pointers to data which will
2587 * only be read are copied. For data which will be changing, memory for copies
2588 * will be allocated and new pointers will be assigned to them. The process
2589 * repeats down the structure hierarchy until all pointers are dealt with.
2590 *
2591 * @param orig Pointer to BlastGapAlignStruct structure to be copied
2592 * @param sbp Pointer to BlastScoreBlk structure, required to set copy->sbp
2593 *
2594 * @return Pointer to copy of original BlastGapAlignStruct structure
2595 */
s_BlastGapAlignStruct_Copy(BlastGapAlignStruct * orig,BlastScoreBlk * sbp)2596 static BlastGapAlignStruct* s_BlastGapAlignStruct_Copy(
2597 BlastGapAlignStruct* orig,
2598 BlastScoreBlk* sbp
2599 )
2600 {
2601 BlastGapAlignStruct* copy =
2602 (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
2603
2604 // Copy plain old data (ints, doubles, booleans, ...).
2605 // Any pointer members will be processed separately.
2606 memcpy(copy, orig, sizeof(BlastGapAlignStruct));
2607
2608 {
2609 GapStateArrayStruct* o = orig->state_struct;
2610 if (o != NULL) {
2611 GapStateArrayStruct* c = (GapStateArrayStruct*) calloc(
2612 1,
2613 sizeof(GapStateArrayStruct)
2614 );
2615 copy->state_struct = c;
2616 memcpy(c, o, sizeof(GapStateArrayStruct));
2617 c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2618 int i;
2619 for (i = 0; i < c->length; ++i) {
2620 c->state_array[i] = o->state_array[i];
2621 }
2622 while (o->next != NULL) {
2623 c->next = (GapStateArrayStruct*)
2624 calloc(1, sizeof(GapStateArrayStruct));
2625 c = c->next;
2626 o = o->next;
2627 memcpy(c, o, sizeof(GapStateArrayStruct));
2628 c->state_array = (Uint1*) calloc(c->length, sizeof(Uint1));
2629 int i;
2630 for (i = 0; i < c->length; ++i) {
2631 c->state_array[i] = o->state_array[i];
2632 }
2633 }
2634 }
2635 }
2636 {
2637 GapEditScript* o = orig->edit_script;
2638 if (o != NULL) {
2639 GapEditScript* c = (GapEditScript*) calloc(
2640 1,
2641 sizeof(GapEditScript)
2642 );
2643 copy->edit_script = c;
2644 memcpy(c, o, sizeof(GapEditScript));
2645 c->op_type = (EGapAlignOpType*) calloc(
2646 o->size,
2647 sizeof(EGapAlignOpType)
2648 );
2649 c->num = (Int4*) calloc(o->size, sizeof(Int4));
2650 int i;
2651 for (i = 0; i < o->size; ++i) {
2652 c->op_type[i] = o->op_type[i];
2653 c->num[i] = o->num[i];
2654 }
2655 }
2656 }
2657 {
2658 GapPrelimEditBlock* o = orig->fwd_prelim_tback;
2659 if (o != NULL) {
2660 GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2661 1,
2662 sizeof(GapPrelimEditBlock)
2663 );
2664 copy->fwd_prelim_tback = c;
2665 memcpy(c, o, sizeof(GapPrelimEditBlock));
2666 c->edit_ops = calloc(
2667 o->num_ops_allocated,
2668 sizeof(GapPrelimEditScript)
2669 );
2670 int i;
2671 for (i = 0; i < o->num_ops_allocated; ++i) {
2672 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2673 c->edit_ops[i].num = o->edit_ops[i].num;
2674 }
2675 }
2676 }
2677 {
2678 GapPrelimEditBlock* o = orig->rev_prelim_tback;
2679 if (o != NULL) {
2680 GapPrelimEditBlock* c = (GapPrelimEditBlock*) calloc(
2681 1,
2682 sizeof(GapPrelimEditBlock)
2683 );
2684 copy->rev_prelim_tback = c;
2685 memcpy(c, o, sizeof(GapPrelimEditBlock));
2686 c->edit_ops = calloc(
2687 o->num_ops_allocated,
2688 sizeof(GapPrelimEditScript)
2689 );
2690 int i;
2691 for (i = 0; i < o->num_ops_allocated; ++i) {
2692 c->edit_ops[i].op_type = o->edit_ops[i].op_type;
2693 c->edit_ops[i].num = o->edit_ops[i].num;
2694 }
2695 }
2696 }
2697 {
2698 SGreedyAlignMem* o = orig->greedy_align_mem;
2699 if (o != NULL) {
2700 SGreedyAlignMem* c = (SGreedyAlignMem*) calloc(
2701 1,
2702 sizeof(SGreedyAlignMem)
2703 );
2704 copy->greedy_align_mem = c;
2705 memcpy(c, o, sizeof(SGreedyAlignMem));
2706 }
2707 }
2708 {
2709 BlastGapDP* o = orig->dp_mem;
2710 if (o != NULL) {
2711 BlastGapDP* c = (BlastGapDP*) calloc(
2712 orig->dp_mem_alloc,
2713 sizeof(BlastGapDP)
2714 );
2715 copy->dp_mem = c;
2716 memcpy(c, o, orig->dp_mem_alloc * sizeof(BlastGapDP));
2717 }
2718 }
2719 {
2720 copy->sbp = sbp;
2721 }
2722
2723 return copy;
2724 }
2725
2726 /**
2727 * Free a BlastScoreBlk copy created by s_BlastScoreBlk_Copy
2728 *
2729 * BlastScoreBlk* pointer "bsb_ptr" should be passed as (&bsb_ptr);
2730 * this function will set bsb_ptr to NULL before returning.
2731 *
2732 * @param copy Pointer to (pointer to BlastScoreBlk to be freed)
2733 */
2734 static
s_BlastScoreBlk_Free(BlastScoreBlk ** copy)2735 void s_BlastScoreBlk_Free(BlastScoreBlk** copy)
2736 {
2737 BlastScoreBlkFree(*copy);
2738 *copy = NULL;
2739 }
2740
2741 /**
2742 * Create a "deep" copy of a BlastScoreBlk structure.
2743 *
2744 * Non-pointer structure members are copied. Pointers to data which will
2745 * only be read are copied. For data which will be changing, memory for copies
2746 * will be allocated and new pointers will be assigned to them. The process
2747 * repeats down the structure hierarchy until all pointers are dealt with.
2748 *
2749 * @param program The program type
2750 * @param orig Pointer to BlastScoreBlk structure to be copied
2751 * @param alphabet_code Alphabet code
2752 * @param number_of_contexts Number of contexts
2753 *
2754 * @return Pointer to copy of original BlastScoreBlk structure
2755 */
2756 static
s_BlastScoreBlk_Copy(EBlastProgramType program,BlastScoreBlk * orig,Uint1 alphabet_code,Int4 number_of_contexts)2757 BlastScoreBlk* s_BlastScoreBlk_Copy(
2758 EBlastProgramType program,
2759 BlastScoreBlk* orig,
2760 Uint1 alphabet_code,
2761 Int4 number_of_contexts
2762 )
2763 {
2764 BlastScoreBlk* copy = BlastScoreBlkNew(
2765 orig->alphabet_code,
2766 orig->number_of_contexts
2767 );
2768 if (copy == NULL) {
2769 return NULL;
2770 }
2771
2772 copy->alphabet_start = orig->alphabet_start;
2773 copy->name = strdup(orig->name);
2774 copy->comments = orig->comments;
2775 /* Deep-copy orig->matrix */
2776 if (orig->matrix != NULL) {
2777 if (copy->matrix == NULL) {
2778 return BlastScoreBlkFree(copy);
2779 }
2780 SBlastScoreMatrix* m = copy->matrix;
2781 if (m->data != NULL && orig->matrix->data != NULL) {
2782 int i;
2783 for (i = 0; i < orig->matrix->ncols; ++i) {
2784 memcpy(
2785 m->data[i],
2786 orig->matrix->data[i],
2787 m->nrows * sizeof(int)
2788 );
2789 }
2790 }
2791 if (m->freqs != NULL && orig->matrix->freqs != NULL) {
2792 memcpy(
2793 m->freqs,
2794 orig->matrix->freqs,
2795 m->ncols * sizeof(double)
2796 );
2797 }
2798 m->lambda = orig->matrix->lambda;
2799 }
2800 /* Deep-copy orig->psi_matrix */
2801 if (orig->psi_matrix != NULL
2802 && orig->psi_matrix->pssm != NULL) {
2803 copy->psi_matrix = SPsiBlastScoreMatrixNew(orig->psi_matrix->pssm->ncols);
2804 if (copy->psi_matrix == NULL) {
2805 return BlastScoreBlkFree(copy);
2806 }
2807 SPsiBlastScoreMatrix* pm = copy->psi_matrix;
2808 SBlastScoreMatrix* m = pm->pssm;
2809 if (m->data != NULL && orig->psi_matrix->pssm->data != NULL) {
2810 int i;
2811 for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2812 memcpy(
2813 m->data[i],
2814 orig->psi_matrix->pssm->data[i],
2815 m->nrows * sizeof(int)
2816 );
2817 }
2818 }
2819 if (m->freqs != NULL
2820 && orig->psi_matrix->pssm->freqs != NULL) {
2821 memcpy(
2822 m->freqs,
2823 orig->psi_matrix->pssm->freqs,
2824 m->ncols * sizeof(double)
2825 );
2826 }
2827 m->lambda = orig->psi_matrix->pssm->lambda;
2828 if (pm->freq_ratios != NULL
2829 && orig->psi_matrix->freq_ratios != NULL) {
2830 int i;
2831 for (i = 0; i < orig->psi_matrix->pssm->ncols; ++i) {
2832 memcpy(
2833 pm->freq_ratios[i],
2834 orig->psi_matrix->freq_ratios[i],
2835 orig->psi_matrix->pssm->nrows * sizeof(double)
2836 );
2837 }
2838 }
2839 if (orig->psi_matrix->kbp != NULL) {
2840 memcpy(pm->kbp, orig->psi_matrix->kbp, sizeof(Blast_KarlinBlk));
2841 }
2842 }
2843 copy->matrix_only_scoring = orig->matrix_only_scoring;
2844 copy->complexity_adjusted_scoring = orig->complexity_adjusted_scoring;
2845 copy->loscore = orig->loscore;
2846 copy->hiscore = orig->hiscore;
2847 copy->penalty = orig->penalty;
2848 copy->reward = orig->reward;
2849 copy->read_in_matrix = orig->read_in_matrix;
2850 if (Blast_QueryIsPssm(program)) {
2851 copy->kbp = copy->kbp_psi;
2852 copy->kbp_gap = copy->kbp_gap_psi;
2853 } else {
2854 copy->kbp = copy->kbp_std;
2855 copy->kbp_gap = copy->kbp_gap_std;
2856 }
2857 if (orig->gbp != NULL) {
2858 memcpy(copy->gbp, orig->gbp, sizeof(Blast_GumbelBlk));
2859 }
2860 int ctx;
2861 for (ctx = 0; ctx < orig->number_of_contexts; ++ctx) {
2862 if (orig->sfp != NULL && orig->sfp[ctx] != NULL) {
2863 copy->sfp[ctx] = Blast_ScoreFreqNew(
2864 orig->sfp[ctx]->score_min,
2865 orig->sfp[ctx]->score_max
2866 );
2867 if (copy->sfp[ctx] == NULL) {
2868 return BlastScoreBlkFree(copy);
2869 }
2870 copy->sfp[ctx]->obs_min = orig->sfp[ctx]->obs_min;
2871 copy->sfp[ctx]->obs_max = orig->sfp[ctx]->obs_max;
2872 copy->sfp[ctx]->score_avg = orig->sfp[ctx]->score_avg;
2873 int r = orig->sfp[ctx]->score_max - orig->sfp[ctx]->score_min + 1;
2874 memcpy(
2875 copy->sfp[ctx]->sprob0,
2876 orig->sfp[ctx]->sprob0,
2877 r * sizeof(double)
2878 );
2879 }
2880 if (orig->kbp_std != NULL && orig->kbp_std[ctx] != NULL) {
2881 copy->kbp_std[ctx] = Blast_KarlinBlkNew();
2882 if (Blast_KarlinBlkCopy(copy->kbp_std[ctx], orig->kbp_std[ctx]) != 0) {
2883 return BlastScoreBlkFree(copy);
2884 }
2885 }
2886 if (orig->kbp_gap_std != NULL && orig->kbp_gap_std[ctx] != NULL) {
2887 copy->kbp_gap_std[ctx] = Blast_KarlinBlkNew();
2888 if (Blast_KarlinBlkCopy(copy->kbp_gap_std[ctx], orig->kbp_gap_std[ctx]) != 0) {
2889 return BlastScoreBlkFree(copy);
2890 }
2891 }
2892 if (orig->kbp_psi != NULL && orig->kbp_psi[ctx] != NULL) {
2893 copy->kbp_psi[ctx] = Blast_KarlinBlkNew();
2894 if (Blast_KarlinBlkCopy(copy->kbp_psi[ctx], orig->kbp_psi[ctx]) != 0) {
2895 return BlastScoreBlkFree(copy);
2896 }
2897 }
2898 if (orig->kbp_gap_psi != NULL && orig->kbp_gap_psi[ctx] != NULL) {
2899 copy->kbp_gap_psi[ctx] = Blast_KarlinBlkNew();
2900 if (Blast_KarlinBlkCopy(copy->kbp_gap_psi[ctx], orig->kbp_gap_psi[ctx]) != 0) {
2901 return BlastScoreBlkFree(copy);
2902 }
2903 }
2904 if (Blast_QueryIsPssm(program)) {
2905 copy->kbp[ctx] = copy->kbp_psi[ctx];
2906 copy->kbp_gap[ctx] = copy->kbp_gap_psi[ctx];
2907 } else {
2908 copy->kbp[ctx] = copy->kbp_std[ctx];
2909 copy->kbp_gap[ctx] = copy->kbp_gap_std[ctx];
2910 }
2911 }
2912 if (orig->kbp_ideal != NULL) {
2913 copy->kbp_ideal = Blast_KarlinBlkNew();
2914 if (Blast_KarlinBlkCopy(copy->kbp_ideal, orig->kbp_ideal) != 0) {
2915 return BlastScoreBlkFree(copy);
2916 }
2917 }
2918 copy->ambiguous_res = (Uint1*) calloc(orig->ambig_size, sizeof(Uint1));
2919 if (orig->ambiguous_res != NULL) {
2920 memcpy(copy->ambiguous_res, orig->ambiguous_res, orig->ambig_size);
2921 }
2922 copy->ambig_size = orig->ambig_size;
2923 copy->ambig_occupy = orig->ambig_occupy;
2924 copy->round_down = orig->round_down;
2925
2926 return copy;
2927 }
2928
2929 /**
2930 * Recompute alignments for each match found by the gapped BLAST
2931 * algorithm. Single-thread adapter to Blast_RedoAlignmentCore_MT.
2932 */
2933 Int2
Blast_RedoAlignmentCore(EBlastProgramType program_number,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2934 Blast_RedoAlignmentCore(EBlastProgramType program_number,
2935 BLAST_SequenceBlk * queryBlk,
2936 const BlastQueryInfo* queryInfo,
2937 BlastScoreBlk* sbp,
2938 BLAST_SequenceBlk * subjectBlk,
2939 const BlastSeqSrc* seqSrc,
2940 Int4 default_db_genetic_code,
2941 BlastHSPList * thisMatch,
2942 BlastHSPStream* hsp_stream,
2943 BlastScoringParameters* scoringParams,
2944 const BlastExtensionParameters* extendParams,
2945 const BlastHitSavingParameters* hitParams,
2946 const PSIBlastOptions* psiOptions,
2947 BlastHSPResults* results)
2948 {
2949 return Blast_RedoAlignmentCore_MT(
2950 program_number,
2951 1, /* number of threads */
2952 queryBlk,
2953 queryInfo,
2954 sbp,
2955 subjectBlk,
2956 seqSrc,
2957 default_db_genetic_code,
2958 thisMatch,
2959 hsp_stream,
2960 scoringParams,
2961 extendParams,
2962 hitParams,
2963 psiOptions,
2964 results
2965 );
2966 }
2967
2968 /**
2969 * Recompute alignments for each match found by the gapped BLAST
2970 * algorithm.
2971 */
2972 Int2
Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,Uint4 num_threads,BLAST_SequenceBlk * queryBlk,const BlastQueryInfo * queryInfo,BlastScoreBlk * sbp,BLAST_SequenceBlk * subjectBlk,const BlastSeqSrc * seqSrc,Int4 default_db_genetic_code,BlastHSPList * thisMatch,BlastHSPStream * hsp_stream,BlastScoringParameters * scoringParams,const BlastExtensionParameters * extendParams,const BlastHitSavingParameters * hitParams,const PSIBlastOptions * psiOptions,BlastHSPResults * results)2973 Blast_RedoAlignmentCore_MT(EBlastProgramType program_number,
2974 Uint4 num_threads,
2975 BLAST_SequenceBlk * queryBlk,
2976 const BlastQueryInfo* queryInfo,
2977 BlastScoreBlk* sbp,
2978 BLAST_SequenceBlk * subjectBlk,
2979 const BlastSeqSrc* seqSrc,
2980 Int4 default_db_genetic_code,
2981 BlastHSPList * thisMatch,
2982 BlastHSPStream* hsp_stream,
2983 BlastScoringParameters* scoringParams,
2984 const BlastExtensionParameters* extendParams,
2985 const BlastHitSavingParameters* hitParams,
2986 const PSIBlastOptions* psiOptions,
2987 BlastHSPResults* results)
2988 {
2989 int status_code = 0; /* return value code */
2990 /* the factor by which to scale the scoring system in order to
2991 * obtain greater precision */
2992 double localScalingFactor;
2993 /* forbidden ranges for each database position (used in
2994 * Smith-Waterman alignments) */
2995 Blast_ForbiddenRanges forbidden = {0,};
2996 /* a collection of alignments for each query sequence with
2997 * sequences from the database */
2998 BlastCompo_Heap* redoneMatches = NULL;
2999 /* stores all fields needed for computing a compositionally
3000 * adjusted score matrix using Newton's method */
3001 Blast_CompositionWorkspace** NRrecord_tld = NULL;
3002 /* loop index */
3003 int query_index;
3004 /* number of queries in the concatenated query */
3005 int numQueries = queryInfo->num_queries;
3006 /* number of contexts in the concatenated query */
3007 int numContexts = queryInfo->last_context + 1;
3008 /* number of contexts within a query */
3009 int numFrames = (program_number == eBlastTypeBlastx) ? 6:1;
3010 /* keeps track of gapped alignment params */
3011 BlastGapAlignStruct* gapAlign = NULL;
3012 /* the values of the search parameters that will be recorded, altered
3013 * in the search structure in this routine, and then restored before
3014 * the routine exits. */
3015 BlastKappa_SavedParameters *savedParams = NULL;
3016 /* All alignments above this value will be reported, no matter how many. */
3017 double inclusion_ethresh;
3018
3019 BlastHSPResults* local_results = NULL;
3020
3021 BlastCompo_QueryInfo** query_info_tld = NULL;
3022 int* numContexts_tld = NULL;
3023 int* compositionTestIndex_tld = NULL;
3024 Blast_RedoAlignParams** redo_align_params_tld = NULL;
3025 BLAST_SequenceBlk** subjectBlk_tld = NULL;
3026 Boolean positionBased = (Boolean) (sbp->psi_matrix != NULL);
3027 ECompoAdjustModes compo_adjust_mode =
3028 (ECompoAdjustModes) extendParams->options->compositionBasedStats;
3029 Boolean smithWaterman =
3030 (Boolean) (extendParams->options->eTbackExt == eSmithWatermanTbck);
3031
3032 /* which test function do we use to see if a composition-adjusted
3033 p-value is desired; value needs to be passed in eventually*/
3034 int compositionTestIndex = extendParams->options->unifiedP;
3035 Uint1* genetic_code_string = GenCodeSingletonFind(default_db_genetic_code);
3036
3037 ASSERT(program_number == eBlastTypeBlastp ||
3038 program_number == eBlastTypeTblastn ||
3039 program_number == eBlastTypeBlastx ||
3040 program_number == eBlastTypePsiBlast ||
3041 program_number == eBlastTypeRpsBlast ||
3042 program_number == eBlastTypeRpsTblastn);
3043
3044 if (0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") &&
3045 compo_adjust_mode == eNoCompositionBasedStats) {
3046 return -1; /* BLOSUM62_20 only makes sense if
3047 * compo_adjust_mode is on */
3048 }
3049 if (positionBased) {
3050 /* Position based searches can only use traditional
3051 * composition based stats */
3052 if ((int) compo_adjust_mode > 1) {
3053 compo_adjust_mode = eCompositionBasedStats;
3054 }
3055 /* A position-based search can only have one query */
3056 ASSERT(queryInfo->num_queries == 1);
3057 ASSERT(queryBlk->length == (Int4)sbp->psi_matrix->pssm->ncols);
3058 }
3059
3060 if ((int) compo_adjust_mode > 1 &&
3061 !Blast_FrequencyDataIsAvailable(scoringParams->options->matrix)) {
3062 return -1; /* Unsupported matrix */
3063 }
3064 /*****************/
3065 inclusion_ethresh = (psiOptions /* this can be NULL for CBl2Seq */
3066 ? psiOptions->inclusion_ethresh
3067 : PSI_INCLUSION_ETHRESH);
3068 ASSERT(inclusion_ethresh != 0.0);
3069
3070 int actual_num_threads = 1;
3071 #ifdef _OPENMP
3072 actual_num_threads = num_threads;
3073 #endif
3074
3075 /* Initialize savedParams */
3076 savedParams =
3077 s_SavedParametersNew(queryInfo->max_length, numContexts,
3078 compo_adjust_mode, positionBased);
3079 if (savedParams == NULL) {
3080 status_code = -1;
3081 goto function_cleanup;
3082 }
3083 status_code =
3084 s_RecordInitialSearch(savedParams, sbp, scoringParams,
3085 queryInfo->max_length, compo_adjust_mode,
3086 positionBased);
3087 if (status_code != 0) {
3088 goto function_cleanup;
3089 }
3090
3091 if (compo_adjust_mode != eNoCompositionBasedStats) {
3092 if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) {
3093 localScalingFactor = SCALING_FACTOR / 10;
3094 } else {
3095 localScalingFactor = SCALING_FACTOR;
3096 }
3097 } else {
3098 localScalingFactor = 1.0;
3099 }
3100 s_RescaleSearch(sbp, scoringParams, numContexts, localScalingFactor);
3101 status_code =
3102 BLAST_GapAlignStructNew(scoringParams, extendParams,
3103 (seqSrc) ? BlastSeqSrcGetMaxSeqLen(seqSrc)
3104 : subjectBlk->length,
3105 sbp, &gapAlign);
3106 if (status_code != 0) {
3107 return (Int2) status_code;
3108 }
3109
3110 if(smithWaterman) {
3111 status_code =
3112 Blast_ForbiddenRangesInitialize(&forbidden, queryInfo->max_length);
3113 if (status_code != 0) {
3114 goto function_cleanup;
3115 }
3116 }
3117 redoneMatches = calloc(numQueries, sizeof(BlastCompo_Heap));
3118 if (redoneMatches == NULL) {
3119 status_code = -1;
3120 goto function_cleanup;
3121 }
3122 for (query_index = 0; query_index < numQueries; query_index++) {
3123 status_code =
3124 BlastCompo_HeapInitialize(&redoneMatches[query_index],
3125 hitParams->options->hitlist_size,
3126 inclusion_ethresh);
3127 if (status_code != 0) {
3128 goto function_cleanup;
3129 }
3130 }
3131
3132 BlastCompo_Heap** redoneMatches_tld =
3133 (BlastCompo_Heap**) calloc(
3134 actual_num_threads,
3135 sizeof(BlastCompo_Heap*)
3136 );
3137 BlastCompo_Alignment*** alignments_tld =
3138 (BlastCompo_Alignment***) calloc(
3139 actual_num_threads,
3140 sizeof(BlastCompo_Alignment**)
3141 );
3142 BlastCompo_Alignment*** incoming_align_set_tld =
3143 (BlastCompo_Alignment***) calloc(
3144 actual_num_threads,
3145 sizeof(BlastCompo_Alignment**)
3146 );
3147 BlastKappa_SavedParameters** savedParams_tld =
3148 (BlastKappa_SavedParameters**) calloc(
3149 actual_num_threads,
3150 sizeof(BlastKappa_SavedParameters*)
3151 );
3152 BlastScoreBlk** sbp_tld =
3153 (BlastScoreBlk**) calloc(
3154 actual_num_threads,
3155 sizeof(BlastScoreBlk*)
3156 );
3157 BlastKappa_GappingParamsContext* gapping_params_context_tld =
3158 (BlastKappa_GappingParamsContext*) calloc(
3159 actual_num_threads,
3160 sizeof(BlastKappa_GappingParamsContext)
3161 );
3162 Int4*** matrix_tld =
3163 (Int4***) calloc(
3164 actual_num_threads,
3165 sizeof(Int4**)
3166 );
3167
3168 NRrecord_tld =
3169 (Blast_CompositionWorkspace**) calloc(
3170 actual_num_threads,
3171 sizeof(Blast_CompositionWorkspace*)
3172 );
3173
3174 subjectBlk_tld =
3175 (BLAST_SequenceBlk**) calloc(
3176 actual_num_threads,
3177 sizeof(BLAST_SequenceBlk*)
3178 );
3179 redo_align_params_tld =
3180 (Blast_RedoAlignParams**) calloc(
3181 actual_num_threads,
3182 sizeof(Blast_RedoAlignParams*)
3183 );
3184 int* status_code_tld =
3185 (int*) calloc(
3186 actual_num_threads,
3187 sizeof(int)
3188 );
3189 BlastSeqSrc** seqsrc_tld =
3190 (BlastSeqSrc**) calloc(
3191 actual_num_threads,
3192 sizeof(BlastSeqSrc*)
3193 );
3194 BlastGapAlignStruct** gap_align_tld =
3195 (BlastGapAlignStruct**) calloc(
3196 actual_num_threads,
3197 sizeof(BlastGapAlignStruct*)
3198 );
3199 BlastScoringParameters** score_params_tld =
3200 (BlastScoringParameters**) calloc(
3201 actual_num_threads,
3202 sizeof(BlastScoringParameters*)
3203 );
3204 BlastHitSavingParameters** hit_params_tld =
3205 (BlastHitSavingParameters**) calloc(
3206 actual_num_threads,
3207 sizeof(BlastHitSavingParameters*)
3208 );
3209 BlastHSPResults** results_tld =
3210 (BlastHSPResults**) calloc(
3211 actual_num_threads,
3212 sizeof(BlastHSPResults*)
3213 );
3214 query_info_tld =
3215 (BlastCompo_QueryInfo**) calloc(
3216 actual_num_threads,
3217 sizeof(BlastCompo_QueryInfo*)
3218 );
3219 numContexts_tld =
3220 (int*) calloc(
3221 actual_num_threads,
3222 sizeof(int)
3223 );
3224 compositionTestIndex_tld =
3225 (int*) calloc(
3226 actual_num_threads,
3227 sizeof(int)
3228 );
3229
3230 int i;
3231 for (i = 0; i < actual_num_threads; ++i) {
3232 query_info_tld[i] = s_GetQueryInfo(
3233 queryBlk->sequence,
3234 queryInfo,
3235 (program_number == eBlastTypeBlastx)
3236 );
3237 if (query_info_tld[i] == NULL) {
3238 status_code = -1;
3239 goto function_cleanup;
3240 }
3241
3242 sbp_tld[i] = s_BlastScoreBlk_Copy(
3243 program_number,
3244 sbp,
3245 sbp->alphabet_code,
3246 sbp->number_of_contexts
3247 );
3248
3249 numContexts_tld[i] = numContexts;
3250 compositionTestIndex_tld[i] = compositionTestIndex;
3251 seqsrc_tld[i] = BlastSeqSrcCopy(seqSrc);
3252 gap_align_tld[i] =
3253 s_BlastGapAlignStruct_Copy(gapAlign, sbp_tld[i]);
3254 score_params_tld[i] = scoringParams;
3255 hit_params_tld[i] = (BlastHitSavingParameters*) hitParams;
3256 results_tld[i] =
3257 Blast_HSPResultsNew(queryInfo->num_queries);
3258 subjectBlk_tld[i] = subjectBlk;
3259
3260 redoneMatches_tld[i] =
3261 (BlastCompo_Heap*) calloc(numQueries, sizeof(BlastCompo_Heap));
3262 if (redoneMatches_tld[i] == NULL) {
3263 status_code = -1;
3264 goto function_cleanup;
3265 }
3266 for (query_index = 0; query_index < numQueries; query_index++) {
3267 status_code =
3268 BlastCompo_HeapInitialize(&redoneMatches_tld[i][query_index],
3269 hitParams->options->hitlist_size,
3270 inclusion_ethresh);
3271 if (status_code != 0) {
3272 goto function_cleanup;
3273 }
3274 }
3275
3276 alignments_tld[i] = (BlastCompo_Alignment**) calloc(
3277 numContexts,
3278 sizeof(BlastCompo_Alignment*)
3279 );
3280 incoming_align_set_tld[i] = (BlastCompo_Alignment**) calloc(
3281 numFrames,
3282 sizeof(BlastCompo_Alignment*)
3283 );
3284
3285 savedParams_tld[i] = s_SavedParametersNew(
3286 queryInfo->max_length,
3287 numContexts,
3288 compo_adjust_mode,
3289 positionBased
3290 );
3291 if (savedParams_tld[i] == NULL) {
3292 status_code = -1;
3293 goto function_cleanup;
3294 }
3295 status_code = s_RecordInitialSearch(
3296 savedParams_tld[i],
3297 sbp,
3298 scoringParams,
3299 queryInfo->max_length,
3300 compo_adjust_mode,
3301 positionBased
3302 );
3303 if (status_code != 0) {
3304 goto function_cleanup;
3305 }
3306
3307 if ((int) compo_adjust_mode > 1 && !positionBased) {
3308 NRrecord_tld[i] = Blast_CompositionWorkspaceNew();
3309 status_code = Blast_CompositionWorkspaceInit(
3310 NRrecord_tld[i],
3311 scoringParams->options->matrix
3312 );
3313 if (status_code != 0) {
3314 goto function_cleanup;
3315 }
3316 }
3317
3318 gapping_params_context_tld[i].gap_align = gap_align_tld[i];
3319 gapping_params_context_tld[i].scoringParams = score_params_tld[i];
3320 gapping_params_context_tld[i].sbp = sbp_tld[i];
3321 gapping_params_context_tld[i].localScalingFactor = localScalingFactor;
3322 gapping_params_context_tld[i].prog_number = program_number;
3323
3324 redo_align_params_tld[i] =
3325 s_GetAlignParams(
3326 &gapping_params_context_tld[i],
3327 queryBlk,
3328 queryInfo,
3329 hitParams,
3330 extendParams
3331 );
3332 if (redo_align_params_tld[i] == NULL) {
3333 status_code = -1;
3334 goto function_cleanup;
3335 }
3336
3337 if (positionBased) {
3338 matrix_tld[i] = sbp_tld[i]->psi_matrix->pssm->data;
3339 } else {
3340 matrix_tld[i] = sbp_tld[i]->matrix->data;
3341 }
3342 /**** Validate parameters *************/
3343 if (matrix_tld[i] == NULL) {
3344 goto function_cleanup;
3345 }
3346 }
3347
3348 /*
3349 * There are two use cases here.
3350 * (1) hsp_stream == NULL, so single match is passed in thisMatch.
3351 * Also, seqSrc == NULL and subjectBlk are != NULL.
3352 * (2) hsp_stream != NULL, so one or more matches are taken from
3353 * hsp_stream, and thisMatch is (probably) NULL.
3354 * Also, seqSrc != NULL, subjectBlk and thisMatch are == NULL.
3355 */
3356 struct BlastHSPListLinkedList {
3357 BlastHSPList* match;
3358 struct BlastHSPListLinkedList* next;
3359 };
3360 typedef struct BlastHSPListLinkedList BlastHSPListLinkedList;
3361
3362 BlastHSPList** theseMatches = NULL;
3363 int numMatches = 0;
3364 if (hsp_stream == NULL) {
3365 theseMatches = (BlastHSPList**) calloc(1, sizeof(BlastHSPList*));
3366 *theseMatches = thisMatch;
3367 numMatches = 1;
3368 } else {
3369 BlastHSPList* localMatch = NULL;
3370 BlastHSPListLinkedList* head = NULL;
3371 BlastHSPListLinkedList* tail = NULL;
3372 /*
3373 * Collect matches from stream into linked list, counting them
3374 * along the way.
3375 */
3376 while (BlastHSPStreamRead(hsp_stream, &localMatch)
3377 != kBlastHSPStream_Eof) {
3378 BlastHSPListLinkedList* entry =
3379 (BlastHSPListLinkedList*) calloc(
3380 1,
3381 sizeof(BlastHSPListLinkedList)
3382 );
3383 entry->match = localMatch;
3384 if (head == NULL) {
3385 head = entry;
3386 } else {
3387 tail->next = entry;
3388 }
3389 tail = entry;
3390 ++numMatches;
3391 }
3392 /*
3393 * Convert linked list of matches into array.
3394 */
3395 theseMatches =
3396 (BlastHSPList**) calloc(numMatches, sizeof(BlastHSPList*));
3397 int i;
3398 for (i = 0; i < numMatches; ++i) {
3399 theseMatches[i] = head->match;
3400 BlastHSPListLinkedList* here = head;
3401 head = head->next;
3402 sfree(here);
3403 }
3404 }
3405
3406 Boolean interrupt = FALSE;
3407 #pragma omp parallel \
3408 default(none) num_threads(actual_num_threads) \
3409 if(actual_num_threads>1) \
3410 shared(interrupt, seqsrc_tld, score_params_tld, hit_params_tld, \
3411 gap_align_tld, results_tld, \
3412 redoneMatches_tld, \
3413 stderr, \
3414 numQueries, numMatches, theseMatches, \
3415 numFrames, program_number, subjectBlk_tld, positionBased, \
3416 default_db_genetic_code, localScalingFactor, queryInfo, \
3417 sbp, smithWaterman, compositionTestIndex_tld, forbidden, \
3418 NRrecord_tld, actual_num_threads, sbp_tld, \
3419 matrix_tld, query_info_tld, numContexts_tld, \
3420 genetic_code_string, queryBlk, compo_adjust_mode, \
3421 alignments_tld, incoming_align_set_tld, savedParams_tld, \
3422 scoringParams, redo_align_params_tld, \
3423 status_code_tld)
3424 {
3425 int b;
3426 #pragma omp for schedule(static)
3427 for (b = 0; b < numMatches; ++b) {
3428 #pragma omp flush(interrupt)
3429 if (!interrupt) {
3430 BlastCompo_Alignment** alignments = NULL;
3431 BlastCompo_Alignment** incoming_align_set = NULL;
3432 Blast_CompositionWorkspace* NRrecord = NULL;
3433 BlastCompo_QueryInfo* query_info = NULL;
3434
3435 int numAligns[6];
3436 Blast_KarlinBlk* kbp = NULL;
3437 BlastCompo_MatchingSequence matchingSeq = {0,};
3438 BlastHSPList* hsp_list = NULL;
3439 BlastCompo_Alignment* incoming_aligns = NULL;
3440 Blast_RedoAlignParams* redo_align_params;
3441 double best_evalue;
3442 Int4 best_score;
3443 int query_index;
3444 int context_index;
3445 int frame_index;
3446 void* discarded_aligns = NULL;
3447 BlastSeqSrc* seqSrc;
3448 BlastScoringParameters* scoringParams;
3449 BlastHitSavingParameters* hitParams;
3450 BlastCompo_Heap* redoneMatches;
3451 BlastScoreBlk* sbp;
3452 BLAST_SequenceBlk* subjectBlk;
3453 int numContexts;
3454 int compositionTestIndex;
3455 /* existing alignments for a match */
3456 Int4** matrix; /* score matrix */
3457 int* pStatusCode;
3458
3459 double pvalueForThisPair = (-1); /* p-value for this match
3460 for composition; -1 == no adjustment*/
3461 double LambdaRatio; /*lambda ratio*/
3462
3463 int tid = 0;
3464 #ifdef _OPENMP
3465 tid = omp_get_thread_num();
3466 #endif
3467 seqSrc = seqsrc_tld[tid];
3468 scoringParams = score_params_tld[tid];
3469 hitParams = hit_params_tld[tid];
3470 redoneMatches = redoneMatches_tld[tid];
3471 alignments = alignments_tld[tid];
3472 incoming_align_set = incoming_align_set_tld[tid];
3473 NRrecord = NRrecord_tld[tid];
3474 sbp = sbp_tld[tid];
3475 redo_align_params = redo_align_params_tld[tid];
3476 matrix = matrix_tld[tid];
3477 pStatusCode = &status_code_tld[tid];
3478 query_info = query_info_tld[tid];
3479 numContexts = numContexts_tld[tid];
3480 compositionTestIndex = compositionTestIndex_tld[tid];
3481 subjectBlk = subjectBlk_tld[tid];
3482
3483 BlastHSPList* localMatch = theseMatches[b];
3484
3485 if (localMatch->hsp_array == NULL) {
3486 if (seqSrc) {
3487 continue;
3488 }
3489 #pragma omp critical(intrpt)
3490 interrupt = TRUE;
3491 #pragma omp flush(interrupt)
3492 continue;
3493 }
3494
3495 if (BlastCompo_EarlyTermination(
3496 localMatch->best_evalue,
3497 redoneMatches,
3498 numQueries
3499 )) {
3500 Blast_HSPListFree(localMatch);
3501 if (seqSrc) {
3502 continue;
3503 }
3504 #pragma omp critical(intrpt)
3505 interrupt = TRUE;
3506 #pragma omp flush(interrupt)
3507 continue;
3508 }
3509
3510 query_index = localMatch->query_index;
3511 context_index = query_index * numFrames;
3512 /* Get the sequence for this match */
3513 if (seqSrc && BlastSeqSrcGetSupportsPartialFetching(seqSrc)) {
3514 BLAST_SetupPartialFetching(
3515 program_number,
3516 (BlastSeqSrc*) seqSrc,
3517 (const BlastHSPList**)&localMatch,
3518 1
3519 );
3520 }
3521
3522 if (subjectBlk) {
3523 matchingSeq.length = subjectBlk->length;
3524 matchingSeq.index = -1;
3525 matchingSeq.local_data = subjectBlk;
3526 } else {
3527 *pStatusCode = s_MatchingSequenceInitialize(
3528 &matchingSeq,
3529 program_number,
3530 seqSrc,
3531 default_db_genetic_code,
3532 localMatch->oid
3533 );
3534 if (*pStatusCode != 0) {
3535 /*
3536 * some sequences may have been excluded by membit filtering
3537 * so this is not really an exception
3538 */
3539 *pStatusCode = 0;
3540 goto match_loop_cleanup;
3541 }
3542 }
3543 *pStatusCode = s_ResultHspToDistinctAlign(
3544 incoming_align_set, /* o */
3545 numAligns, /* o */
3546 localMatch->hsp_array, /* i */
3547 localMatch->hspcnt, /* i */
3548 context_index, /* i */
3549 queryInfo, /* i */
3550 localScalingFactor /* i */
3551 );
3552 if (*pStatusCode != 0) {
3553 goto match_loop_cleanup;
3554 }
3555
3556 hsp_list = Blast_HSPListNew(0);
3557 for (frame_index = 0;
3558 frame_index < numFrames;
3559 frame_index++, context_index++) {
3560 incoming_aligns = incoming_align_set[frame_index];
3561 if (!incoming_aligns) {
3562 continue;
3563 }
3564 /*
3565 * All alignments in thisMatch should be to the same query
3566 */
3567 kbp = sbp->kbp_gap[context_index];
3568 if (smithWaterman) {
3569 *pStatusCode =
3570 Blast_RedoOneMatchSmithWaterman(
3571 alignments,
3572 redo_align_params,
3573 incoming_aligns,
3574 numAligns[frame_index],
3575 kbp->Lambda,
3576 kbp->logK,
3577 &matchingSeq,
3578 query_info,
3579 numQueries,
3580 matrix,
3581 BLASTAA_SIZE,
3582 NRrecord,
3583 &forbidden,
3584 redoneMatches,
3585 &pvalueForThisPair,
3586 compositionTestIndex,
3587 &LambdaRatio
3588 );
3589 } else {
3590 *pStatusCode =
3591 Blast_RedoOneMatch(
3592 alignments, // thread-local
3593 redo_align_params, // thread-local
3594 incoming_aligns, // thread-local
3595 numAligns[frame_index], // local
3596 kbp->Lambda, // thread-local
3597 &matchingSeq, // thread-local
3598 -1, // const
3599 query_info, // thread-local
3600 numContexts, // thread-local
3601 matrix, // thread-local
3602 BLASTAA_SIZE, // const
3603 NRrecord, // thread-local
3604 &pvalueForThisPair, // local
3605 compositionTestIndex, // thread-local
3606 &LambdaRatio // local
3607 );
3608 }
3609
3610 if (*pStatusCode != 0) {
3611 goto match_loop_cleanup;
3612 }
3613
3614 if (alignments[context_index] != NULL) {
3615 Int2 qframe = frame_index;
3616 if (program_number == eBlastTypeBlastx) {
3617 if (qframe < 3) {
3618 qframe++;
3619 } else {
3620 qframe = 2 - qframe;
3621 }
3622 }
3623 *pStatusCode =
3624 s_HSPListFromDistinctAlignments(hsp_list,
3625 &alignments[context_index],
3626 matchingSeq.index,
3627 queryInfo, qframe);
3628 if (*pStatusCode) {
3629 goto match_loop_cleanup;
3630 }
3631 }
3632 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3633 incoming_align_set[frame_index] = NULL;
3634 }
3635
3636 if (hsp_list->hspcnt > 1) {
3637 s_HitlistReapContained(hsp_list->hsp_array,
3638 &hsp_list->hspcnt);
3639 }
3640 *pStatusCode =
3641 s_HitlistEvaluateAndPurge(&best_score, &best_evalue,
3642 hsp_list,
3643 seqSrc,
3644 matchingSeq.length,
3645 program_number,
3646 queryInfo, context_index,
3647 sbp, hitParams,
3648 pvalueForThisPair, LambdaRatio,
3649 matchingSeq.index);
3650 if (*pStatusCode != 0) {
3651 goto query_loop_cleanup;
3652 }
3653 if (best_evalue <= hitParams->options->expect_value) {
3654 /* The best alignment is significant */
3655 s_HSPListNormalizeScores(hsp_list, kbp->Lambda, kbp->logK,
3656 localScalingFactor);
3657 s_ComputeNumIdentities(
3658 queryBlk,
3659 queryInfo,
3660 subjectBlk,
3661 seqSrc,
3662 hsp_list,
3663 scoringParams->options,
3664 genetic_code_string,
3665 sbp
3666 );
3667 if (!seqSrc) {
3668 goto query_loop_cleanup;
3669 }
3670 if (BlastCompo_HeapWouldInsert(
3671 &redoneMatches[query_index],
3672 best_evalue,
3673 best_score,
3674 localMatch->oid
3675 )) {
3676 *pStatusCode =
3677 BlastCompo_HeapInsert(
3678 &redoneMatches[query_index],
3679 hsp_list,
3680 best_evalue,
3681 best_score,
3682 localMatch->oid,
3683 &discarded_aligns
3684 );
3685 if (*pStatusCode == 0) {
3686 hsp_list = NULL;
3687 }
3688 } else {
3689 hsp_list = Blast_HSPListFree(hsp_list);
3690 }
3691
3692 if (*pStatusCode) {
3693 goto query_loop_cleanup;
3694 }
3695 if (discarded_aligns != NULL) {
3696 Blast_HSPListFree(discarded_aligns);
3697 }
3698 }
3699 query_loop_cleanup:
3700 match_loop_cleanup:
3701 if (seqSrc) {
3702 localMatch = Blast_HSPListFree(localMatch);
3703 } else {
3704 Blast_HSPListSwap(localMatch, hsp_list);
3705 localMatch->oid = hsp_list->oid;
3706 }
3707 hsp_list = Blast_HSPListFree(hsp_list);
3708
3709 if (*pStatusCode != 0) {
3710 for (context_index = 0;
3711 context_index < numContexts;
3712 context_index++) {
3713 BlastCompo_AlignmentsFree(
3714 &alignments[context_index],
3715 s_FreeEditScript
3716 );
3717 }
3718 }
3719 s_MatchingSequenceRelease(&matchingSeq);
3720 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
3721 if (*pStatusCode != 0 || !seqSrc) {
3722 #pragma omp critical(intrpt)
3723 interrupt = TRUE;
3724 #pragma omp flush(interrupt)
3725 continue;
3726 }
3727
3728 } /* end of if(!interrupt) */
3729 }
3730 #pragma omp barrier
3731
3732 /*
3733 * end of omp parallel section
3734 */
3735 }
3736
3737 function_cleanup:
3738
3739 for (i = 0; i < actual_num_threads; ++i) {
3740 if (status_code_tld[i] != 0) {
3741 status_code = status_code_tld[i];
3742 }
3743 }
3744 for (i = 0; i < actual_num_threads; ++i) {
3745 if (seqSrc && status_code == 0) {
3746 s_FillResultsFromCompoHeaps(
3747 results_tld[i],
3748 redoneMatches_tld[i],
3749 hitParams->options->hitlist_size
3750 );
3751 if (redoneMatches_tld[i] != NULL) {
3752 int qi;
3753 for (qi = 0; qi < numQueries; ++qi) {
3754 sfree(redoneMatches_tld[i][qi].array);
3755 sfree(redoneMatches_tld[i][qi].heapArray);
3756 }
3757 s_ClearHeap(redoneMatches_tld[i]);
3758 }
3759 } else {
3760 if (redoneMatches_tld[i] != NULL) {
3761 int qi;
3762 for (qi = 0; qi < numQueries; ++qi) {
3763 sfree(redoneMatches_tld[i][qi].array);
3764 sfree(redoneMatches_tld[i][qi].heapArray);
3765 }
3766 s_ClearHeap(redoneMatches_tld[i]);
3767 }
3768 }
3769 sfree(redoneMatches_tld[i]);
3770 }
3771 if (redoneMatches != NULL) {
3772 int qi;
3773 for (qi = 0; qi < numQueries; ++qi) {
3774 sfree(redoneMatches[qi].array);
3775 sfree(redoneMatches[qi].heapArray);
3776 }
3777 s_ClearHeap(redoneMatches);
3778 }
3779
3780 if (hsp_stream != NULL) {
3781 /* Reduce results from all threads and continue with business as usual */
3782 SThreadLocalDataArray* thread_data =
3783 SThreadLocalDataArrayNew(actual_num_threads);
3784 int i;
3785 for (i = 0; i < actual_num_threads; ++i) {
3786 SThreadLocalData* tdi = thread_data->tld[i];
3787 BlastHSPResults* rdi = results_tld[i];
3788 tdi->hit_params = hit_params_tld[i];
3789 hit_params_tld[i] = NULL;
3790 tdi->results =
3791 (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3792 tdi->results->num_queries = rdi->num_queries;
3793 tdi->results->hitlist_array =
3794 (BlastHitList**) calloc(
3795 tdi->results->num_queries,
3796 sizeof(BlastHitList*)
3797 );
3798 int j;
3799 for (j = 0; j < tdi->results->num_queries; ++j) {
3800 tdi->results->hitlist_array[j] = rdi->hitlist_array[j];
3801 rdi->hitlist_array[j] = NULL;
3802 }
3803 }
3804 local_results = SThreadLocalDataArrayConsolidateResults(thread_data);
3805 ASSERT(local_results);
3806
3807 /* post-traceback pipes */
3808 BlastHSPStreamTBackClose(hsp_stream, local_results);
3809
3810 for (i = 0; i < local_results->num_queries; ++i) {
3811 results->hitlist_array[i] = local_results->hitlist_array[i];
3812 local_results->hitlist_array[i] = NULL;
3813 }
3814 for (i = 0; i < actual_num_threads; ++i) {
3815 thread_data->tld[i]->hit_params = NULL;
3816 int j;
3817 for (j = 0; j < local_results->num_queries; ++j) {
3818 thread_data->tld[i]->results->hitlist_array[j] =
3819 Blast_HitListFree(
3820 thread_data->tld[i]->results->hitlist_array[j]
3821 );
3822 }
3823 sfree(thread_data->tld[i]->results->hitlist_array);
3824 sfree(thread_data->tld[i]->results);
3825 thread_data->tld[i] = SThreadLocalDataFree(thread_data->tld[i]);
3826 results_tld[i] = Blast_HSPResultsFree(results_tld[i]);
3827 s_FreeBlastCompo_QueryInfoArray(&query_info_tld[i], numContexts);
3828 }
3829 sfree(thread_data->tld);
3830 sfree(thread_data);
3831 Blast_HSPResultsFree(local_results);
3832 }
3833
3834 if (redoneMatches != NULL) {
3835 for (query_index = 0; query_index < numQueries; query_index++) {
3836 BlastCompo_HeapRelease(&redoneMatches[query_index]);
3837 }
3838 sfree(redoneMatches);
3839 redoneMatches = NULL;
3840 }
3841 if (smithWaterman) {
3842 Blast_ForbiddenRangesRelease(&forbidden);
3843 }
3844 if (gapAlign != NULL) {
3845 gapAlign = BLAST_GapAlignStructFree(gapAlign);
3846 }
3847 s_RestoreSearch(sbp, scoringParams, savedParams, queryBlk->length,
3848 positionBased, compo_adjust_mode);
3849 s_SavedParametersFree(&savedParams);
3850
3851 for (i = 0; i < actual_num_threads; ++i) {
3852 s_BlastScoreBlk_Free(&sbp_tld[i]);
3853 gap_align_tld[i]->sbp = NULL;
3854 s_BlastGapAlignStruct_Free(gap_align_tld[i]);
3855 Blast_RedoAlignParamsFree(&redo_align_params_tld[i]);
3856 sfree(alignments_tld[i]);
3857 sfree(incoming_align_set_tld[i]);
3858 Blast_CompositionWorkspaceFree(&NRrecord_tld[i]);
3859 s_SavedParametersFree(&savedParams_tld[i]);
3860 BlastSeqSrcFree(seqsrc_tld[i]);
3861 }
3862 sfree(alignments_tld);
3863 sfree(compositionTestIndex_tld);
3864 sfree(gap_align_tld);
3865 sfree(gapping_params_context_tld);
3866 sfree(hit_params_tld);
3867 sfree(incoming_align_set_tld);
3868 sfree(matrix_tld);
3869 sfree(NRrecord_tld);
3870 sfree(numContexts_tld);
3871 sfree(query_info_tld);
3872 sfree(redo_align_params_tld);
3873 sfree(redoneMatches_tld);
3874 sfree(results_tld);
3875 sfree(savedParams_tld);
3876 sfree(sbp_tld);
3877 sfree(score_params_tld);
3878 sfree(seqsrc_tld);
3879 sfree(status_code_tld);
3880 sfree(subjectBlk_tld);
3881 sfree(theseMatches);
3882
3883 return (Int2) status_code;
3884 }
3885
3886
3887
3888