1 /* $Id: blast_hits.c 594743 2019-10-09 11:00:47Z fongah2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 *
27 */
28
29 /** @file blast_hits.c
30 * BLAST functions for saving hits after the (preliminary) gapped alignment
31 */
32
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_hits.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_def.h>
37 #include <algo/blast/core/blast_hspstream.h>
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41
42
43 Int4
GetPrelimHitlistSize(Int4 hitlist_size,Int4 compositionBasedStats,Boolean gapped_calculation)44 GetPrelimHitlistSize(Int4 hitlist_size, Int4 compositionBasedStats, Boolean gapped_calculation)
45 {
46 Int4 prelim_hitlist_size = hitlist_size;
47 char * ADAPTIVE_CBS_ENV = getenv("ADAPTIVE_CBS");
48 if (compositionBasedStats) {
49 if(ADAPTIVE_CBS_ENV != NULL) {
50 if(hitlist_size < 1000) {
51 prelim_hitlist_size = MAX(prelim_hitlist_size + 1000, 1500);
52 }
53 else {
54 prelim_hitlist_size = prelim_hitlist_size*2 + 50;
55 }
56 }
57 else {
58 if(hitlist_size <= 500) {
59 prelim_hitlist_size = 1050;
60 }
61 else {
62 prelim_hitlist_size = prelim_hitlist_size*2 + 50;
63 }
64
65 }
66 }
67 else if (gapped_calculation) {
68 prelim_hitlist_size = MIN(MAX(2 * prelim_hitlist_size, 10), prelim_hitlist_size + 50);
69 }
70 return prelim_hitlist_size;
71 }
72
73
74 NCBI_XBLAST_EXPORT
SBlastHitsParametersNew(const BlastHitSavingOptions * hit_options,const BlastExtensionOptions * ext_options,const BlastScoringOptions * scoring_options,SBlastHitsParameters ** retval)75 Int2 SBlastHitsParametersNew(const BlastHitSavingOptions* hit_options,
76 const BlastExtensionOptions* ext_options,
77 const BlastScoringOptions* scoring_options,
78 SBlastHitsParameters* *retval)
79 {
80 ASSERT(retval);
81 *retval = NULL;
82
83 if (hit_options == NULL ||
84 ext_options == NULL ||
85 scoring_options == NULL)
86 return 1;
87
88 *retval = (SBlastHitsParameters*) malloc(sizeof(SBlastHitsParameters));
89 if (*retval == NULL)
90 return 2;
91
92 (*retval)->prelim_hitlist_size = GetPrelimHitlistSize(hit_options->hitlist_size,
93 ext_options->compositionBasedStats, scoring_options->gapped_calculation);
94
95 (*retval)->hsp_num_max = BlastHspNumMax(scoring_options->gapped_calculation, hit_options);
96
97 return 0;
98 }
99
100 SBlastHitsParameters*
SBlastHitsParametersDup(const SBlastHitsParameters * hit_params)101 SBlastHitsParametersDup(const SBlastHitsParameters* hit_params)
102 {
103 SBlastHitsParameters* retval = (SBlastHitsParameters*)
104 malloc(sizeof(SBlastHitsParameters));
105
106 if ( !retval ) {
107 return NULL;
108 }
109
110 memcpy((void*)retval, (void*) hit_params, sizeof(SBlastHitsParameters));
111 return retval;
112 }
113
114 NCBI_XBLAST_EXPORT
SBlastHitsParametersFree(SBlastHitsParameters * param)115 SBlastHitsParameters* SBlastHitsParametersFree(SBlastHitsParameters* param)
116 {
117 if (param)
118 {
119 sfree(param);
120 }
121 return NULL;
122 }
123
124
125
126 /********************************************************************************
127 Functions manipulating BlastHSP's
128 ********************************************************************************/
129
Blast_HSPFree(BlastHSP * hsp)130 BlastHSP* Blast_HSPFree(BlastHSP* hsp)
131 {
132 if (!hsp)
133 return NULL;
134 hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
135 hsp->map_info = BlastHSPMappingInfoFree(hsp->map_info);
136 sfree(hsp->pat_info);
137 sfree(hsp);
138 return NULL;
139 }
140
Blast_HSPNew(void)141 BlastHSP* Blast_HSPNew(void)
142 {
143 BlastHSP* new_hsp = (BlastHSP*) calloc(1, sizeof(BlastHSP));
144 return new_hsp;
145 }
146
147 /*
148 Comments in blast_hits.h
149 */
150 Int2
Blast_HSPInit(Int4 query_start,Int4 query_end,Int4 subject_start,Int4 subject_end,Int4 query_gapped_start,Int4 subject_gapped_start,Int4 query_context,Int2 query_frame,Int2 subject_frame,Int4 score,GapEditScript ** gap_edit,BlastHSP ** ret_hsp)151 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start,
152 Int4 subject_end, Int4 query_gapped_start,
153 Int4 subject_gapped_start, Int4 query_context,
154 Int2 query_frame, Int2 subject_frame, Int4 score,
155 GapEditScript* *gap_edit, BlastHSP* *ret_hsp)
156 {
157 BlastHSP* new_hsp = NULL;
158
159 if (!ret_hsp)
160 return -1;
161
162 new_hsp = Blast_HSPNew();
163
164 *ret_hsp = NULL;
165
166 if (new_hsp == NULL)
167 return BLASTERR_MEMORY;
168
169
170 new_hsp->query.offset = query_start;
171 new_hsp->subject.offset = subject_start;
172 new_hsp->query.end = query_end;
173 new_hsp->subject.end = subject_end;
174 new_hsp->query.gapped_start = query_gapped_start;
175 new_hsp->subject.gapped_start = subject_gapped_start;
176 new_hsp->context = query_context;
177 new_hsp->query.frame = query_frame;
178 new_hsp->subject.frame = subject_frame;
179 new_hsp->score = score;
180 if (gap_edit && *gap_edit)
181 { /* If this is non-NULL transfer ownership. */
182 new_hsp->gap_info = *gap_edit;
183 *gap_edit = NULL;
184 }
185
186 *ret_hsp = new_hsp;
187
188 return 0;
189 }
190
191
BlastHSPMappingInfoFree(BlastHSPMappingInfo * info)192 BlastHSPMappingInfo* BlastHSPMappingInfoFree(BlastHSPMappingInfo* info)
193 {
194 if (!info) {
195 return NULL;
196 }
197
198 info->edits = JumperEditsBlockFree(info->edits);
199 if (info->subject_overhangs) {
200 SequenceOverhangsFree(info->subject_overhangs);
201 }
202 sfree(info);
203
204 return NULL;
205 }
206
BlastHSPMappingInfoNew(void)207 BlastHSPMappingInfo* BlastHSPMappingInfoNew(void)
208 {
209 BlastHSPMappingInfo* retval = calloc(1, sizeof(BlastHSPMappingInfo));
210 return retval;
211 }
212
BlastHspNumMax(Boolean gapped_calculation,const BlastHitSavingOptions * options)213 Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions* options)
214 {
215 Int4 retval=0;
216
217 /* per-subject HSP limits do not apply to gapped searches; JIRA SB-616 */
218 if (options->hsp_num_max <= 0)
219 {
220 retval = INT4_MAX;
221 }
222 else
223 {
224 retval = options->hsp_num_max;
225 }
226
227 return retval;
228 }
229
230 /** Copies all contents of a BlastHSP structure. Used in PHI BLAST for splitting
231 * results corresponding to different pattern occurrences in query.
232 * @param hsp Original HSP [in]
233 * @return New HSP, copied from the original.
234 */
235 static BlastHSP*
s_BlastHSPCopy(const BlastHSP * hsp)236 s_BlastHSPCopy(const BlastHSP* hsp)
237 {
238 BlastHSP* new_hsp = NULL;
239 /* Do not pass the edit script, because we don't want to tranfer
240 ownership. */
241 Blast_HSPInit(hsp->query.offset, hsp->query.end, hsp->subject.offset,
242 hsp->subject.end, hsp->query.gapped_start,
243 hsp->subject.gapped_start, hsp->context,
244 hsp->query.frame, hsp->subject.frame, hsp->score,
245 NULL, &new_hsp);
246 new_hsp->evalue = hsp->evalue;
247 new_hsp->num = hsp->num;
248 new_hsp->num_ident = hsp->num_ident;
249 new_hsp->bit_score = hsp->bit_score;
250 new_hsp->comp_adjustment_method = hsp->comp_adjustment_method;
251 if (hsp->gap_info) {
252 new_hsp->gap_info = GapEditScriptDup(hsp->gap_info);
253 }
254
255 if (hsp->pat_info) {
256 /* Copy this HSP's pattern data. */
257 new_hsp->pat_info =
258 (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
259 }
260 return new_hsp;
261 }
262
263 /* Make a deep copy of an HSP */
Blast_HSPClone(const BlastHSP * hsp)264 BlastHSP* Blast_HSPClone(const BlastHSP* hsp)
265 {
266 BlastHSP* retval = NULL;
267
268 if (!hsp) {
269 return NULL;
270 }
271
272 retval = Blast_HSPNew();
273 if (retval) {
274 retval->score = hsp->score;
275 retval->num_ident = hsp->num_ident;
276 memcpy(&retval->query, &hsp->query, sizeof(BlastSeg));
277 memcpy(&retval->subject, &hsp->subject, sizeof(BlastSeg));
278 retval->context = hsp->context;
279 retval->evalue = hsp->evalue;
280 retval->bit_score = hsp->bit_score;
281 retval->num = hsp->num;
282 retval->comp_adjustment_method = hsp->comp_adjustment_method;
283 retval->num_positives = hsp->num_positives;
284
285 /* copy gapped traceback */
286 if (hsp->gap_info) {
287 retval->gap_info = GapEditScriptDup(hsp->gap_info);
288 if (!retval->gap_info) {
289 Blast_HSPFree(retval);
290 return NULL;
291 }
292 }
293
294 /* copy short read mapping data */
295 if (hsp->map_info) {
296 retval->map_info = BlastHSPMappingInfoNew();
297 if (!retval->map_info) {
298 Blast_HSPFree(retval);
299 return NULL;
300 }
301 retval->map_info->edits =
302 JumperEditsBlockDup(hsp->map_info->edits);
303 if (!retval->map_info->edits) {
304 Blast_HSPFree(retval);
305 return NULL;
306 }
307 retval->map_info->left_edge = hsp->map_info->left_edge;
308 retval->map_info->right_edge = hsp->map_info->right_edge;
309
310 if (hsp->map_info->subject_overhangs) {
311 SequenceOverhangs* old = hsp->map_info->subject_overhangs;
312 SequenceOverhangs* copy = calloc(1, sizeof(SequenceOverhangs));
313 if (!copy) {
314 Blast_HSPFree(retval);
315 return NULL;
316 }
317
318 if (old->left && old->left_len > 0) {
319 copy->left_len = old->left_len;
320 copy->left = malloc(copy->left_len * sizeof(Uint1));
321 if (!copy->left) {
322 SequenceOverhangsFree(copy);
323 Blast_HSPFree(retval);
324 return NULL;
325 }
326 memcpy(copy->left, old->left,
327 copy->left_len * sizeof(Uint1));
328 }
329
330 if (old->right && old->right_len > 0) {
331 copy->right_len = old->right_len;
332 copy->right = malloc(copy->right_len * sizeof(Uint1));
333 if (!copy->right) {
334 SequenceOverhangsFree(copy);
335 Blast_HSPFree(retval);
336 }
337 memcpy(copy->right, old->right,
338 copy->right_len * sizeof(Uint1));
339 }
340
341 retval->map_info->subject_overhangs = copy;
342 }
343 }
344
345 /* copy phi-blast pattern data */
346 if (hsp->pat_info) {
347 retval->pat_info =
348 (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
349 }
350 }
351
352 return retval;
353 }
354
355 /** Count the number of occurrences of pattern in sequence, which
356 * do not overlap by more than half the pattern match length.
357 * @param query_info Query information structure, containing pattern info. [in]
358 */
359 Int4
PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo * query_info)360 PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo *query_info)
361 {
362 Int4 index; /*loop index*/
363 Int4 lastEffectiveOccurrence; /*last nonoverlapping occurrence*/
364 Int4 count; /* Count of effective (nonoverlapping) occurrences */
365 Int4 min_pattern_length;
366 SPHIQueryInfo* pat_info;
367
368 ASSERT(query_info && query_info->pattern_info && query_info->contexts);
369
370 pat_info = query_info->pattern_info;
371
372 if (pat_info->num_patterns <= 1)
373 return pat_info->num_patterns;
374
375 /* Minimal length of a pattern is saved in the length adjustment field. */
376 min_pattern_length = query_info->contexts[0].length_adjustment;
377
378 count = 1;
379 lastEffectiveOccurrence = pat_info->occurrences[0].offset;
380 for(index = 1; index < pat_info->num_patterns; ++index) {
381 if (((pat_info->occurrences[index].offset - lastEffectiveOccurrence) * 2)
382 > min_pattern_length) {
383 lastEffectiveOccurrence = pat_info->occurrences[index].offset;
384 ++count;
385 }
386 }
387
388 return count;
389 }
390
391
392 /** Calculate e-value for an HSP found by PHI BLAST.
393 * @param hsp An HSP found by PHI BLAST [in]
394 * @param sbp Scoring block with statistical parameters [in]
395 * @param query_info Structure containing information about pattern counts [in]
396 * @param pattern_blk Structure containing counts of PHI pattern hits [in]
397 */
398 static void
s_HSPPHIGetEvalue(BlastHSP * hsp,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)399 s_HSPPHIGetEvalue(BlastHSP* hsp, BlastScoreBlk* sbp,
400 const BlastQueryInfo* query_info,
401 const SPHIPatternSearchBlk* pattern_blk)
402 {
403 double paramC;
404 double Lambda;
405
406 ASSERT(query_info && hsp && sbp && pattern_blk);
407
408 paramC = sbp->kbp[0]->paramC;
409 Lambda = sbp->kbp[0]->Lambda;
410
411 /* We have the actual number of occurrences of pattern in db. */
412 hsp->evalue = paramC*(1+Lambda*hsp->score)*
413 PhiBlastGetEffectiveNumberOfPatterns(query_info)*
414 pattern_blk->num_patterns_db*
415 exp(-Lambda*hsp->score);
416 }
417
418 /** Update HSP data after reevaluation with ambiguities. In particular this
419 * function calculates number of identities and checks if the percent identity
420 * criterion is satisfied.
421 * @param hsp HSP to update [in] [out]
422 * @param gapped Is this a gapped search? [in]
423 * @param cutoff_score Cutoff score for saving the HSP [in]
424 * @param score New score [in]
425 * @param query_start Start of query sequence [in]
426 * @param subject_start Start of subject sequence [in]
427 * @param best_q_start Pointer to start of the new alignment in query [in]
428 * @param best_q_end Pointer to end of the new alignment in query [in]
429 * @param best_s_start Pointer to start of the new alignment in subject [in]
430 * @param best_s_end Pointer to end of the new alignment in subject [in]
431 * @param best_start_esp_index index of the edit script array where the new alignment
432 * starts. [in]
433 * @param best_end_esp_index index in the edit script array where the new alignment
434 * ends. [in]
435 * @param best_end_esp_num Number of edit operations in the last edit script,
436 * that are included in the alignment. [in]
437 * @return TRUE if HSP is scheduled to be deleted.
438 */
439 static Boolean
s_UpdateReevaluatedHSP(BlastHSP * hsp,Boolean gapped,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end,int best_start_esp_index,int best_end_esp_index,int best_end_esp_num)440 s_UpdateReevaluatedHSP(BlastHSP* hsp, Boolean gapped,
441 Int4 cutoff_score,
442 Int4 score, const Uint1* query_start, const Uint1* subject_start,
443 const Uint1* best_q_start, const Uint1* best_q_end,
444 const Uint1* best_s_start, const Uint1* best_s_end,
445 int best_start_esp_index,
446 int best_end_esp_index,
447 int best_end_esp_num)
448 {
449 Boolean delete_hsp = TRUE;
450
451 hsp->score = score;
452
453 if (hsp->score >= cutoff_score) {
454 /* Update all HSP offsets. */
455 hsp->query.offset = best_q_start - query_start;
456 hsp->query.end = hsp->query.offset + best_q_end - best_q_start;
457 hsp->subject.offset = best_s_start - subject_start;
458 hsp->subject.end = hsp->subject.offset + best_s_end - best_s_start;
459
460 if (gapped) {
461 int last_num=hsp->gap_info->size - 1;
462 if (best_end_esp_index != last_num|| best_start_esp_index > 0)
463 {
464 GapEditScript* esp_temp = GapEditScriptNew(best_end_esp_index-best_start_esp_index+1);
465 GapEditScriptPartialCopy(esp_temp, 0, hsp->gap_info, best_start_esp_index, best_end_esp_index);
466 hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
467 hsp->gap_info = esp_temp;
468 }
469 last_num = hsp->gap_info->size - 1;
470 hsp->gap_info->num[last_num] = best_end_esp_num;
471 ASSERT(best_end_esp_num >= 0);
472 }
473 delete_hsp = FALSE;
474 }
475
476 return delete_hsp;
477 }
478
Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP * hsp,const Uint1 * q,const Int4 qlen,const Uint1 * s,const Int4 slen,const BlastHitSavingParameters * hit_params,const BlastScoringParameters * score_params,const BlastScoreBlk * sbp)479 Boolean Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP* hsp,
480 const Uint1* q, const Int4 qlen,
481 const Uint1* s, const Int4 slen,
482 const BlastHitSavingParameters* hit_params,
483 const BlastScoringParameters* score_params,
484 const BlastScoreBlk* sbp)
485 {
486 Int4 sum, score, gap_open, gap_extend;
487 Int4 index; /* loop index */
488 Int4 qp, sp, ext;
489
490 int best_start_esp_index = 0;
491 int best_end_esp_index = 0;
492 int current_start_esp_index = 0;
493 int best_end_esp_num = 0;
494 GapEditScript* esp; /* Used to hold GapEditScript of hsp->gap_info */
495
496 const Uint1* best_q_start; /* Start of the best scoring part in query. */
497 const Uint1* best_s_start; /* Start of the best scoring part in subject. */
498 const Uint1* best_q_end; /* End of the best scoring part in query. */
499 const Uint1* best_s_end; /* End of the best scoring part in subject. */
500
501
502 const Uint1* current_q_start; /* Start of the current part of the alignment in
503 query. */
504 const Uint1* current_s_start; /* Start of the current part of the alignment in
505 subject. */
506
507 const Uint1* query,* subject;
508 Int4** matrix;
509 Int2 factor = 1;
510 const Uint1 kResidueMask = 0x0f;
511 Int4 cutoff_score = hit_params->cutoffs[hsp->context].cutoff_score;
512
513 matrix = sbp->matrix->data;
514
515 /* For a non-affine greedy case, calculate the real value of the gap
516 extension penalty. Multiply all scores by 2 if it is not integer. */
517 if (score_params->gap_open == 0 && score_params->gap_extend == 0) {
518 if (score_params->reward % 2 == 1)
519 factor = 2;
520 gap_open = 0;
521 gap_extend =
522 (score_params->reward - 2*score_params->penalty) * factor / 2;
523 } else {
524 gap_open = score_params->gap_open;
525 gap_extend = score_params->gap_extend;
526 }
527
528 query = q + hsp->query.offset;
529 subject = s + hsp->subject.offset;
530 score = 0;
531 sum = 0;
532
533 /* Point all pointers to the beginning of the alignment. */
534 best_q_start = best_q_end = current_q_start = query;
535 best_s_start = best_s_end = current_s_start = subject;
536 /* There are no previous edit scripts at the beginning. */
537
538 best_end_esp_num = -1;
539 esp = hsp->gap_info;
540 if (!esp) return TRUE;
541 for (index=0; index<esp->size; index++)
542 {
543 int op_index = 0; /* Index of an operation within a single edit script. */
544 for (op_index=0; op_index<esp->num[index]; )
545 {
546 /* Process substitutions one operation at a time, full gaps in one step. */
547 if (esp->op_type[index] == eGapAlignSub) {
548 sum += factor*matrix[*query & kResidueMask][*subject];
549 query++;
550 subject++;
551 op_index++;
552 } else if (esp->op_type[index] == eGapAlignDel) {
553 sum -= gap_open + gap_extend * esp->num[index];
554 subject += esp->num[index];
555 op_index += esp->num[index];
556 } else if (esp->op_type[index] == eGapAlignIns) {
557 sum -= gap_open + gap_extend * esp->num[index];
558 query += esp->num[index];
559 op_index += esp->num[index];
560 }
561
562 if (sum < 0) {
563 /* Point current edit script chain start to the new place.
564 If we are in the middle of an edit script, reduce its length and
565 point operation index to the beginning of a modified edit script;
566 if we are at the end, move to the next edit script. */
567 if (op_index < esp->num[index]) {
568 esp->num[index] -= op_index;
569 current_start_esp_index = index;
570 op_index = 0;
571 } else {
572 current_start_esp_index = index + 1;
573 }
574 /* Set sum to 0, to start a fresh count. */
575 sum = 0;
576 /* Set current starting positions in sequences to the new start. */
577 current_q_start = query;
578 current_s_start = subject;
579
580 /* If score has passed the cutoff at some point, leave the best score
581 and edit scripts positions information untouched, otherwise reset
582 the best score to 0 and point the best edit script positions to
583 the new start. */
584 if (score < cutoff_score) {
585 /* Start from new offset; discard all previous information. */
586 best_q_start = query;
587 best_s_start = subject;
588 score = 0;
589
590 /* Set best start and end edit script pointers to new start. */
591 best_start_esp_index = current_start_esp_index;
592 best_end_esp_index = current_start_esp_index;
593 }
594 /* break; */ /* start on next GapEditScript. */
595 } else if (sum > score) {
596 /* Remember this point as the best scoring end point, and the current
597 start of the alignment as the start of the best alignment. */
598 score = sum;
599
600 best_q_start = current_q_start;
601 best_s_start = current_s_start;
602 best_q_end = query;
603 best_s_end = subject;
604
605 best_start_esp_index = current_start_esp_index;
606 best_end_esp_index = index;
607 best_end_esp_num = op_index;
608 }
609 }
610 } /* loop on edit scripts */
611
612 score /= factor;
613
614 if (best_start_esp_index < esp->size && best_end_esp_index < esp->size) {
615 /* post processing: try to extend further */
616 ASSERT(esp->op_type[best_start_esp_index] == eGapAlignSub);
617 ASSERT(esp->op_type[best_end_esp_index] == eGapAlignSub);
618
619 qp = best_q_start - q;
620 sp = best_s_start - s;
621 ext = 0;
622 while(qp > 0 && sp > 0 && (q[--qp] == s[--sp]) && q[qp]<4) ext++;
623 best_q_start -= ext;
624 best_s_start -= ext;
625 esp->num[best_start_esp_index] += ext;
626 if (best_end_esp_index == best_start_esp_index) best_end_esp_num += ext;
627 score += ext * score_params->reward;
628
629 qp = best_q_end - q;
630 sp = best_s_end - s;
631 ext = 0;
632 while(qp < qlen && sp < slen && q[qp]<4 && (q[qp++] == s[sp++])) ext++;
633 best_q_end += ext;
634 best_s_end += ext;
635 esp->num[best_end_esp_index] += ext;
636 best_end_esp_num += ext;
637 score += ext * score_params->reward;
638 }
639
640 /* Update HSP data. */
641 return
642 s_UpdateReevaluatedHSP(hsp, TRUE, cutoff_score,
643 score, q, s, best_q_start,
644 best_q_end, best_s_start, best_s_end,
645 best_start_esp_index, best_end_esp_index,
646 best_end_esp_num);
647 }
648
649 /** Update HSP data after reevaluation with ambiguities for an ungapped search.
650 * In particular this function calculates number of identities and checks if the
651 * percent identity criterion is satisfied.
652 * @param hsp HSP to update [in] [out]
653 * @param cutoff_score Cutoff score for saving the HSP [in]
654 * @param score New score [in]
655 * @param query_start Start of query sequence [in]
656 * @param subject_start Start of subject sequence [in]
657 * @param best_q_start Pointer to start of the new alignment in query [in]
658 * @param best_q_end Pointer to end of the new alignment in query [in]
659 * @param best_s_start Pointer to start of the new alignment in subject [in]
660 * @param best_s_end Pointer to end of the new alignment in subject [in]
661 * @return TRUE if HSP is scheduled to be deleted.
662 */
663 static Boolean
s_UpdateReevaluatedHSPUngapped(BlastHSP * hsp,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end)664 s_UpdateReevaluatedHSPUngapped(BlastHSP* hsp, Int4 cutoff_score, Int4 score,
665 const Uint1* query_start, const Uint1* subject_start,
666 const Uint1* best_q_start, const Uint1* best_q_end,
667 const Uint1* best_s_start, const Uint1* best_s_end)
668 {
669 return
670 s_UpdateReevaluatedHSP(hsp, FALSE, cutoff_score, score, query_start,
671 subject_start, best_q_start, best_q_end,
672 best_s_start, best_s_end, 0, 0, 0);
673 }
674
675 Boolean
Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP * hsp,const Uint1 * query_start,const Uint1 * subject_start,const BlastInitialWordParameters * word_params,BlastScoreBlk * sbp,Boolean translated)676 Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP* hsp, const Uint1* query_start,
677 const Uint1* subject_start, const BlastInitialWordParameters* word_params,
678 BlastScoreBlk* sbp, Boolean translated)
679 {
680 Int4 sum, score;
681 Int4** matrix;
682 const Uint1* query,* subject;
683 const Uint1* best_q_start,* best_s_start,* best_q_end,* best_s_end;
684 const Uint1* current_q_start, * current_s_start;
685 Int4 index;
686 const Uint1 kResidueMask = (translated ? 0xff : 0x0f);
687 Int4 hsp_length = hsp->query.end - hsp->query.offset;
688 Int4 cutoff_score = word_params->cutoffs[hsp->context].cutoff_score;
689
690 matrix = sbp->matrix->data;
691
692 query = query_start + hsp->query.offset;
693 subject = subject_start + hsp->subject.offset;
694 score = 0;
695 sum = 0;
696 best_q_start = best_q_end = current_q_start = query;
697 best_s_start = best_s_end = current_s_start = subject;
698
699 for (index = 0; index < hsp_length; ++index) {
700 sum += matrix[*query & kResidueMask][*subject];
701 query++;
702 subject++;
703 if (sum < 0) {
704 /* Start from new offset */
705 sum = 0;
706 current_q_start = query;
707 current_s_start = subject;
708 /* If previous top score never reached the cutoff, discard the front
709 part of the alignment completely. Otherwise keep pointer to the
710 top-scoring front part. */
711 if (score < cutoff_score) {
712 best_q_start = best_q_end = query;
713 best_s_start = best_s_end = subject;
714 score = 0;
715 }
716 } else if (sum > score) {
717 /* Remember this point as the best scoring end point */
718 score = sum;
719 best_q_end = query;
720 best_s_end = subject;
721 /* Set start of alignment to the current start, dismissing the
722 previous top-scoring piece. */
723 best_q_start = current_q_start;
724 best_s_start = current_s_start;
725 }
726 }
727
728 /* Update HSP data. */
729 return
730 s_UpdateReevaluatedHSPUngapped(hsp, cutoff_score, score,
731 query_start, subject_start, best_q_start,
732 best_q_end, best_s_start, best_s_end);
733 }
734
735 /** Calculate number of identities in a regular HSP.
736 * @param query The query sequence [in]
737 * @param subject The uncompressed subject sequence [in]
738 * @param hsp All information about the HSP [in]
739 * @param num_ident_ptr Number of identities [out]
740 * @param align_length_ptr The alignment length, including gaps [out]
741 * @param sbp Blast score blk [in]
742 * @param num_pos_ptr Number of Positives [out]
743 * @return 0 on success, -1 on invalid parameters or error
744 */
745 static Int2
s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)746 s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query, const Uint1* subject,
747 const BlastHSP* hsp, Int4* num_ident_ptr,
748 Int4* align_length_ptr, const BlastScoreBlk* sbp,
749 Int4* num_pos_ptr)
750 {
751 Int4 i, num_ident, align_length, q_off, s_off;
752 Uint1* q,* s;
753 Int4 q_length = hsp->query.end - hsp->query.offset;
754 Int4 s_length = hsp->subject.end - hsp->subject.offset;
755 Int4** matrix = NULL;
756 Int4 num_pos = 0;
757
758 q_off = hsp->query.offset;
759 s_off = hsp->subject.offset;
760
761 if ( !subject || !query || !hsp )
762 return -1;
763
764 q = (Uint1*) &query[q_off];
765 s = (Uint1*) &subject[s_off];
766
767 num_ident = 0;
768 align_length = 0;
769
770 if(NULL != sbp)
771 {
772 if(sbp->protein_alphabet)
773 matrix = sbp->matrix->data;
774 }
775
776 if (!hsp->gap_info) {
777 /* Ungapped case. Check that lengths are the same in query and subject,
778 then count number of matches. */
779 if (q_length != s_length)
780 return -1;
781 align_length = q_length;
782 for (i=0; i<align_length; i++) {
783 if (*q == *s)
784 num_ident++;
785 else if (NULL != matrix) {
786 if (matrix[*q][*s] > 0)
787 num_pos ++;
788 }
789 q++;
790 s++;
791 }
792 }
793 else {
794 Int4 index;
795 GapEditScript* esp = hsp->gap_info;
796 for (index=0; index<esp->size; index++)
797 {
798 align_length += esp->num[index];
799 switch (esp->op_type[index]) {
800 case eGapAlignSub:
801 for (i=0; i<esp->num[index]; i++) {
802 if (*q == *s) {
803 num_ident++;
804 }
805 else if (NULL != matrix) {
806 if (matrix[*q][*s] > 0)
807 num_pos ++;
808 }
809 q++;
810 s++;
811 }
812 break;
813 case eGapAlignDel:
814 s += esp->num[index];
815 break;
816 case eGapAlignIns:
817 q += esp->num[index];
818 break;
819 default:
820 s += esp->num[index];
821 q += esp->num[index];
822 break;
823 }
824 }
825 }
826
827 if (align_length_ptr) {
828 *align_length_ptr = align_length;
829 }
830 *num_ident_ptr = num_ident;
831
832 if(NULL != matrix)
833 *num_pos_ptr = num_pos + num_ident;
834
835 return 0;
836 }
837
838 /** Calculate number of identities in an HSP for an out-of-frame alignment.
839 * @param query The query sequence [in]
840 * @param subject The uncompressed subject sequence [in]
841 * @param hsp All information about the HSP [in]
842 * @param program BLAST program (blastx or tblastn) [in]
843 * @param num_ident_ptr Number of identities [out]
844 * @param align_length_ptr The alignment length, including gaps [out]
845 * @param sbp Blast score blk [in]
846 * @param num_pos_ptr Number of Positives [out]
847 * @return 0 on success, -1 on invalid parameters or error
848 */
849 static Int2
s_Blast_HSPGetOOFNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,EBlastProgramType program,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)850 s_Blast_HSPGetOOFNumIdentitiesAndPositives(
851 const Uint1* query, const Uint1* subject,
852 const BlastHSP* hsp, EBlastProgramType program,
853 Int4* num_ident_ptr, Int4* align_length_ptr,
854 const BlastScoreBlk* sbp, Int4* num_pos_ptr)
855 {
856 Int4 num_ident, align_length;
857 Int4 index;
858 Uint1* q,* s;
859 GapEditScript* esp;
860 Int4 ** matrix = NULL;
861 Int4 num_pos = 0;
862
863 if (!hsp->gap_info || !subject || !query)
864 return -1;
865
866 if(NULL != sbp)
867 {
868 if(sbp->protein_alphabet)
869 matrix = sbp->matrix->data;
870 }
871
872 if (program == eBlastTypeTblastn ||
873 program == eBlastTypeRpsTblastn) {
874 q = (Uint1*) &query[hsp->query.offset];
875 s = (Uint1*) &subject[hsp->subject.offset];
876 } else {
877 s = (Uint1*) &query[hsp->query.offset];
878 q = (Uint1*) &subject[hsp->subject.offset];
879 }
880 num_ident = 0;
881 align_length = 0;
882
883 esp = hsp->gap_info;
884 for (index=0; index<esp->size; index++)
885 {
886 int i;
887 switch (esp->op_type[index]) {
888 case eGapAlignSub: /* Substitution */
889 align_length += esp->num[index];
890 for (i=0; i<esp->num[index]; i++) {
891 if (*q == *s)
892 num_ident++;
893 else if (NULL != matrix) {
894 if (matrix[*q][*s] > 0)
895 num_pos ++;
896 }
897 ++q;
898 s += CODON_LENGTH;
899 }
900 break;
901 case eGapAlignIns: /* Insertion */
902 align_length += esp->num[index];
903 s += esp->num[index] * CODON_LENGTH;
904 break;
905 case eGapAlignDel: /* Deletion */
906 align_length += esp->num[index];
907 q += esp->num[index];
908 break;
909 case eGapAlignDel2: /* Gap of two nucleotides. */
910 s -= 2;
911 break;
912 case eGapAlignDel1: /* Gap of one nucleotide. */
913 s -= 1;
914 break;
915 case eGapAlignIns1: /* Insertion of one nucleotide. */
916 s += 1;
917 break;
918 case eGapAlignIns2: /* Insertion of two nucleotides. */
919 s += 2;
920 break;
921 default:
922 s += esp->num[index] * CODON_LENGTH;
923 q += esp->num[index];
924 break;
925 }
926 }
927
928 if (align_length_ptr) {
929 *align_length_ptr = align_length;
930 }
931 *num_ident_ptr = num_ident;
932
933 if(NULL != matrix)
934 *num_pos_ptr = num_pos + num_ident;
935
936 return 0;
937 }
938
939 Int2
Blast_HSPGetNumIdentities(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr)940 Blast_HSPGetNumIdentities(const Uint1* query,
941 const Uint1* subject,
942 BlastHSP* hsp,
943 const BlastScoringOptions* score_options,
944 Int4* align_length_ptr)
945 {
946 Int2 retval = 0;
947
948 /* Calculate alignment length and number of identical letters.
949 Do not get the number of identities if the query is not available */
950 if (score_options->is_ooframe) {
951 retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
952 score_options->program_number,
953 &hsp->num_ident,
954 align_length_ptr,
955 NULL, NULL);
956 } else {
957 retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
958 &hsp->num_ident,
959 align_length_ptr,
960 NULL, NULL);
961 }
962
963 return retval;
964 }
965 Int2
Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr,const BlastScoreBlk * sbp)966 Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query,
967 const Uint1* subject,
968 BlastHSP* hsp,
969 const BlastScoringOptions* score_options,
970 Int4* align_length_ptr,
971 const BlastScoreBlk * sbp)
972 {
973 Int2 retval = 0;
974
975 /* Calculate alignment length and number of identical letters.
976 Do not get the number of identities if the query is not available */
977 if (score_options->is_ooframe) {
978 retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
979 score_options->program_number,
980 &hsp->num_ident,
981 align_length_ptr,
982 sbp, &hsp->num_positives);
983 } else {
984 retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
985 &hsp->num_ident,
986 align_length_ptr,
987 sbp, &hsp->num_positives);
988 }
989
990 return retval;
991 }
992
s_HSPTest(const BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)993 static Boolean s_HSPTest(const BlastHSP* hsp,
994 const BlastHitSavingOptions* hit_options,
995 Int4 align_length)
996 {
997 return ((hsp->num_ident * 100.0 <
998 align_length * hit_options->percent_identity) ||
999 align_length < hit_options->min_hit_length) ;
1000
1001 }
1002
1003 Boolean
Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,BlastHSP * hsp,const Uint1 * query,const Uint1 * subject,const BlastScoringOptions * score_options,const BlastHitSavingOptions * hit_options)1004 Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,
1005 BlastHSP* hsp, const Uint1* query, const Uint1* subject,
1006 const BlastScoringOptions* score_options,
1007 const BlastHitSavingOptions* hit_options)
1008 {
1009 Int4 align_length = 0;
1010 Boolean delete_hsp = FALSE;
1011 Int2 status = 0;
1012
1013 ASSERT(hsp && query && subject && score_options && hit_options);
1014
1015 status = Blast_HSPGetNumIdentities(query, subject, hsp, score_options,
1016 &align_length);
1017 ASSERT(status == 0);
1018 (void)status; /* to pacify compiler warning */
1019
1020 /* Check whether this HSP passes the percent identity and minimal hit
1021 length criteria, and delete it if it does not. */
1022 delete_hsp = s_HSPTest(hsp, hit_options, align_length);
1023
1024 return delete_hsp;
1025 }
1026
Blast_HSPTest(BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)1027 Boolean Blast_HSPTest(BlastHSP* hsp,
1028 const BlastHitSavingOptions* hit_options,
1029 Int4 align_length)
1030 {
1031 return s_HSPTest(hsp, hit_options, align_length);
1032 }
1033
Blast_HSPGetQueryCoverage(const BlastHSP * hsp,Int4 query_length)1034 double Blast_HSPGetQueryCoverage(const BlastHSP* hsp, Int4 query_length)
1035 {
1036 double pct = 0;
1037 if(query_length > 0) {
1038 pct = 100.0 * (double) (hsp->query.end - hsp->query.offset)/ (double) query_length;
1039 if(pct < 99)
1040 pct +=0.5;
1041 }
1042 return pct;
1043 }
1044
Blast_HSPQueryCoverageTest(BlastHSP * hsp,double min_query_coverage_pct,Int4 query_length)1045 Boolean Blast_HSPQueryCoverageTest(BlastHSP* hsp,
1046 double min_query_coverage_pct,
1047 Int4 query_length)
1048 {
1049 double hsp_coverage = Blast_HSPGetQueryCoverage( hsp, query_length);
1050 return (hsp_coverage < min_query_coverage_pct);
1051 }
1052
1053
1054 void
Blast_HSPCalcLengthAndGaps(const BlastHSP * hsp,Int4 * length_out,Int4 * gaps_out,Int4 * gap_opens_out)1055 Blast_HSPCalcLengthAndGaps(const BlastHSP* hsp, Int4* length_out,
1056 Int4* gaps_out, Int4* gap_opens_out)
1057 {
1058 Int4 length = hsp->query.end - hsp->query.offset;
1059 Int4 s_length = hsp->subject.end - hsp->subject.offset;
1060 Int4 gap_opens = 0, gaps = 0;
1061
1062 if (hsp->gap_info) {
1063 GapEditScript* esp = hsp->gap_info;
1064 Int4 index;
1065 for (index=0; index<esp->size; index++) {
1066 if (esp->op_type[index] == eGapAlignDel) {
1067 length += esp->num[index];
1068 gaps += esp->num[index];
1069 ++gap_opens;
1070 } else if (esp->op_type[index] == eGapAlignIns) {
1071 ++gap_opens;
1072 gaps += esp->num[index];
1073 }
1074 }
1075 } else if (s_length > length) {
1076 length = s_length;
1077 }
1078
1079 *length_out = length;
1080 *gap_opens_out = gap_opens;
1081 *gaps_out = gaps;
1082 }
1083
1084 /** Adjust start and end of an HSP in a translated sequence segment.
1085 * @param segment BlastSeg structure (part of BlastHSP) [in]
1086 * @param seq_length Length of the full sequence [in]
1087 * @param start Start of the alignment in this segment in nucleotide
1088 * coordinates, 1-offset [out]
1089 * @param end End of the alignment in this segment in nucleotide
1090 * coordinates, 1-offset [out]
1091 */
1092 static void
s_BlastSegGetTranslatedOffsets(const BlastSeg * segment,Int4 seq_length,Int4 * start,Int4 * end)1093 s_BlastSegGetTranslatedOffsets(const BlastSeg* segment, Int4 seq_length,
1094 Int4* start, Int4* end)
1095 {
1096 if (segment->frame < 0) {
1097 *start = seq_length - CODON_LENGTH*segment->offset + segment->frame;
1098 *end = seq_length - CODON_LENGTH*segment->end + segment->frame + 1;
1099 } else if (segment->frame > 0) {
1100 *start = CODON_LENGTH*(segment->offset) + segment->frame - 1;
1101 *end = CODON_LENGTH*segment->end + segment->frame - 2;
1102 } else {
1103 *start = segment->offset + 1;
1104 *end = segment->end;
1105 }
1106 }
1107
1108 void
Blast_HSPGetAdjustedOffsets(EBlastProgramType program,BlastHSP * hsp,Int4 query_length,Int4 subject_length,Int4 * q_start,Int4 * q_end,Int4 * s_start,Int4 * s_end)1109 Blast_HSPGetAdjustedOffsets(EBlastProgramType program, BlastHSP* hsp,
1110 Int4 query_length, Int4 subject_length,
1111 Int4* q_start, Int4* q_end,
1112 Int4* s_start, Int4* s_end)
1113 {
1114 if (!hsp->gap_info) {
1115 *q_start = hsp->query.offset + 1;
1116 *q_end = hsp->query.end;
1117 *s_start = hsp->subject.offset + 1;
1118 *s_end = hsp->subject.end;
1119 return;
1120 }
1121
1122 if (!Blast_QueryIsTranslated(program) &&
1123 !Blast_SubjectIsTranslated(program)) {
1124 if (hsp->query.frame != hsp->subject.frame) {
1125 /* Blastn: if different strands, flip offsets in query; leave
1126 offsets in subject as they are, but change order for correct
1127 correspondence. */
1128 *q_end = query_length - hsp->query.offset;
1129 *q_start = *q_end - hsp->query.end + hsp->query.offset + 1;
1130 *s_end = hsp->subject.offset + 1;
1131 *s_start = hsp->subject.end;
1132 } else {
1133 *q_start = hsp->query.offset + 1;
1134 *q_end = hsp->query.end;
1135 *s_start = hsp->subject.offset + 1;
1136 *s_end = hsp->subject.end;
1137 }
1138 } else {
1139 s_BlastSegGetTranslatedOffsets(&hsp->query, query_length, q_start, q_end);
1140 s_BlastSegGetTranslatedOffsets(&hsp->subject, subject_length,
1141 q_start, s_end);
1142 }
1143 }
1144
1145
1146 const Uint1*
Blast_HSPGetTargetTranslation(SBlastTargetTranslation * target_t,const BlastHSP * hsp,Int4 * translated_length)1147 Blast_HSPGetTargetTranslation(SBlastTargetTranslation* target_t,
1148 const BlastHSP* hsp, Int4* translated_length)
1149 {
1150 Int4 context = -1;
1151 Int4 start, stop;
1152
1153 ASSERT(target_t != NULL);
1154
1155 if (hsp == NULL)
1156 return NULL;
1157
1158 context = BLAST_FrameToContext(hsp->subject.frame, target_t->program_number);
1159 start = target_t->range[2*context];
1160 stop = target_t->range[2*context+1];
1161
1162 /* skip translation if full translation has already been done */
1163 if (target_t->partial && (start ||
1164 (stop < target_t->subject_blk->length / CODON_LENGTH -3)))
1165 {
1166 const int kMaxTranslation = 99; /* Needs to be divisible by three (?) */
1167 Int4 nucl_length = 0;
1168 Int4 translation_length = 0;
1169 Int4 nucl_start = 0;
1170 Int4 nucl_end = 0;
1171 Int4 nucl_shift = 0;
1172 Int4 start_shift = 0;
1173
1174 /* HSP coordinates are in terms of protein sequences. */
1175 if (hsp->subject.offset < 0) {
1176 nucl_start = 0;
1177 nucl_end = target_t->subject_blk->length;
1178 } else {
1179 nucl_start = MAX(0, 3*hsp->subject.offset - kMaxTranslation);
1180 nucl_end = MIN(target_t->subject_blk->length, 3*hsp->subject.end + kMaxTranslation);
1181 /* extend to the end of the sequence if close */
1182 if (target_t->subject_blk->length - nucl_end <= 21) {
1183 nucl_end = target_t->subject_blk->length;
1184 }
1185 }
1186
1187 nucl_length = nucl_end - nucl_start;
1188
1189 translation_length = 1+nucl_length/CODON_LENGTH;
1190 start_shift = nucl_start/CODON_LENGTH;
1191
1192 if (hsp->subject.frame < 0)
1193 nucl_shift = target_t->subject_blk->length - nucl_start - nucl_length;
1194 else
1195 nucl_shift = nucl_start;
1196
1197 if (start_shift < start || start_shift+translation_length > stop) {
1198 /* needs re-translation */
1199 Int4 length = 0; /* actual translation length */
1200 Uint1* nucl_seq = target_t->subject_blk->sequence + nucl_shift;
1201 Uint1* nucl_seq_rev = NULL;
1202
1203 target_t->range[2*context] = start_shift;
1204
1205 if (translation_length > stop-start) {
1206 /* needs re-allocation */
1207 sfree(target_t->translations[context]);
1208 target_t->translations[context] = (Uint1*) malloc(translation_length+2);
1209 }
1210
1211 if (hsp->subject.frame < 0) {
1212 /* needs reverse sequence */
1213 GetReverseNuclSequence(nucl_seq, nucl_length, &nucl_seq_rev);
1214 }
1215
1216 length = BLAST_GetTranslation(nucl_seq, nucl_seq_rev,
1217 nucl_length, hsp->subject.frame, target_t->translations[context],
1218 target_t->gen_code_string);
1219
1220 target_t->range[2*context+1] = start_shift + length;
1221
1222 sfree(nucl_seq_rev);
1223
1224 /* partial translation needs to be fenced */
1225 if(hsp->subject.offset >= 0) {
1226 target_t->translations[context][0] = FENCE_SENTRY;
1227 target_t->translations[context][length+1] = FENCE_SENTRY;
1228 }
1229 }
1230 }
1231 if (translated_length)
1232 *translated_length = target_t->range[2*context+1];
1233
1234 /* +1 as the first byte is a sentinel. */
1235 return target_t->translations[context] - target_t->range[2*context] + 1;
1236 }
1237
1238 Int2
Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk * subject_blk,BlastHSP * hsp,Boolean is_ooframe,const Uint1 * gen_code_string,Uint1 ** translation_buffer_ptr,Uint1 ** subject_ptr,Int4 * subject_length_ptr,Int4 * start_shift_ptr)1239 Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk* subject_blk,
1240 BlastHSP* hsp,
1241 Boolean is_ooframe,
1242 const Uint1* gen_code_string,
1243 Uint1** translation_buffer_ptr,
1244 Uint1** subject_ptr,
1245 Int4* subject_length_ptr,
1246 Int4* start_shift_ptr)
1247 {
1248 Int4 translation_length;
1249 Uint1* translation_buffer;
1250 Uint1* subject;
1251 Int4 start_shift;
1252 Int4 nucl_shift;
1253 Int2 status = 0;
1254
1255 ASSERT(subject_blk && hsp && gen_code_string && translation_buffer_ptr &&
1256 subject_ptr && subject_length_ptr && start_shift_ptr);
1257
1258 translation_buffer = *translation_buffer_ptr;
1259 sfree(translation_buffer);
1260
1261 if (!is_ooframe) {
1262 start_shift =
1263 MAX(0, 3*hsp->subject.offset - MAX_FULL_TRANSLATION);
1264 translation_length =
1265 MIN(3*hsp->subject.end + MAX_FULL_TRANSLATION,
1266 subject_blk->length) - start_shift;
1267 if (hsp->subject.frame > 0) {
1268 nucl_shift = start_shift;
1269 } else {
1270 nucl_shift = subject_blk->length - start_shift - translation_length;
1271 }
1272 status = (Int2)
1273 Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1274 translation_length, hsp->subject.frame,
1275 gen_code_string, &translation_buffer,
1276 subject_length_ptr, NULL);
1277 /* Below, the start_shift will be used for the protein
1278 coordinates, so need to divide it by 3 */
1279 start_shift /= CODON_LENGTH;
1280 } else {
1281 Int4 oof_end;
1282 oof_end = subject_blk->length;
1283
1284 start_shift =
1285 MAX(0, hsp->subject.offset - MAX_FULL_TRANSLATION);
1286 translation_length =
1287 MIN(hsp->subject.end + MAX_FULL_TRANSLATION, oof_end) - start_shift;
1288 if (hsp->subject.frame > 0) {
1289 nucl_shift = start_shift;
1290 } else {
1291 nucl_shift = oof_end - start_shift - translation_length;
1292 }
1293 status = (Int2)
1294 Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1295 translation_length, hsp->subject.frame,
1296 gen_code_string, NULL,
1297 subject_length_ptr, &translation_buffer);
1298 }
1299 hsp->subject.offset -= start_shift;
1300 hsp->subject.end -= start_shift;
1301 hsp->subject.gapped_start -= start_shift;
1302 *translation_buffer_ptr = translation_buffer;
1303 *start_shift_ptr = start_shift;
1304
1305 if (!is_ooframe) {
1306 subject = translation_buffer + 1;
1307 } else {
1308 subject = translation_buffer + CODON_LENGTH;
1309 }
1310 *subject_ptr = subject;
1311
1312 return status;
1313 }
1314
1315 void
Blast_HSPAdjustSubjectOffset(BlastHSP * hsp,Int4 start_shift)1316 Blast_HSPAdjustSubjectOffset(BlastHSP* hsp, Int4 start_shift)
1317 {
1318 /* Adjust subject offsets if shifted (partial) sequence was used
1319 for extension */
1320 if (start_shift > 0) {
1321 hsp->subject.offset += start_shift;
1322 hsp->subject.end += start_shift;
1323 hsp->subject.gapped_start += start_shift;
1324 }
1325
1326 return;
1327 }
1328
1329 int
ScoreCompareHSPs(const void * h1,const void * h2)1330 ScoreCompareHSPs(const void* h1, const void* h2)
1331 {
1332 BlastHSP* hsp1,* hsp2; /* the HSPs to be compared */
1333 int result = 0; /* the result of the comparison */
1334
1335 hsp1 = *((BlastHSP**) h1);
1336 hsp2 = *((BlastHSP**) h2);
1337
1338 /* Null HSPs are "greater" than any non-null ones, so they go to the end
1339 of a sorted list. */
1340 if (!hsp1 && !hsp2)
1341 return 0;
1342 else if (!hsp1)
1343 return 1;
1344 else if (!hsp2)
1345 return -1;
1346
1347 if (0 == (result = BLAST_CMP(hsp2->score, hsp1->score)) &&
1348 0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
1349 0 == (result = BLAST_CMP(hsp2->subject.end, hsp1->subject.end)) &&
1350 0 == (result = BLAST_CMP(hsp1->query .offset, hsp2->query .offset))) {
1351 /* if all other test can't distinguish the HSPs, then the final
1352 test is the result */
1353 result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
1354 }
1355 return result;
1356 }
1357
Blast_HSPListIsSortedByScore(const BlastHSPList * hsp_list)1358 Boolean Blast_HSPListIsSortedByScore(const BlastHSPList* hsp_list)
1359 {
1360 Int4 index;
1361
1362 if (!hsp_list || hsp_list->hspcnt <= 1)
1363 return TRUE;
1364
1365 for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1366 if (ScoreCompareHSPs(&hsp_list->hsp_array[index],
1367 &hsp_list->hsp_array[index+1]) > 0) {
1368 return FALSE;
1369 }
1370 }
1371 return TRUE;
1372 }
1373
Blast_HSPListSortByScore(BlastHSPList * hsp_list)1374 void Blast_HSPListSortByScore(BlastHSPList* hsp_list)
1375 {
1376 if (!hsp_list || hsp_list->hspcnt <= 1)
1377 return;
1378
1379 if (!Blast_HSPListIsSortedByScore(hsp_list)) {
1380 qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1381 ScoreCompareHSPs);
1382 }
1383 }
1384
1385 /** Compares 2 evalues, consider them equal if both are close enough to zero.
1386 * @param evalue1 First evalue [in]
1387 * @param evalue2 Second evalue [in]
1388 */
1389 static int
s_EvalueComp(double evalue1,double evalue2)1390 s_EvalueComp(double evalue1, double evalue2)
1391 {
1392 const double epsilon = 1.0e-180;
1393 if (evalue1 < epsilon && evalue2 < epsilon) {
1394 return 0;
1395 }
1396
1397 if (evalue1 < evalue2) {
1398 return -1;
1399 } else if (evalue1 > evalue2) {
1400 return 1;
1401 } else {
1402 return 0;
1403 }
1404 }
1405
1406 /** Comparison callback function for sorting HSPs by e-value and score, before
1407 * saving BlastHSPList in a BlastHitList. E-value has priority over score,
1408 * because lower scoring HSPs might have lower e-values, if they are linked
1409 * with sum statistics.
1410 * E-values are compared only up to a certain precision.
1411 * @param v1 Pointer to first HSP [in]
1412 * @param v2 Pointer to second HSP [in]
1413 */
1414 static int
s_EvalueCompareHSPs(const void * v1,const void * v2)1415 s_EvalueCompareHSPs(const void* v1, const void* v2)
1416 {
1417 BlastHSP* h1,* h2;
1418 int retval = 0;
1419
1420 h1 = *((BlastHSP**) v1);
1421 h2 = *((BlastHSP**) v2);
1422
1423 /* Check if one or both of these are null. Those HSPs should go to the end */
1424 if (!h1 && !h2)
1425 return 0;
1426 else if (!h1)
1427 return 1;
1428 else if (!h2)
1429 return -1;
1430
1431 if ((retval = s_EvalueComp(h1->evalue, h2->evalue)) != 0)
1432 return retval;
1433
1434 return ScoreCompareHSPs(v1, v2);
1435 }
1436
Blast_HSPListSortByEvalue(BlastHSPList * hsp_list)1437 void Blast_HSPListSortByEvalue(BlastHSPList* hsp_list)
1438 {
1439 if (!hsp_list)
1440 return;
1441
1442 if (hsp_list->hspcnt > 1) {
1443 Int4 index;
1444 BlastHSP** hsp_array = hsp_list->hsp_array;
1445 /* First check if HSP array is already sorted. */
1446 for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1447 if (s_EvalueCompareHSPs(&hsp_array[index], &hsp_array[index+1]) > 0) {
1448 break;
1449 }
1450 }
1451 /* Sort the HSP array if it is not sorted yet. */
1452 if (index < hsp_list->hspcnt - 1) {
1453 qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1454 s_EvalueCompareHSPs);
1455 }
1456 }
1457 }
1458
1459
1460 /** Retrieve the starting diagonal of an HSP
1461 * @param hsp The target HSP
1462 * @return The starting diagonal
1463 */
1464 static Int4
s_HSPStartDiag(const BlastHSP * hsp)1465 s_HSPStartDiag(const BlastHSP *hsp)
1466 {
1467 return hsp->query.offset - hsp->subject.offset;
1468 }
1469
1470 /** Retrieve the ending diagonal of an HSP
1471 * @param hsp The target HSP
1472 * @return The ending diagonal
1473 */
1474 static Int4
s_HSPEndDiag(const BlastHSP * hsp)1475 s_HSPEndDiag(const BlastHSP *hsp)
1476 {
1477 return hsp->query.end - hsp->subject.end;
1478 }
1479
1480 /** Given two hits, check if the hits can be merged and do
1481 * the merge if so. Hits must not contain traceback
1482 * @param hsp1 The first hit. If merging happens, this hit is
1483 * overwritten with the merged version [in][out]
1484 * @param hsp2 The second hit [in]
1485 * @return TRUE if a merge was performed, FALSE if not
1486 */
1487 static Boolean
s_BlastMergeTwoHSPs(BlastHSP * hsp1,BlastHSP * hsp2,Boolean allow_gap)1488 s_BlastMergeTwoHSPs(BlastHSP* hsp1, BlastHSP* hsp2, Boolean allow_gap)
1489 {
1490 ASSERT(!hsp1->gap_info || !hsp2->gap_info);
1491
1492 /* do not merge off-diagonal hsps for ungapped search */
1493 if (!allow_gap &&
1494 hsp1->subject.offset - hsp2->subject.offset -hsp1->query.offset + hsp2->query.offset)
1495 {
1496 return FALSE;
1497 }
1498
1499 if(hsp1->subject.frame != hsp2->subject.frame)
1500 return FALSE;
1501
1502 /* combine the boundaries of the two HSPs,
1503 assuming they intersect at all */
1504 if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1505 hsp2->query.offset,
1506 hsp1->subject.offset, hsp1->subject.end,
1507 hsp2->subject.offset) ||
1508 CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1509 hsp2->query.end,
1510 hsp1->subject.offset, hsp1->subject.end,
1511 hsp2->subject.end)) {
1512
1513 double score_density = (hsp1->score + hsp2->score) *(1.0) /
1514 ((hsp1->query.end - hsp1->query.offset) +
1515 (hsp2->query.end - hsp2->query.offset));
1516 hsp1->query.offset = MIN(hsp1->query.offset, hsp2->query.offset);
1517 hsp1->subject.offset = MIN(hsp1->subject.offset, hsp2->subject.offset);
1518 hsp1->query.end = MAX(hsp1->query.end, hsp2->query.end);
1519 hsp1->subject.end = MAX(hsp1->subject.end, hsp2->subject.end);
1520 if (hsp2->score > hsp1->score) {
1521 hsp1->query.gapped_start = hsp2->query.gapped_start;
1522 hsp1->subject.gapped_start = hsp2->subject.gapped_start;
1523 hsp1->score = hsp2->score;
1524 }
1525
1526 hsp1->score = MAX((int) (score_density *(hsp1->query.end - hsp1->query.offset)), hsp1->score);
1527 return TRUE;
1528 }
1529
1530 return FALSE;
1531 }
1532
1533 /** Maximal diagonal distance between HSP starting offsets, within which HSPs
1534 * from search of different chunks of subject sequence are considered for
1535 * merging.
1536 */
1537 #define OVERLAP_DIAG_CLOSE 10
1538 /********************************************************************************
1539 Functions manipulating BlastHSPList's
1540 ********************************************************************************/
1541
Blast_HSPListFree(BlastHSPList * hsp_list)1542 BlastHSPList* Blast_HSPListFree(BlastHSPList* hsp_list)
1543 {
1544 Int4 index;
1545
1546 if (!hsp_list)
1547 return hsp_list;
1548
1549 for (index = 0; index < hsp_list->hspcnt; ++index) {
1550 Blast_HSPFree(hsp_list->hsp_array[index]);
1551 }
1552 sfree(hsp_list->hsp_array);
1553
1554 sfree(hsp_list);
1555 return NULL;
1556 }
1557
Blast_HSPListNew(Int4 hsp_max)1558 BlastHSPList* Blast_HSPListNew(Int4 hsp_max)
1559 {
1560 BlastHSPList* hsp_list = (BlastHSPList*) calloc(1, sizeof(BlastHSPList));
1561 const Int4 kDefaultAllocated=100;
1562
1563 /* hsp_max is max number of HSP's allowed in an HSP list;
1564 INT4_MAX taken as infinity. */
1565 hsp_list->hsp_max = INT4_MAX;
1566 if (hsp_max > 0)
1567 hsp_list->hsp_max = hsp_max;
1568
1569 hsp_list->allocated = MIN(kDefaultAllocated, hsp_list->hsp_max);
1570
1571 hsp_list->hsp_array = (BlastHSP**)
1572 calloc(hsp_list->allocated, sizeof(BlastHSP*));
1573
1574 return hsp_list;
1575 }
1576
1577 Boolean
Blast_HSPList_IsEmpty(const BlastHSPList * hsp_list)1578 Blast_HSPList_IsEmpty(const BlastHSPList* hsp_list)
1579 {
1580 return (hsp_list && hsp_list->hspcnt == 0) ? TRUE : FALSE;
1581 }
1582
BlastHSPListDup(const BlastHSPList * hsp_list)1583 BlastHSPList* BlastHSPListDup(const BlastHSPList* hsp_list)
1584 {
1585 BlastHSPList * rv = 0;
1586
1587 if (hsp_list) {
1588 int index = 0;
1589 int num = hsp_list->hspcnt;
1590
1591 rv = malloc(sizeof(BlastHSPList));
1592 *rv = *hsp_list;
1593
1594 if (num) {
1595 rv->hsp_array = malloc(sizeof(BlastHSP*) * num);
1596
1597 for(index = 0; index < hsp_list->hspcnt; ++index) {
1598 BlastHSP * h = hsp_list->hsp_array[index];
1599 BlastHSP ** h2 = & rv->hsp_array[index];
1600
1601 if (h) {
1602 *h2 = malloc(sizeof(BlastHSP));
1603 **h2 = *h;
1604 } else {
1605 *h2 = 0;
1606 }
1607 }
1608 }
1609 }
1610
1611 return rv;
1612 }
1613
Blast_HSPListSwap(BlastHSPList * list1,BlastHSPList * list2)1614 void Blast_HSPListSwap(BlastHSPList* list1, BlastHSPList* list2)
1615 {
1616 BlastHSPList tmp;
1617
1618 tmp = *list1;
1619 *list1 = *list2;
1620 *list2 = tmp;
1621 }
1622
1623 /** This is a copy of a static function from ncbimisc.c.
1624 * Turns array into a heap with respect to a given comparison function.
1625 */
1626 static void
s_Heapify(char * base0,char * base,char * lim,char * last,size_t width,int (* compar)(const void *,const void *))1627 s_Heapify (char* base0, char* base, char* lim, char* last, size_t width, int (*compar )(const void*, const void* ))
1628 {
1629 size_t i;
1630 char ch;
1631 char* left_son,* large_son;
1632
1633 left_son = base0 + 2*(base-base0) + width;
1634 while (base <= lim) {
1635 if (left_son == last)
1636 large_son = left_son;
1637 else
1638 large_son = (*compar)(left_son, left_son+width) >= 0 ?
1639 left_son : left_son+width;
1640 if ((*compar)(base, large_son) < 0) {
1641 for (i=0; i<width; ++i) {
1642 ch = base[i];
1643 base[i] = large_son[i];
1644 large_son[i] = ch;
1645 }
1646 base = large_son;
1647 left_son = base0 + 2*(base-base0) + width;
1648 } else
1649 break;
1650 }
1651 }
1652
1653 /** Creates a heap of elements based on a comparison function.
1654 * @param b An array [in] [out]
1655 * @param nel Number of elements in b [in]
1656 * @param width The size of each element [in]
1657 * @param compar Callback to compare two heap elements [in]
1658 */
1659 static void
s_CreateHeap(void * b,size_t nel,size_t width,int (* compar)(const void *,const void *))1660 s_CreateHeap (void* b, size_t nel, size_t width,
1661 int (*compar )(const void*, const void* ))
1662 {
1663 char* base = (char*)b;
1664 size_t i;
1665 char* base0 = (char*)base,* lim,* basef;
1666
1667 if (nel < 2)
1668 return;
1669
1670 lim = &base[((nel-2)/2)*width];
1671 basef = &base[(nel-1)*width];
1672 i = nel/2;
1673 for (base = &base0[(i - 1)*width]; i > 0; base = base - width) {
1674 s_Heapify(base0, base, lim, basef, width, compar);
1675 i--;
1676 }
1677 }
1678
1679 /** Given a BlastHSPList* with a heapified HSP array, check whether the
1680 * new HSP is better than the worst scoring. If it is, then remove the
1681 * worst scoring and insert, otherwise free the new one.
1682 * HSP and insert the new HSP in the heap.
1683 * @param hsp_list Contains all HSPs for a given subject. [in] [out]
1684 * @param hsp A pointer to new HSP to be inserted into the HSP list [in] [out]
1685 */
1686 static void
s_BlastHSPListInsertHSPInHeap(BlastHSPList * hsp_list,BlastHSP ** hsp)1687 s_BlastHSPListInsertHSPInHeap(BlastHSPList* hsp_list,
1688 BlastHSP** hsp)
1689 {
1690 BlastHSP** hsp_array = hsp_list->hsp_array;
1691 if (ScoreCompareHSPs(hsp, &hsp_array[0]) > 0)
1692 {
1693 Blast_HSPFree(*hsp);
1694 return;
1695 }
1696 else
1697 Blast_HSPFree(hsp_array[0]);
1698
1699 hsp_array[0] = *hsp;
1700 if (hsp_list->hspcnt >= 2) {
1701 s_Heapify((char*)hsp_array, (char*)hsp_array,
1702 (char*)&hsp_array[hsp_list->hspcnt/2 - 1],
1703 (char*)&hsp_array[hsp_list->hspcnt-1],
1704 sizeof(BlastHSP*), ScoreCompareHSPs);
1705 }
1706 }
1707
1708 #ifndef NDEBUG
1709 /** Verifies that the best_evalue field on the BlastHSPList is correct.
1710 * @param hsp_list object to check [in]
1711 * @return TRUE if OK, FALSE otherwise.
1712 */
1713 static Boolean
s_BlastCheckBestEvalue(const BlastHSPList * hsp_list)1714 s_BlastCheckBestEvalue(const BlastHSPList* hsp_list)
1715 {
1716 int index = 0;
1717 double best_evalue = (double) INT4_MAX;
1718 const double kDelta = 1.0e-200;
1719
1720 /* There are no HSP's here. */
1721 if (hsp_list->hspcnt == 0)
1722 return TRUE;
1723
1724 for (index=0; index<hsp_list->hspcnt; index++)
1725 best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1726
1727 /* check that it's within 1%. */
1728 if (ABS(best_evalue-hsp_list->best_evalue)/(best_evalue+kDelta) > 0.01)
1729 return FALSE;
1730
1731 return TRUE;
1732 }
1733 #endif /* _DEBUG */
1734
1735 /** Gets the best (lowest) evalue from the BlastHSPList.
1736 * @param hsp_list object containing the evalues [in]
1737 * @return TRUE if OK, FALSE otherwise.
1738 */
1739 static double
s_BlastGetBestEvalue(const BlastHSPList * hsp_list)1740 s_BlastGetBestEvalue(const BlastHSPList* hsp_list)
1741 {
1742 int index = 0;
1743 double best_evalue = (double) INT4_MAX;
1744
1745 for (index=0; index<hsp_list->hspcnt; index++)
1746 best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1747
1748 return best_evalue;
1749 }
1750
1751 /* Comments in blast_hits.h
1752 */
1753 Int2
Blast_HSPListSaveHSP(BlastHSPList * hsp_list,BlastHSP * new_hsp)1754 Blast_HSPListSaveHSP(BlastHSPList* hsp_list, BlastHSP* new_hsp)
1755 {
1756 BlastHSP** hsp_array;
1757 Int4 hspcnt;
1758 Int4 hsp_allocated; /* how many hsps are in the array. */
1759 Int2 status = 0;
1760
1761 hspcnt = hsp_list->hspcnt;
1762 hsp_allocated = hsp_list->allocated;
1763 hsp_array = hsp_list->hsp_array;
1764
1765
1766 /* Check if list is already full, then reallocate. */
1767 if (hspcnt >= hsp_allocated && hsp_list->do_not_reallocate == FALSE)
1768 {
1769 Int4 new_allocated = MIN(2*hsp_list->allocated, hsp_list->hsp_max);
1770 if (new_allocated > hsp_list->allocated) {
1771 hsp_array = (BlastHSP**)
1772 realloc(hsp_array, new_allocated*sizeof(BlastHSP*));
1773 if (hsp_array == NULL)
1774 {
1775 hsp_list->do_not_reallocate = TRUE;
1776 hsp_array = hsp_list->hsp_array;
1777 /** Return a non-zero status, because restriction on number
1778 of HSPs here is a result of memory allocation failure. */
1779 status = -1;
1780 } else {
1781 hsp_list->hsp_array = hsp_array;
1782 hsp_list->allocated = new_allocated;
1783 hsp_allocated = new_allocated;
1784 }
1785 } else {
1786 hsp_list->do_not_reallocate = TRUE;
1787 }
1788 /* If it is the first time when the HSP array is filled to capacity,
1789 create a heap now. */
1790 if (hsp_list->do_not_reallocate) {
1791 s_CreateHeap(hsp_array, hspcnt, sizeof(BlastHSP*), ScoreCompareHSPs);
1792 }
1793 }
1794
1795 /* If there is space in the allocated HSP array, simply save the new HSP.
1796 Othewise, if the new HSP has lower score than the worst HSP in the heap,
1797 then delete it, else insert it in the heap. */
1798 if (hspcnt < hsp_allocated)
1799 {
1800 hsp_array[hsp_list->hspcnt] = new_hsp;
1801 (hsp_list->hspcnt)++;
1802 return status;
1803 } else {
1804 /* Insert the new HSP in heap. */
1805 s_BlastHSPListInsertHSPInHeap(hsp_list, &new_hsp);
1806 }
1807
1808 return status;
1809 }
1810
Blast_HSPListGetEvalues(EBlastProgramType program_number,const BlastQueryInfo * query_info,Int4 subject_length,BlastHSPList * hsp_list,Boolean gapped_calculation,Boolean RPS_prelim,const BlastScoreBlk * sbp,double gap_decay_rate,double scaling_factor)1811 Int2 Blast_HSPListGetEvalues(EBlastProgramType program_number,
1812 const BlastQueryInfo* query_info,
1813 Int4 subject_length,
1814 BlastHSPList* hsp_list,
1815 Boolean gapped_calculation,
1816 Boolean RPS_prelim,
1817 const BlastScoreBlk* sbp, double gap_decay_rate,
1818 double scaling_factor)
1819 {
1820 BlastHSP* hsp;
1821 BlastHSP** hsp_array;
1822 Blast_KarlinBlk** kbp;
1823 Int4 hsp_cnt;
1824 Int4 index;
1825 Int4 kbp_context;
1826 Int4 score;
1827 double gap_decay_divisor = 1.;
1828 Boolean isRPS = Blast_ProgramIsRpsBlast(program_number);
1829
1830 if (hsp_list == NULL || hsp_list->hspcnt == 0)
1831 return 0;
1832
1833 kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1834 hsp_cnt = hsp_list->hspcnt;
1835 hsp_array = hsp_list->hsp_array;
1836
1837 if (gap_decay_rate != 0.)
1838 gap_decay_divisor = BLAST_GapDecayDivisor(gap_decay_rate, 1);
1839
1840 for (index=0; index<hsp_cnt; index++) {
1841 hsp = hsp_array[index];
1842
1843 ASSERT(hsp != NULL);
1844 ASSERT(scaling_factor != 0.0);
1845 #if 0
1846 ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1847 #endif
1848 /* Divide Lambda by the scaling factor, so e-value is
1849 calculated correctly from a scaled score. This is needed only
1850 for RPS BLAST, where scores are scaled, but Lambda is not. */
1851 kbp_context = hsp->context;
1852 if (RPS_prelim) {
1853 /* All kbp in preliminary stage are equivalent. However, some
1854 may be invalid. Search for the first populated kbp */
1855 int i;
1856 for (i=0; i < sbp->number_of_contexts; ++i) {
1857 if (kbp[i]) break;
1858 }
1859 kbp_context = i;
1860 }
1861 ASSERT(kbp[kbp_context]);
1862 kbp[kbp_context]->Lambda /= scaling_factor;
1863
1864 /* Round score down to even number for E-value calculations only. */
1865 /* Added 2018/5/16 by rackerst, SB-2303 */
1866 score = hsp->score;
1867 if (hsp_list && hsp_list->hspcnt != 0
1868 && gapped_calculation && sbp->round_down) {
1869 score &= ~1;
1870 }
1871
1872 if (sbp->gbp) {
1873 /* Only try Spouge's method if gumbel parameters are available */
1874 if (!isRPS) {
1875 hsp->evalue =
1876 BLAST_SpougeStoE(score, kbp[kbp_context], sbp->gbp,
1877 query_info->contexts[hsp->context].query_length,
1878 subject_length);
1879 } else {
1880 /* for RPS blast, query and subject is swapped */
1881 hsp->evalue =
1882 BLAST_SpougeStoE(score, kbp[kbp_context], sbp->gbp,
1883 subject_length,
1884 query_info->contexts[hsp->context].query_length);
1885 }
1886 } else {
1887 /* Get effective search space from the query information block */
1888 hsp->evalue =
1889 BLAST_KarlinStoE_simple(score, kbp[kbp_context],
1890 query_info->contexts[hsp->context].eff_searchsp);
1891 }
1892
1893 hsp->evalue /= gap_decay_divisor;
1894 /* Put back the unscaled value of Lambda. */
1895 kbp[kbp_context]->Lambda *= scaling_factor;
1896 }
1897
1898 /* Assign the best e-value field. Here the best e-value will always be
1899 attained for the first HSP in the list. Check that the incoming
1900 HSP list is properly sorted by score. */
1901 ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1902 hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1903
1904 return 0;
1905 }
1906
Blast_HSPListGetBitScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)1907 Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list,
1908 Boolean gapped_calculation,
1909 const BlastScoreBlk* sbp)
1910 {
1911 BlastHSP* hsp;
1912 Blast_KarlinBlk** kbp;
1913 Int4 index;
1914
1915 if (hsp_list == NULL)
1916 return 1;
1917
1918 kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1919
1920 for (index=0; index<hsp_list->hspcnt; index++) {
1921 hsp = hsp_list->hsp_array[index];
1922 ASSERT(hsp != NULL);
1923 #if 0
1924 ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1925 #endif
1926 hsp->bit_score =
1927 (hsp->score*kbp[hsp->context]->Lambda - kbp[hsp->context]->logK) /
1928 NCBIMATH_LN2;
1929 }
1930
1931 return 0;
1932 }
1933
Blast_HSPListPHIGetBitScores(BlastHSPList * hsp_list,BlastScoreBlk * sbp)1934 void Blast_HSPListPHIGetBitScores(BlastHSPList* hsp_list, BlastScoreBlk* sbp)
1935 {
1936 Int4 index;
1937
1938 double lambda, logC;
1939
1940 ASSERT(sbp && sbp->kbp_gap && sbp->kbp_gap[0]);
1941
1942 lambda = sbp->kbp_gap[0]->Lambda;
1943 logC = log(sbp->kbp_gap[0]->paramC);
1944
1945 for (index=0; index<hsp_list->hspcnt; index++) {
1946 BlastHSP* hsp = hsp_list->hsp_array[index];
1947 ASSERT(hsp != NULL);
1948 hsp->bit_score =
1949 (hsp->score*lambda - logC - log(1.0 + hsp->score*lambda))
1950 / NCBIMATH_LN2;
1951 }
1952 }
1953
1954 void
Blast_HSPListPHIGetEvalues(BlastHSPList * hsp_list,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)1955 Blast_HSPListPHIGetEvalues(BlastHSPList* hsp_list, BlastScoreBlk* sbp,
1956 const BlastQueryInfo* query_info,
1957 const SPHIPatternSearchBlk* pattern_blk)
1958 {
1959 Int4 index;
1960 BlastHSP* hsp;
1961
1962 if (!hsp_list || hsp_list->hspcnt == 0)
1963 return;
1964
1965 for (index = 0; index < hsp_list->hspcnt; ++index) {
1966 hsp = hsp_list->hsp_array[index];
1967 s_HSPPHIGetEvalue(hsp, sbp, query_info, pattern_blk);
1968 }
1969 /* The best e-value is the one for the highest scoring HSP, which
1970 must be the first in the list. Check that HSPs are sorted by score
1971 to make sure this assumption is correct. */
1972 ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1973 hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1974 }
1975
Blast_HSPListReapByEvalue(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)1976 Int2 Blast_HSPListReapByEvalue(BlastHSPList* hsp_list,
1977 const BlastHitSavingOptions* hit_options)
1978 {
1979 BlastHSP* hsp;
1980 BlastHSP** hsp_array;
1981 Int4 hsp_cnt = 0;
1982 Int4 index;
1983 double cutoff;
1984
1985 if (hsp_list == NULL)
1986 return 0;
1987
1988 cutoff = hit_options->expect_value;
1989
1990 hsp_array = hsp_list->hsp_array;
1991 for (index = 0; index < hsp_list->hspcnt; index++) {
1992 hsp = hsp_array[index];
1993
1994 ASSERT(hsp != NULL);
1995
1996 if (hsp->evalue > cutoff) {
1997 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1998 } else {
1999 if (index > hsp_cnt)
2000 hsp_array[hsp_cnt] = hsp_array[index];
2001 hsp_cnt++;
2002 }
2003 }
2004
2005 hsp_list->hspcnt = hsp_cnt;
2006
2007 return 0;
2008 }
2009
Blast_HSPListReapByQueryCoverage(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options,const BlastQueryInfo * query_info,EBlastProgramType program_number)2010 Int2 Blast_HSPListReapByQueryCoverage(BlastHSPList* hsp_list,
2011 const BlastHitSavingOptions* hit_options,
2012 const BlastQueryInfo* query_info,
2013 EBlastProgramType program_number)
2014
2015 {
2016 BlastHSP* hsp;
2017 BlastHSP** hsp_array;
2018 Int4 hsp_cnt = 0;
2019 Int4 index;
2020
2021 if ((hsp_list == NULL) || (hsp_list->hspcnt == 0) || (hit_options->query_cov_hsp_perc == 0))
2022 return 0;
2023
2024 hsp_array = hsp_list->hsp_array;
2025 for (index = 0; index < hsp_list->hspcnt; index++) {
2026 hsp = hsp_array[index];
2027 ASSERT(hsp != NULL);
2028 if ( Blast_HSPQueryCoverageTest(hsp, hit_options->query_cov_hsp_perc,
2029 query_info->contexts[hsp->context].query_length)) {
2030 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2031 } else {
2032 if (index > hsp_cnt)
2033 hsp_array[hsp_cnt] = hsp_array[index];
2034 hsp_cnt++;
2035 }
2036 }
2037
2038 hsp_list->hspcnt = hsp_cnt;
2039
2040 return 0;
2041 }
2042
Blast_TrimHSPListByMaxHsps(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2043 Int2 Blast_TrimHSPListByMaxHsps(BlastHSPList* hsp_list,
2044 const BlastHitSavingOptions* hit_options)
2045 {
2046 BlastHSP** hsp_array;
2047 Int4 index;
2048 Int4 hsp_max;
2049
2050 if ((hsp_list == NULL) ||
2051 (hit_options->max_hsps_per_subject == 0) ||
2052 (hsp_list->hspcnt <= hit_options->max_hsps_per_subject))
2053 return 0;
2054
2055 hsp_max = hit_options->max_hsps_per_subject;
2056 hsp_array = hsp_list->hsp_array;
2057 for (index = hsp_max; index < hsp_list->hspcnt; index++) {
2058 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2059 }
2060
2061 hsp_list->hspcnt = hsp_max;
2062 return 0;
2063 }
2064
2065
2066 /** Same as Blast_HSPListReapByEvalue() except that it uses
2067 * the raw score of the hit and the HitSavingOptions->cutoff_score
2068 * to filter out hits. -RMH-
2069 */
Blast_HSPListReapByRawScore(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2070 Int2 Blast_HSPListReapByRawScore(BlastHSPList* hsp_list,
2071 const BlastHitSavingOptions* hit_options)
2072 {
2073 BlastHSP* hsp;
2074 BlastHSP** hsp_array;
2075 Int4 hsp_cnt = 0;
2076 Int4 index;
2077
2078 if (hsp_list == NULL)
2079 return 0;
2080
2081 hsp_array = hsp_list->hsp_array;
2082 for (index = 0; index < hsp_list->hspcnt; index++) {
2083 hsp = hsp_array[index];
2084
2085 ASSERT(hsp != NULL);
2086
2087 if ( hsp->score < hit_options->cutoff_score ) {
2088 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2089 } else {
2090 if (index > hsp_cnt)
2091 hsp_array[hsp_cnt] = hsp_array[index];
2092 hsp_cnt++;
2093 }
2094 }
2095
2096 hsp_list->hspcnt = hsp_cnt;
2097
2098 return 0;
2099 }
2100
2101 /** callback used to sort HSP lists in order of increasing OID
2102 * @param x First HSP list [in]
2103 * @param y Second HSP list [in]
2104 * @return compare result
2105 */
s_SortHSPListByOid(const void * x,const void * y)2106 static int s_SortHSPListByOid(const void *x, const void *y)
2107 {
2108 BlastHSPList **xx = (BlastHSPList **)x;
2109 BlastHSPList **yy = (BlastHSPList **)y;
2110 return (*xx)->oid - (*yy)->oid;
2111 }
2112
Blast_HitListMerge(BlastHitList ** old_hit_list_ptr,BlastHitList ** combined_hit_list_ptr,Int4 contexts_per_query,Int4 * split_offsets,Int4 chunk_overlap_size,Boolean allow_gap)2113 Int2 Blast_HitListMerge(BlastHitList** old_hit_list_ptr,
2114 BlastHitList** combined_hit_list_ptr,
2115 Int4 contexts_per_query, Int4 *split_offsets,
2116 Int4 chunk_overlap_size, Boolean allow_gap)
2117 {
2118 Int4 i, j;
2119 Boolean query_is_split;
2120 BlastHitList* hitlist1 = *old_hit_list_ptr;
2121 BlastHitList* hitlist2 = *combined_hit_list_ptr;
2122 BlastHitList* new_hitlist;
2123 Int4 num_hsplists1;
2124 Int4 num_hsplists2;
2125
2126 if (hitlist1 == NULL)
2127 return 0;
2128 if (hitlist2 == NULL) {
2129 *combined_hit_list_ptr = hitlist1;
2130 *old_hit_list_ptr = NULL;
2131 return 0;
2132 }
2133 num_hsplists1 = hitlist1->hsplist_count;
2134 num_hsplists2 = hitlist2->hsplist_count;
2135 new_hitlist = Blast_HitListNew(hitlist1->hsplist_max);
2136
2137 /* sort the lists of HSPs by oid */
2138
2139 if (num_hsplists1 > 1) {
2140 qsort(hitlist1->hsplist_array, num_hsplists1,
2141 sizeof(BlastHSPList*), s_SortHSPListByOid);
2142 }
2143 if (num_hsplists2 > 1) {
2144 qsort(hitlist2->hsplist_array, num_hsplists2,
2145 sizeof(BlastHSPList*), s_SortHSPListByOid);
2146 }
2147
2148 /* find out if the two hitlists contain hits for a single
2149 (split) query sequence */
2150
2151 query_is_split = FALSE;
2152 for (i = 0; i < contexts_per_query; i++) {
2153 if (split_offsets[i] > 0) {
2154 query_is_split = TRUE;
2155 break;
2156 }
2157 }
2158 ASSERT(chunk_overlap_size != 0);
2159
2160 /* merge the HSPlists of the two HitLists */
2161
2162 i = j = 0;
2163 while (i < num_hsplists1 && j < num_hsplists2) {
2164 BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2165 BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2166
2167 if (hsplist1->oid < hsplist2->oid) {
2168 Blast_HitListUpdate(new_hitlist, hsplist1);
2169 i++;
2170 }
2171 else if (hsplist1->oid > hsplist2->oid) {
2172 Blast_HitListUpdate(new_hitlist, hsplist2);
2173 j++;
2174 }
2175 else {
2176 /* the old and new Hitlists contain hits to the same
2177 DB sequence, and these must be merged. */
2178
2179 if (query_is_split) {
2180 Blast_HSPListsMerge(hitlist1->hsplist_array + i,
2181 hitlist2->hsplist_array + j,
2182 hsplist2->hsp_max, split_offsets,
2183 contexts_per_query,
2184 chunk_overlap_size,
2185 allow_gap, FALSE);
2186 }
2187 else {
2188 Blast_HSPListAppend(hitlist1->hsplist_array + i,
2189 hitlist2->hsplist_array + j,
2190 hsplist2->hsp_max);
2191 }
2192 Blast_HitListUpdate(new_hitlist, hitlist2->hsplist_array[j]);
2193 i++;
2194 j++;
2195 }
2196 }
2197 for (; i < num_hsplists1; i++) {
2198 BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2199 Blast_HitListUpdate(new_hitlist, hsplist1);
2200 }
2201 for (; j < num_hsplists2; j++) {
2202 BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2203 Blast_HitListUpdate(new_hitlist, hsplist2);
2204 }
2205 hitlist1->hsplist_count = 0;
2206 Blast_HitListFree(hitlist1);
2207 hitlist2->hsplist_count = 0;
2208 Blast_HitListFree(hitlist2);
2209
2210 *old_hit_list_ptr = NULL;
2211 *combined_hit_list_ptr = new_hitlist;
2212 return 0;
2213 }
2214
2215
2216 /* See description in blast_hits.h */
2217
2218 Int2
Blast_HSPListPurgeNullHSPs(BlastHSPList * hsp_list)2219 Blast_HSPListPurgeNullHSPs(BlastHSPList* hsp_list)
2220
2221 {
2222 Int4 index, index1; /* loop indices. */
2223 Int4 hspcnt; /* total number of HSP's to iterate over. */
2224 BlastHSP** hsp_array; /* hsp_array to purge. */
2225
2226 if (hsp_list == NULL || hsp_list->hspcnt == 0)
2227 return 0;
2228
2229 hsp_array = hsp_list->hsp_array;
2230 hspcnt = hsp_list->hspcnt;
2231
2232 index1 = 0;
2233 for (index=0; index<hspcnt; index++)
2234 {
2235 if (hsp_array[index] != NULL)
2236 {
2237 hsp_array[index1] = hsp_array[index];
2238 index1++;
2239 }
2240 }
2241
2242 for (index=index1; index<hspcnt; index++)
2243 {
2244 hsp_array[index] = NULL;
2245 }
2246
2247 hsp_list->hspcnt = index1;
2248
2249 return 0;
2250 }
2251
2252 /** Callback for sorting HSPs by starting offset in query. Sorting is by
2253 * increasing context, then increasing query start offset, then increasing
2254 * subject start offset, then decreasing score, then increasing query end
2255 * offset, then increasing subject end offset. Null HSPs are moved to the
2256 * end of the array.
2257 * @param v1 pointer to first HSP [in]
2258 * @param v2 pointer to second HSP [in]
2259 * @return Result of comparison.
2260 */
2261 static int
s_QueryOffsetCompareHSPs(const void * v1,const void * v2)2262 s_QueryOffsetCompareHSPs(const void* v1, const void* v2)
2263 {
2264 BlastHSP* h1,* h2;
2265 BlastHSP** hp1,** hp2;
2266
2267 hp1 = (BlastHSP**) v1;
2268 hp2 = (BlastHSP**) v2;
2269 h1 = *hp1;
2270 h2 = *hp2;
2271
2272 if (!h1 && !h2)
2273 return 0;
2274 else if (!h1)
2275 return 1;
2276 else if (!h2)
2277 return -1;
2278
2279 /* If these are from different contexts, don't compare offsets */
2280 if (h1->context < h2->context)
2281 return -1;
2282 if (h1->context > h2->context)
2283 return 1;
2284
2285 if (h1->query.offset < h2->query.offset)
2286 return -1;
2287 if (h1->query.offset > h2->query.offset)
2288 return 1;
2289
2290 if (h1->subject.offset < h2->subject.offset)
2291 return -1;
2292 if (h1->subject.offset > h2->subject.offset)
2293 return 1;
2294
2295 /* tie breakers: sort by decreasing score, then
2296 by increasing size of query range, then by
2297 increasing subject range. */
2298
2299 if (h1->score < h2->score)
2300 return 1;
2301 if (h1->score > h2->score)
2302 return -1;
2303
2304 if (h1->query.end < h2->query.end)
2305 return 1;
2306 if (h1->query.end > h2->query.end)
2307 return -1;
2308
2309 if (h1->subject.end < h2->subject.end)
2310 return 1;
2311 if (h1->subject.end > h2->subject.end)
2312 return -1;
2313
2314 return 0;
2315 }
2316
2317 /** Callback for sorting HSPs by ending offset in query. Sorting is by
2318 * increasing context, then increasing query end offset, then increasing
2319 * subject end offset, then decreasing score, then decreasing query start
2320 * offset, then decreasing subject start offset. Null HSPs are moved to the
2321 * end of the array.
2322 * @param v1 pointer to first HSP [in]
2323 * @param v2 pointer to second HSP [in]
2324 * @return Result of comparison.
2325 */
2326 static int
s_QueryEndCompareHSPs(const void * v1,const void * v2)2327 s_QueryEndCompareHSPs(const void* v1, const void* v2)
2328 {
2329 BlastHSP* h1,* h2;
2330 BlastHSP** hp1,** hp2;
2331
2332 hp1 = (BlastHSP**) v1;
2333 hp2 = (BlastHSP**) v2;
2334 h1 = *hp1;
2335 h2 = *hp2;
2336
2337 if (!h1 && !h2)
2338 return 0;
2339 else if (!h1)
2340 return 1;
2341 else if (!h2)
2342 return -1;
2343
2344 /* If these are from different contexts, don't compare offsets */
2345 if (h1->context < h2->context)
2346 return -1;
2347 if (h1->context > h2->context)
2348 return 1;
2349
2350 if (h1->query.end < h2->query.end)
2351 return -1;
2352 if (h1->query.end > h2->query.end)
2353 return 1;
2354
2355 if (h1->subject.end < h2->subject.end)
2356 return -1;
2357 if (h1->subject.end > h2->subject.end)
2358 return 1;
2359
2360 /* tie breakers: sort by decreasing score, then
2361 by increasing size of query range, then by
2362 increasing size of subject range. The shortest range
2363 means the *largest* sequence offset must come
2364 first */
2365 if (h1->score < h2->score)
2366 return 1;
2367 if (h1->score > h2->score)
2368 return -1;
2369
2370 if (h1->query.offset < h2->query.offset)
2371 return 1;
2372 if (h1->query.offset > h2->query.offset)
2373 return -1;
2374
2375 if (h1->subject.offset < h2->subject.offset)
2376 return 1;
2377 if (h1->subject.offset > h2->subject.offset)
2378 return -1;
2379
2380 return 0;
2381 }
2382
2383
2384 /* cut off the GapEditScript according to hsp offset and end */
2385 static void
s_CutOffGapEditScript(BlastHSP * hsp,Int4 q_cut,Int4 s_cut,Boolean cut_begin)2386 s_CutOffGapEditScript(BlastHSP* hsp, Int4 q_cut, Int4 s_cut, Boolean cut_begin)
2387 {
2388 int index, opid, qid, sid;
2389 Boolean found = FALSE;
2390 GapEditScript *esp = hsp->gap_info;
2391 qid = 0;
2392 sid = 0;
2393 opid = 0;
2394 q_cut -= hsp->query.offset;
2395 s_cut -= hsp->subject.offset;
2396 for (index=0; index < esp->size; index++) {
2397 for(opid=0; opid < esp->num[index];){
2398 if (esp->op_type[index] == eGapAlignSub) {
2399 qid++;
2400 sid++;
2401 opid++;
2402 } else if (esp->op_type[index] == eGapAlignDel) {
2403 sid+=esp->num[index];
2404 opid+=esp->num[index];
2405 } else if (esp->op_type[index] == eGapAlignIns) {
2406 qid+=esp->num[index];
2407 opid+=esp->num[index];
2408 }
2409 if (qid >= q_cut && sid >= s_cut) found = TRUE;
2410 if (found) break;
2411 }
2412 if (found) break;
2413 }
2414
2415 // RMH: Unless both cut sites where found the following
2416 // block would access memory outside the GapEditScript
2417 // array.
2418 if ( found )
2419 {
2420 if (cut_begin) {
2421 int new_index = 0;
2422 if (opid < esp->num[index]) {
2423 ASSERT(esp->op_type[index] == eGapAlignSub);
2424 esp->op_type[0] = esp->op_type[index];
2425 esp->num[0] = esp->num[index] - opid;
2426 new_index++;
2427 }
2428 ++index;
2429 for (; index < esp->size; index++, new_index++) {
2430 esp->op_type[new_index] = esp->op_type[index];
2431 esp->num[new_index] = esp->num[index];
2432 }
2433 esp->size = new_index;
2434 hsp->query.offset += qid;
2435 hsp->subject.offset += sid;
2436 } else {
2437 if (opid < esp->num[index]) {
2438 ASSERT(esp->op_type[index] == eGapAlignSub);
2439 esp->num[index] = opid;
2440 }
2441 esp->size = index+1;
2442 hsp->query.end = hsp->query.offset + qid;
2443 hsp->subject.end = hsp->subject.offset + sid;
2444 }
2445 }
2446 }
2447
2448 Int4
Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,BlastHSPList * hsp_list,Boolean purge)2449 Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,
2450 BlastHSPList* hsp_list,
2451 Boolean purge)
2452
2453 {
2454 BlastHSP** hsp_array; /* hsp_array to purge. */
2455 BlastHSP* hsp;
2456 Int4 i, j, k;
2457 Int4 hsp_count;
2458 purge |= (program != eBlastTypeBlastn);
2459
2460 /* If HSP list is empty, return immediately. */
2461 if (hsp_list == NULL || hsp_list->hspcnt == 0)
2462 return 0;
2463
2464 /* Do nothing for PHI BLAST, because HSPs corresponding to different pattern
2465 occurrences may have common end points, but should all be kept. */
2466 if (Blast_ProgramIsPhiBlast(program))
2467 return hsp_list->hspcnt;
2468
2469 hsp_array = hsp_list->hsp_array;
2470 hsp_count = hsp_list->hspcnt;
2471
2472 qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryOffsetCompareHSPs);
2473 i = 0;
2474 while (i < hsp_count) {
2475 j = 1;
2476 while (i+j < hsp_count &&
2477 hsp_array[i] && hsp_array[i+j] &&
2478 hsp_array[i]->context == hsp_array[i+j]->context &&
2479 hsp_array[i]->query.offset == hsp_array[i+j]->query.offset &&
2480 hsp_array[i]->subject.offset == hsp_array[i+j]->subject.offset) {
2481 hsp_count--;
2482 hsp = hsp_array[i+j];
2483 if (!purge && (hsp->query.end > hsp_array[i]->query.end)) {
2484 s_CutOffGapEditScript(hsp, hsp_array[i]->query.end,
2485 hsp_array[i]->subject.end, TRUE);
2486 } else {
2487 hsp = Blast_HSPFree(hsp);
2488 }
2489 for (k=i+j; k<hsp_count; k++) {
2490 hsp_array[k] = hsp_array[k+1];
2491 }
2492 hsp_array[hsp_count] = hsp;
2493 }
2494 i += j;
2495 }
2496
2497 qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryEndCompareHSPs);
2498 i = 0;
2499 while (i < hsp_count) {
2500 j = 1;
2501 while (i+j < hsp_count &&
2502 hsp_array[i] && hsp_array[i+j] &&
2503 hsp_array[i]->context == hsp_array[i+j]->context &&
2504 hsp_array[i]->query.end == hsp_array[i+j]->query.end &&
2505 hsp_array[i]->subject.end == hsp_array[i+j]->subject.end) {
2506 hsp_count--;
2507 hsp = hsp_array[i+j];
2508 if (!purge && (hsp->query.offset < hsp_array[i]->query.offset)) {
2509 s_CutOffGapEditScript(hsp, hsp_array[i]->query.offset,
2510 hsp_array[i]->subject.offset, FALSE);
2511 } else {
2512 hsp = Blast_HSPFree(hsp);
2513 }
2514 for (k=i+j; k<hsp_count; k++) {
2515 hsp_array[k] = hsp_array[k+1];
2516 }
2517 hsp_array[hsp_count] = hsp;
2518 }
2519 i += j;
2520 }
2521
2522 if (purge) {
2523 Blast_HSPListPurgeNullHSPs(hsp_list);
2524 }
2525
2526 return hsp_count;
2527 }
2528
2529 Int4
Blast_HSPListSubjectBestHit(EBlastProgramType program,const BlastHSPSubjectBestHitOptions * subject_besthit_opts,const BlastQueryInfo * query_info,BlastHSPList * hsp_list)2530 Blast_HSPListSubjectBestHit(EBlastProgramType program,
2531 const BlastHSPSubjectBestHitOptions* subject_besthit_opts,
2532 const BlastQueryInfo *query_info,
2533 BlastHSPList* hsp_list)
2534 {
2535 BlastHSP** hsp_array; /* hsp_array to purge. */
2536 const int range_diff = subject_besthit_opts->max_range_diff;
2537 Boolean isBlastn = (program == eBlastTypeBlastn);
2538 unsigned int i, j;
2539 int o, e;
2540 int curr_context, target_context;
2541 int qlen;
2542
2543 /* If HSP list is empty, return immediately. */
2544 if (hsp_list == NULL || hsp_list->hspcnt == 0)
2545 return 0;
2546
2547 if (Blast_ProgramIsPhiBlast(program))
2548 return hsp_list->hspcnt;
2549
2550 hsp_array = hsp_list->hsp_array;
2551
2552 // The hsp list is sorted by score
2553 for(i=0; i < hsp_list->hspcnt -1; i++) {
2554 if(hsp_array[i] == NULL){
2555 continue;
2556 }
2557 j = 1;
2558 o = hsp_array[i]->query.offset - range_diff;
2559 e = hsp_array[i]->query.end + range_diff;
2560 if (o < 0) o = 0;
2561 if (e < 0) e = hsp_array[i]->query.end;
2562 while (i+j < hsp_list->hspcnt) {
2563 if (hsp_array[i+j] && hsp_array[i]->context == hsp_array[i+j]->context &&
2564 ((hsp_array[i+j]->query.offset >= o) &&
2565 (hsp_array[i+j]->query.end <= e))){
2566 hsp_array[i+j] = Blast_HSPFree(hsp_array[i+j]);
2567 }
2568 j++;
2569 }
2570 }
2571
2572 Blast_HSPListPurgeNullHSPs(hsp_list);
2573
2574 if(isBlastn) {
2575 for(i=0; i < hsp_list->hspcnt -1; i++) {
2576 if(hsp_array[i] == NULL){
2577 continue;
2578 }
2579 // Flip query offsets of current hsp to target context frame
2580 j = 1;
2581 curr_context = hsp_array[i]->context;
2582 qlen = query_info->contexts[curr_context].query_length;
2583 target_context = (hsp_array[i]->query.frame > 0) ? curr_context +1 : curr_context -1;
2584 e = qlen - (hsp_array[i]->query.offset - range_diff);
2585 o = qlen - (hsp_array[i]->query.end + range_diff);
2586 while (i+j < hsp_list->hspcnt) {
2587 if(hsp_array[i+j] && (hsp_array[i+j]->context == target_context) &&
2588 ((hsp_array[i+j]->query.offset >= o) &&
2589 (hsp_array[i+j]->query.end <= e))){
2590 hsp_array[i+j] = Blast_HSPFree(hsp_array[i+j]);
2591 }
2592 j++;
2593 }
2594 }
2595 Blast_HSPListPurgeNullHSPs(hsp_list);
2596 }
2597 return hsp_list->hspcnt;
2598 }
2599
2600 Int2
Blast_HSPListReevaluateUngapped(EBlastProgramType program,BlastHSPList * hsp_list,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,const BlastInitialWordParameters * word_params,const BlastHitSavingParameters * hit_params,const BlastQueryInfo * query_info,BlastScoreBlk * sbp,const BlastScoringParameters * score_params,const BlastSeqSrc * seq_src,const Uint1 * gen_code_string)2601 Blast_HSPListReevaluateUngapped(EBlastProgramType program,
2602 BlastHSPList* hsp_list, BLAST_SequenceBlk* query_blk,
2603 BLAST_SequenceBlk* subject_blk,
2604 const BlastInitialWordParameters* word_params,
2605 const BlastHitSavingParameters* hit_params, const BlastQueryInfo* query_info,
2606 BlastScoreBlk* sbp, const BlastScoringParameters* score_params,
2607 const BlastSeqSrc* seq_src, const Uint1* gen_code_string)
2608 {
2609 BlastHSP** hsp_array,* hsp;
2610 const Uint1* subject_start = NULL;
2611 Uint1* query_start;
2612 Int4 index, context, hspcnt;
2613 Boolean purge;
2614 Int2 status = 0;
2615 const Boolean kTranslateSubject = Blast_SubjectIsTranslated(program);
2616 const Boolean kNucleotideSubject = Blast_SubjectIsNucleotide(program);
2617 SBlastTargetTranslation* target_t = NULL;
2618
2619 ASSERT(!score_params->options->gapped_calculation);
2620
2621 if (!hsp_list)
2622 return status;
2623
2624 hspcnt = hsp_list->hspcnt;
2625 hsp_array = hsp_list->hsp_array;
2626
2627 if (hsp_list->hspcnt == 0)
2628 /* All HSPs have been deleted */
2629 return status;
2630
2631 /* Retrieve the unpacked subject sequence and save it in the
2632 sequence_start element of the subject structure.
2633 NB: for the BLAST 2 Sequences case, the uncompressed sequence must
2634 already be there */
2635 if (seq_src) {
2636 /* Wrap subject sequence block into a BlastSeqSrcGetSeqArg structure, which is
2637 needed by the BlastSeqSrc API. */
2638 /* If this was a protein subject, leave as is. */
2639 if (kNucleotideSubject)
2640 {
2641 BlastSeqSrcGetSeqArg seq_arg;
2642 memset((void*) &seq_arg, 0, sizeof(seq_arg));
2643 seq_arg.oid = subject_blk->oid;
2644 seq_arg.encoding =
2645 (kTranslateSubject ? eBlastEncodingNcbi4na : eBlastEncodingNucleotide);
2646 seq_arg.check_oid_exclusion = TRUE;
2647 seq_arg.seq = subject_blk;
2648 /* Return the packed sequence to the database */
2649 BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
2650 /* Get the unpacked sequence */
2651 if (( BLAST_SEQSRC_EXCLUDED == BlastSeqSrcGetSequence(seq_src, &seq_arg))) {
2652 for (index = 0; index < hspcnt; ++index) {
2653 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2654 }
2655 Blast_HSPListPurgeNullHSPs(hsp_list);
2656 return 0;
2657 }
2658 else if (status < 0){
2659 return status;
2660 }
2661 }
2662 }
2663
2664 if (kTranslateSubject) {
2665 if (!gen_code_string)
2666 return -1;
2667 /* Get the translation buffer */
2668 BlastTargetTranslationNew(subject_blk, gen_code_string, program,
2669 score_params->options->is_ooframe, &target_t);
2670 } else {
2671 /* Store sequence in blastna encoding in sequence_start */
2672 if (subject_blk->sequence_start)
2673 subject_start = subject_blk->sequence_start + 1;
2674 else
2675 subject_start = subject_blk->sequence;
2676 }
2677
2678 purge = FALSE;
2679 for (index = 0; index < hspcnt; ++index) {
2680 Boolean delete_hsp = FALSE;
2681 if (hsp_array[index] == NULL)
2682 continue;
2683 else
2684 hsp = hsp_array[index];
2685
2686 context = hsp->context;
2687
2688 query_start = query_blk->sequence +
2689 query_info->contexts[context].query_offset;
2690
2691 if (kTranslateSubject)
2692 subject_start = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
2693
2694 if (kNucleotideSubject) {
2695 delete_hsp =
2696 Blast_HSPReevaluateWithAmbiguitiesUngapped(hsp, query_start,
2697 subject_start, word_params, sbp, kTranslateSubject);
2698 }
2699
2700 if (!delete_hsp) {
2701 const Uint1* query_nomask = query_blk->sequence_nomask +
2702 query_info->contexts[context].query_offset;
2703 Int4 align_length = 0;
2704 Blast_HSPGetNumIdentitiesAndPositives(query_nomask,
2705 subject_start,
2706 hsp,
2707 score_params->options,
2708 &align_length,
2709 sbp);
2710
2711 delete_hsp = Blast_HSPTest(hsp, hit_params->options, align_length);
2712 }
2713 if (delete_hsp) { /* This HSP is now below the cutoff */
2714 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2715 purge = TRUE;
2716 }
2717 }
2718
2719 if (target_t)
2720 target_t = BlastTargetTranslationFree(target_t);
2721
2722 if (purge)
2723 Blast_HSPListPurgeNullHSPs(hsp_list);
2724
2725 /* Sort the HSP array by score (scores may have changed!) */
2726 Blast_HSPListSortByScore(hsp_list);
2727 Blast_HSPListAdjustOddBlastnScores(hsp_list, FALSE, sbp);
2728 return 0;
2729 }
2730
2731 /** Combine two HSP lists, without altering the individual HSPs, and without
2732 * reallocating the HSP array.
2733 * @param hsp_list New HSP list [in]
2734 * @param combined_hsp_list Old HSP list, to which new HSPs are added [in] [out]
2735 * @param new_hspcnt How many HSPs to save in the combined list? The extra ones
2736 * are freed. The best scoring HSPs are saved. This argument
2737 * cannot be greater than the allocated size of the
2738 * combined list's HSP array. [in]
2739 */
2740 static void
s_BlastHSPListsCombineByScore(BlastHSPList * hsp_list,BlastHSPList * combined_hsp_list,Int4 new_hspcnt)2741 s_BlastHSPListsCombineByScore(BlastHSPList* hsp_list,
2742 BlastHSPList* combined_hsp_list,
2743 Int4 new_hspcnt)
2744 {
2745 Int4 index, index1, index2;
2746 BlastHSP** new_hsp_array;
2747
2748 ASSERT(new_hspcnt <= combined_hsp_list->allocated);
2749
2750 if (new_hspcnt >= hsp_list->hspcnt + combined_hsp_list->hspcnt) {
2751 /* All HSPs from both arrays are saved */
2752 for (index=combined_hsp_list->hspcnt, index1=0;
2753 index1<hsp_list->hspcnt; index1++) {
2754 if (hsp_list->hsp_array[index1] != NULL)
2755 combined_hsp_list->hsp_array[index++] = hsp_list->hsp_array[index1];
2756 }
2757 combined_hsp_list->hspcnt = new_hspcnt;
2758 Blast_HSPListSortByScore(combined_hsp_list);
2759 } else {
2760 /* Not all HSPs are be saved; sort both arrays by score and save only
2761 the new_hspcnt best ones.
2762 For the merged set of HSPs, allocate array the same size as in the
2763 old HSP list. */
2764 new_hsp_array = (BlastHSP**)
2765 malloc(combined_hsp_list->allocated*sizeof(BlastHSP*));
2766
2767 Blast_HSPListSortByScore(combined_hsp_list);
2768 Blast_HSPListSortByScore(hsp_list);
2769 index1 = index2 = 0;
2770 for (index = 0; index < new_hspcnt; ++index) {
2771 if (index1 < combined_hsp_list->hspcnt &&
2772 (index2 >= hsp_list->hspcnt ||
2773 ScoreCompareHSPs(&combined_hsp_list->hsp_array[index1],
2774 &hsp_list->hsp_array[index2]) <= 0)) {
2775 new_hsp_array[index] = combined_hsp_list->hsp_array[index1];
2776 ++index1;
2777 } else {
2778 new_hsp_array[index] = hsp_list->hsp_array[index2];
2779 ++index2;
2780 }
2781 }
2782 /* Free the extra HSPs that could not be saved */
2783 for ( ; index1 < combined_hsp_list->hspcnt; ++index1) {
2784 combined_hsp_list->hsp_array[index1] =
2785 Blast_HSPFree(combined_hsp_list->hsp_array[index1]);
2786 }
2787 for ( ; index2 < hsp_list->hspcnt; ++index2) {
2788 hsp_list->hsp_array[index2] =
2789 Blast_HSPFree(hsp_list->hsp_array[index2]);
2790 }
2791 /* Point combined_hsp_list's HSP array to the new one */
2792 sfree(combined_hsp_list->hsp_array);
2793 combined_hsp_list->hsp_array = new_hsp_array;
2794 combined_hsp_list->hspcnt = new_hspcnt;
2795 }
2796
2797 /* Second HSP list now does not own any HSPs */
2798 hsp_list->hspcnt = 0;
2799 }
2800
Blast_HSPListAppend(BlastHSPList ** old_hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max)2801 Int2 Blast_HSPListAppend(BlastHSPList** old_hsp_list_ptr,
2802 BlastHSPList** combined_hsp_list_ptr, Int4 hsp_num_max)
2803 {
2804 BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2805 BlastHSPList* hsp_list = *old_hsp_list_ptr;
2806 Int4 new_hspcnt;
2807
2808 if (!hsp_list || hsp_list->hspcnt == 0)
2809 return 0;
2810
2811 /* If no previous HSP list, return a pointer to the old one */
2812 if (!combined_hsp_list) {
2813 *combined_hsp_list_ptr = hsp_list;
2814 *old_hsp_list_ptr = NULL;
2815 return 0;
2816 }
2817
2818 /* Just append new list to the end of the old list, in case of
2819 multiple frames of the subject sequence */
2820 new_hspcnt = MIN(combined_hsp_list->hspcnt + hsp_list->hspcnt,
2821 hsp_num_max);
2822 if (new_hspcnt > combined_hsp_list->allocated &&
2823 !combined_hsp_list->do_not_reallocate) {
2824 Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
2825 BlastHSP** new_hsp_array;
2826 new_hsp_array = (BlastHSP**)
2827 realloc(combined_hsp_list->hsp_array,
2828 new_allocated*sizeof(BlastHSP*));
2829
2830 if (new_hsp_array) {
2831 combined_hsp_list->allocated = new_allocated;
2832 combined_hsp_list->hsp_array = new_hsp_array;
2833 } else {
2834 combined_hsp_list->do_not_reallocate = TRUE;
2835 new_hspcnt = combined_hsp_list->allocated;
2836 }
2837 }
2838 if (combined_hsp_list->allocated == hsp_num_max)
2839 combined_hsp_list->do_not_reallocate = TRUE;
2840
2841 s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
2842
2843 hsp_list = Blast_HSPListFree(hsp_list);
2844 *old_hsp_list_ptr = NULL;
2845
2846 return 0;
2847 }
2848
Blast_HSPListsMerge(BlastHSPList ** hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max,Int4 * split_offsets,Int4 contexts_per_query,Int4 chunk_overlap_size,Boolean allow_gap,Boolean short_reads)2849 Int2 Blast_HSPListsMerge(BlastHSPList** hsp_list_ptr,
2850 BlastHSPList** combined_hsp_list_ptr,
2851 Int4 hsp_num_max, Int4 *split_offsets,
2852 Int4 contexts_per_query, Int4 chunk_overlap_size,
2853 Boolean allow_gap, Boolean short_reads)
2854 {
2855 BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2856 BlastHSPList* hsp_list = *hsp_list_ptr;
2857 BlastHSP* hsp1, *hsp2, *hsp_var;
2858 BlastHSP** hspp1,** hspp2;
2859 Int4 index1, index2;
2860 Int4 hspcnt1, hspcnt2, new_hspcnt = 0;
2861 Int4 start_diag, end_diag;
2862 Int4 offset_idx;
2863 BlastHSP** new_hsp_array;
2864
2865 if (!hsp_list || hsp_list->hspcnt == 0)
2866 return 0;
2867
2868 /* If no previous HSP list, just return a copy of the new one. */
2869 if (!combined_hsp_list) {
2870 *combined_hsp_list_ptr = hsp_list;
2871 *hsp_list_ptr = NULL;
2872 return 0;
2873 }
2874
2875 /* Merge the two HSP lists for successive chunks of the subject sequence.
2876 First put all HSPs that intersect the overlap region at the front of
2877 the respective HSP arrays. Note that if the query sequence is
2878 assumed split, each context of the query sequence can have a
2879 different split point */
2880 hspcnt1 = hspcnt2 = 0;
2881
2882 if (contexts_per_query < 0) { /* subject seq is split */
2883 for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2884 hsp1 = combined_hsp_list->hsp_array[index1];
2885 if (hsp1->subject.end > split_offsets[0]) {
2886 /* At least part of this HSP lies in the overlap strip. */
2887 hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2888 combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2889 combined_hsp_list->hsp_array[index1] = hsp_var;
2890 ++hspcnt1;
2891 }
2892 }
2893 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2894 hsp2 = hsp_list->hsp_array[index2];
2895 if (hsp2->subject.offset < split_offsets[0] + chunk_overlap_size) {
2896 /* At least part of this HSP lies in the overlap strip. */
2897 hsp_var = hsp_list->hsp_array[hspcnt2];
2898 hsp_list->hsp_array[hspcnt2] = hsp2;
2899 hsp_list->hsp_array[index2] = hsp_var;
2900 ++hspcnt2;
2901 }
2902 }
2903 }
2904 else { /* query seq is split */
2905
2906 /* An HSP can be a candidate for merging if it lies in the
2907 overlap region. Whether this is true depends on whether the
2908 HSP starts to the left of the split point, or ends to the
2909 right of the overlap region. A complication is that 'left'
2910 and 'right' have opposite meaning when the HSP is on the
2911 minus strand of the query sequence */
2912
2913 for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2914 hsp1 = combined_hsp_list->hsp_array[index1];
2915 offset_idx = hsp1->context % contexts_per_query;
2916 if (split_offsets[offset_idx] < 0) continue;
2917 if ((hsp1->query.frame >= 0 && hsp1->query.end >
2918 split_offsets[offset_idx]) ||
2919 (hsp1->query.frame < 0 && hsp1->query.offset <
2920 split_offsets[offset_idx] + chunk_overlap_size)) {
2921 /* At least part of this HSP lies in the overlap strip. */
2922 hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2923 combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2924 combined_hsp_list->hsp_array[index1] = hsp_var;
2925 ++hspcnt1;
2926 }
2927 }
2928 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2929 hsp2 = hsp_list->hsp_array[index2];
2930 offset_idx = hsp2->context % contexts_per_query;
2931 if (split_offsets[offset_idx] < 0) continue;
2932 if ((hsp2->query.frame < 0 && hsp2->query.end >
2933 split_offsets[offset_idx]) ||
2934 (hsp2->query.frame >= 0 && hsp2->query.offset <
2935 split_offsets[offset_idx] + chunk_overlap_size)) {
2936 /* At least part of this HSP lies in the overlap strip. */
2937 hsp_var = hsp_list->hsp_array[hspcnt2];
2938 hsp_list->hsp_array[hspcnt2] = hsp2;
2939 hsp_list->hsp_array[index2] = hsp_var;
2940 ++hspcnt2;
2941 }
2942 }
2943 }
2944
2945 /* the merge process is independent of whether merging happens
2946 between query chunks or subject chunks */
2947
2948 if (hspcnt1 > 0 && hspcnt2 > 0) {
2949 hspp1 = combined_hsp_list->hsp_array;
2950 hspp2 = hsp_list->hsp_array;
2951
2952 for (index1 = 0; index1 < hspcnt1; index1++) {
2953
2954 hsp1 = hspp1[index1];
2955
2956 for (index2 = 0; index2 < hspcnt2; index2++) {
2957
2958 hsp2 = hspp2[index2];
2959
2960 /* Skip already deleted HSPs, or HSPs from different contexts */
2961 if (!hsp2 || hsp1->context != hsp2->context)
2962 continue;
2963
2964 /* Short read qureies are shorter than the overlap region and may
2965 already have a traceback */
2966 if (short_reads) {
2967 hspp2[index2] = Blast_HSPFree(hsp2);
2968 continue;
2969 }
2970
2971 /* we have to determine the starting diagonal of one HSP
2972 and the ending diagonal of the other */
2973
2974 if (contexts_per_query < 0 || hsp1->query.frame >= 0) {
2975 end_diag = s_HSPEndDiag(hsp1);
2976 start_diag = s_HSPStartDiag(hsp2);
2977 }
2978 else {
2979 end_diag = s_HSPEndDiag(hsp2);
2980 start_diag = s_HSPStartDiag(hsp1);
2981 }
2982
2983 if (ABS(end_diag - start_diag) < OVERLAP_DIAG_CLOSE) {
2984 if (s_BlastMergeTwoHSPs(hsp1, hsp2, allow_gap)) {
2985 /* Free the second HSP. */
2986 hspp2[index2] = Blast_HSPFree(hsp2);
2987 }
2988 }
2989 }
2990 }
2991
2992 /* Purge the nulled out HSPs from the new HSP list */
2993 Blast_HSPListPurgeNullHSPs(hsp_list);
2994 }
2995
2996 /* The new number of HSPs is now the sum of the remaining counts in the
2997 two lists, but if there is a restriction on the number of HSPs to keep,
2998 it might have to be reduced. */
2999 new_hspcnt =
3000 MIN(hsp_list->hspcnt + combined_hsp_list->hspcnt, hsp_num_max);
3001
3002 if (new_hspcnt >= combined_hsp_list->allocated-1 &&
3003 combined_hsp_list->do_not_reallocate == FALSE) {
3004 Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
3005 if (new_allocated > combined_hsp_list->allocated) {
3006 new_hsp_array = (BlastHSP**)
3007 realloc(combined_hsp_list->hsp_array,
3008 new_allocated*sizeof(BlastHSP*));
3009 if (new_hsp_array == NULL) {
3010 combined_hsp_list->do_not_reallocate = TRUE;
3011 } else {
3012 combined_hsp_list->hsp_array = new_hsp_array;
3013 combined_hsp_list->allocated = new_allocated;
3014 }
3015 } else {
3016 combined_hsp_list->do_not_reallocate = TRUE;
3017 }
3018 new_hspcnt = MIN(new_hspcnt, combined_hsp_list->allocated);
3019 }
3020
3021 s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
3022
3023 hsp_list = Blast_HSPListFree(hsp_list);
3024 *hsp_list_ptr = NULL;
3025
3026 return 0;
3027 }
3028
Blast_HSPListAdjustOffsets(BlastHSPList * hsp_list,Int4 offset)3029 void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset)
3030 {
3031 Int4 index;
3032
3033 if (offset == 0) {
3034 return;
3035 }
3036
3037 for (index=0; index<hsp_list->hspcnt; index++) {
3038 BlastHSP* hsp = hsp_list->hsp_array[index];
3039 hsp->subject.offset += offset;
3040 hsp->subject.end += offset;
3041 hsp->subject.gapped_start += offset;
3042 }
3043 }
3044
Blast_HSPListAdjustOddBlastnScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)3045 void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list,
3046 Boolean gapped_calculation,
3047 const BlastScoreBlk* sbp)
3048 {
3049 int index;
3050
3051 if (!hsp_list || hsp_list->hspcnt == 0 ||
3052 gapped_calculation == FALSE ||
3053 sbp->round_down == FALSE)
3054 return;
3055
3056 for (index = 0; index < hsp_list->hspcnt; ++index)
3057 hsp_list->hsp_array[index]->score &= ~1;
3058
3059 /* Sort the HSPs again, since the order may have to be different now. */
3060 Blast_HSPListSortByScore(hsp_list);
3061 }
3062
3063 /** Callback for sorting hsp lists by their best evalue/score;
3064 * Evalues are compared with the condition that if both are close enough to
3065 * zero (currently < 1.0e-180), they are considered equal.
3066 * It is assumed that the HSP arrays in each hit list are already sorted by
3067 * e-value/score.
3068 */
3069 static int
s_EvalueCompareHSPLists(const void * v1,const void * v2)3070 s_EvalueCompareHSPLists(const void* v1, const void* v2)
3071 {
3072 BlastHSPList* h1,* h2;
3073 int retval = 0;
3074
3075 h1 = *(BlastHSPList**) v1;
3076 h2 = *(BlastHSPList**) v2;
3077
3078 /* If any of the HSP lists is empty, it is considered "worse" than the
3079 other, unless the other is also empty. */
3080 if (h1->hspcnt == 0 && h2->hspcnt == 0)
3081 return 0;
3082 else if (h1->hspcnt == 0)
3083 return 1;
3084 else if (h2->hspcnt == 0)
3085 return -1;
3086
3087 if ((retval = s_EvalueComp(h1->best_evalue,
3088 h2->best_evalue)) != 0)
3089 return retval;
3090
3091 if (h1->hsp_array[0]->score > h2->hsp_array[0]->score)
3092 return -1;
3093 if (h1->hsp_array[0]->score < h2->hsp_array[0]->score)
3094 return 1;
3095
3096 /* In case of equal best E-values and scores, order will be determined
3097 by ordinal ids of the subject sequences */
3098 return BLAST_CMP(h2->oid, h1->oid);
3099 }
3100
3101 /** Callback for sorting hsp lists by their best e-value/score, in
3102 * reverse order - from higher e-value to lower (lower score to higher).
3103 */
3104 static int
s_EvalueCompareHSPListsRev(const void * v1,const void * v2)3105 s_EvalueCompareHSPListsRev(const void* v1, const void* v2)
3106 {
3107 return s_EvalueCompareHSPLists(v2, v1);
3108 }
3109
3110 /********************************************************************************
3111 Functions manipulating BlastHitList's
3112 ********************************************************************************/
3113
3114 /*
3115 description in blast_hits.h
3116 */
Blast_HitListNew(Int4 hitlist_size)3117 BlastHitList* Blast_HitListNew(Int4 hitlist_size)
3118 {
3119 BlastHitList* new_hitlist = (BlastHitList*)
3120 calloc(1, sizeof(BlastHitList));
3121 new_hitlist->hsplist_max = hitlist_size;
3122 new_hitlist->low_score = INT4_MAX;
3123 new_hitlist->hsplist_count = 0;
3124 new_hitlist->hsplist_current = 0;
3125 return new_hitlist;
3126 }
3127
3128 /*
3129 description in blast_hits.h
3130 */
Blast_HitListFree(BlastHitList * hitlist)3131 BlastHitList* Blast_HitListFree(BlastHitList* hitlist)
3132 {
3133 if (!hitlist)
3134 return NULL;
3135
3136 Blast_HitListHSPListsFree(hitlist);
3137 sfree(hitlist);
3138 return NULL;
3139 }
3140
3141 /* description in blast_hits.h */
Blast_HitListHSPListsFree(BlastHitList * hitlist)3142 Int2 Blast_HitListHSPListsFree(BlastHitList* hitlist)
3143 {
3144 Int4 index;
3145
3146 if (!hitlist)
3147 return 0;
3148
3149 for (index = 0; index < hitlist->hsplist_count; ++index)
3150 hitlist->hsplist_array[index] =
3151 Blast_HSPListFree(hitlist->hsplist_array[index]);
3152
3153 sfree(hitlist->hsplist_array);
3154
3155 hitlist->hsplist_count = 0;
3156
3157 return 0;
3158 }
3159
3160 /** Purge a BlastHitList of empty HSP lists.
3161 * @param hit_list BLAST hit list structure. [in] [out]
3162 */
3163 static void
s_BlastHitListPurge(BlastHitList * hit_list)3164 s_BlastHitListPurge(BlastHitList* hit_list)
3165 {
3166 Int4 index, hsplist_count;
3167
3168 if (!hit_list)
3169 return;
3170
3171 hsplist_count = hit_list->hsplist_count;
3172 for (index = 0; index < hsplist_count &&
3173 hit_list->hsplist_array[index]->hspcnt > 0; ++index);
3174
3175 hit_list->hsplist_count = index;
3176 /* Free all empty HSP lists */
3177 for ( ; index < hsplist_count; ++index) {
3178 Blast_HSPListFree(hit_list->hsplist_array[index]);
3179 }
3180 }
3181
3182 /** Given a BlastHitList* with a heapified HSP list array, remove
3183 * the worst scoring HSP list and insert the new HSP list in the heap
3184 * @param hit_list Contains all HSP lists for a given query [in] [out]
3185 * @param hsp_list A new HSP list to be inserted into the hit list [in]
3186 */
3187 static void
s_BlastHitListInsertHSPListInHeap(BlastHitList * hit_list,BlastHSPList * hsp_list)3188 s_BlastHitListInsertHSPListInHeap(BlastHitList* hit_list,
3189 BlastHSPList* hsp_list)
3190 {
3191 Blast_HSPListFree(hit_list->hsplist_array[0]);
3192 hit_list->hsplist_array[0] = hsp_list;
3193 if (hit_list->hsplist_count >= 2) {
3194 s_Heapify((char*)hit_list->hsplist_array, (char*)hit_list->hsplist_array,
3195 (char*)&hit_list->hsplist_array[hit_list->hsplist_count/2 - 1],
3196 (char*)&hit_list->hsplist_array[hit_list->hsplist_count-1],
3197 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3198 }
3199 hit_list->worst_evalue =
3200 hit_list->hsplist_array[0]->best_evalue;
3201 hit_list->low_score = hit_list->hsplist_array[0]->hsp_array[0]->score;
3202 }
3203
3204 /** Given a BlastHitList pointer this function makes the
3205 * hsplist_array larger, up to a maximum size.
3206 * These incremental increases are mostly an issue for users who
3207 * put in a very large number for number of hits to save, but only save a few.
3208 * @param hit_list object containing the hsplist_array to grow [in]
3209 * @return zero on success, 1 if full already.
3210 */
s_Blast_HitListGrowHSPListArray(BlastHitList * hit_list)3211 static Int2 s_Blast_HitListGrowHSPListArray(BlastHitList* hit_list)
3212
3213 {
3214 const int kStartValue = 100; /* default number of hsplist_array to start with. */
3215
3216 ASSERT(hit_list);
3217
3218 if (hit_list->hsplist_current >= hit_list->hsplist_max)
3219 return 1;
3220
3221 if (hit_list->hsplist_current <= 0)
3222 hit_list->hsplist_current = kStartValue;
3223 else
3224 hit_list->hsplist_current = MIN(2*hit_list->hsplist_current, hit_list->hsplist_max);
3225
3226 hit_list->hsplist_array =
3227 (BlastHSPList**) realloc(hit_list->hsplist_array, hit_list->hsplist_current*sizeof(BlastHSPList*));
3228
3229 if (hit_list->hsplist_array == NULL)
3230 return BLASTERR_MEMORY;
3231
3232 return 0;
3233 }
3234
Blast_HitListUpdate(BlastHitList * hit_list,BlastHSPList * hsp_list)3235 Int2 Blast_HitListUpdate(BlastHitList* hit_list,
3236 BlastHSPList* hsp_list)
3237 {
3238 hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
3239
3240 #ifndef NDEBUG
3241 ASSERT(s_BlastCheckBestEvalue(hsp_list) == TRUE); /* NCBI_FAKE_WARNING */
3242 #endif /* _DEBUG */
3243
3244 if (hit_list->hsplist_count < hit_list->hsplist_max) {
3245 /* If the array of HSP lists for this query is not yet allocated,
3246 do it here */
3247 if (hit_list->hsplist_current == hit_list->hsplist_count)
3248 {
3249 Int2 status = s_Blast_HitListGrowHSPListArray(hit_list);
3250 if (status)
3251 return status;
3252 }
3253 /* Just add to the end; sort later */
3254 hit_list->hsplist_array[hit_list->hsplist_count++] = hsp_list;
3255 hit_list->worst_evalue =
3256 MAX(hsp_list->best_evalue, hit_list->worst_evalue);
3257 hit_list->low_score =
3258 MIN(hsp_list->hsp_array[0]->score, hit_list->low_score);
3259 } else {
3260 int evalue_order = 0;
3261 if (!hit_list->heapified) {
3262 /* make sure all hsp_list is sorted */
3263 int index;
3264 for (index =0; index < hit_list->hsplist_count; index++) {
3265 Blast_HSPListSortByEvalue(hit_list->hsplist_array[index]);
3266 hit_list->hsplist_array[index]->best_evalue = s_BlastGetBestEvalue(hit_list->hsplist_array[index]);
3267 }
3268 s_CreateHeap(hit_list->hsplist_array, hit_list->hsplist_count,
3269 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3270 hit_list->heapified = TRUE;
3271 }
3272
3273 /* make sure the hsp_list is sorted. We actually do not need to sort
3274 the full list: all that we need is the best score. However, the
3275 following code assumes hsp_list->hsp_array[0] has the best score. */
3276 Blast_HSPListSortByEvalue(hsp_list);
3277 hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
3278 evalue_order = s_EvalueCompareHSPLists(&(hit_list->hsplist_array[0]), &hsp_list);
3279 if (evalue_order < 0) {
3280 /* This hit list is less significant than any of those already saved;
3281 discard it. Note that newer hits with score and e-value both equal
3282 to the current worst will be saved, at the expense of some older
3283 hit.
3284 */
3285 Blast_HSPListFree(hsp_list);
3286 } else {
3287 s_BlastHitListInsertHSPListInHeap(hit_list, hsp_list);
3288 }
3289 }
3290 return 0;
3291 }
3292
3293 Int2
Blast_HitListPurgeNullHSPLists(BlastHitList * hit_list)3294 Blast_HitListPurgeNullHSPLists(BlastHitList* hit_list)
3295 {
3296 Int4 index, index1; /* loop indices. */
3297 Int4 hsplist_count; /* total number of HSPList's to iterate over. */
3298 BlastHSPList** hsplist_array; /* hsplist_array to purge. */
3299
3300 if (hit_list == NULL || hit_list->hsplist_count == 0)
3301 return 0;
3302
3303 hsplist_array = hit_list->hsplist_array;
3304 hsplist_count = hit_list->hsplist_count;
3305
3306 index1 = 0;
3307 for (index=0; index<hsplist_count; index++) {
3308 if (hsplist_array[index]) {
3309 hsplist_array[index1] = hsplist_array[index];
3310 index1++;
3311 }
3312 }
3313
3314 for (index=index1; index<hsplist_count; index++) {
3315 hsplist_array[index] = NULL;
3316 }
3317
3318 hit_list->hsplist_count = index1;
3319
3320 return 0;
3321 }
3322
Blast_HitListSortByEvalue(BlastHitList * hit_list)3323 Int2 Blast_HitListSortByEvalue(BlastHitList* hit_list)
3324 {
3325 if (hit_list && hit_list->hsplist_count > 1) {
3326 qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3327 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3328 }
3329 s_BlastHitListPurge(hit_list);
3330 return 0;
3331 }
3332
3333
3334 /********************************************************************************
3335 Functions manipulating BlastHSPResults
3336 ********************************************************************************/
3337
Blast_HSPResultsNew(Int4 num_queries)3338 BlastHSPResults* Blast_HSPResultsNew(Int4 num_queries)
3339 {
3340 BlastHSPResults* retval = NULL;
3341
3342 retval = (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3343 if ( !retval ) {
3344 return NULL;
3345 }
3346
3347 retval->num_queries = num_queries;
3348 retval->hitlist_array = (BlastHitList**)
3349 calloc(num_queries, sizeof(BlastHitList*));
3350
3351 if ( !retval->hitlist_array ) {
3352 return Blast_HSPResultsFree(retval);
3353 }
3354
3355 return retval;
3356 }
3357
Blast_HSPResultsFree(BlastHSPResults * results)3358 BlastHSPResults* Blast_HSPResultsFree(BlastHSPResults* results)
3359 {
3360 Int4 index;
3361
3362 if (!results)
3363 return NULL;
3364
3365 if (results->hitlist_array)
3366 {
3367 for (index = 0; index < results->num_queries; ++index)
3368 Blast_HitListFree(results->hitlist_array[index]);
3369 sfree(results->hitlist_array);
3370 }
3371 sfree(results);
3372 return NULL;
3373 }
3374
Blast_HSPResultsSortByEvalue(BlastHSPResults * results)3375 Int2 Blast_HSPResultsSortByEvalue(BlastHSPResults* results)
3376 {
3377 Int4 index;
3378 BlastHitList* hit_list;
3379
3380 if (!results)
3381 return 0;
3382
3383 for (index = 0; index < results->num_queries; ++index) {
3384 hit_list = results->hitlist_array[index];
3385 if (hit_list != NULL
3386 && hit_list->hsplist_count > 1
3387 && hit_list->hsplist_array != NULL) {
3388 qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3389 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3390 }
3391 s_BlastHitListPurge(hit_list);
3392 }
3393 return 0;
3394 }
3395
Blast_HSPResultsReverseSort(BlastHSPResults * results)3396 Int2 Blast_HSPResultsReverseSort(BlastHSPResults* results)
3397 {
3398 Int4 index;
3399 BlastHitList* hit_list;
3400
3401 for (index = 0; index < results->num_queries; ++index) {
3402 hit_list = results->hitlist_array[index];
3403 if (hit_list && hit_list->hsplist_count > 1) {
3404 qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3405 sizeof(BlastHSPList*), s_EvalueCompareHSPListsRev);
3406 }
3407 s_BlastHitListPurge(hit_list);
3408 }
3409 return 0;
3410 }
3411
Blast_HSPResultsReverseOrder(BlastHSPResults * results)3412 Int2 Blast_HSPResultsReverseOrder(BlastHSPResults* results)
3413 {
3414 Int4 index;
3415 BlastHitList* hit_list;
3416
3417 for (index = 0; index < results->num_queries; ++index) {
3418 hit_list = results->hitlist_array[index];
3419 if (hit_list && hit_list->hsplist_count > 1) {
3420 BlastHSPList* hsplist;
3421 Int4 index1;
3422 /* Swap HSP lists: first with last; second with second from the end,
3423 etc. */
3424 for (index1 = 0; index1 < hit_list->hsplist_count/2; ++index1) {
3425 hsplist = hit_list->hsplist_array[index1];
3426 hit_list->hsplist_array[index1] =
3427 hit_list->hsplist_array[hit_list->hsplist_count-index1-1];
3428 hit_list->hsplist_array[hit_list->hsplist_count-index1-1] =
3429 hsplist;
3430 }
3431 }
3432 }
3433 return 0;
3434 }
3435
3436 /** Auxiliary structure for sorting HSPs */
3437 typedef struct SHspWrap {
3438 BlastHSPList *hsplist; /**< The HSPList to which this HSP belongs */
3439 BlastHSP *hsp; /**< HSP described by this structure */
3440 } SHspWrap;
3441
3442 /** callback used to sort a list of encapsulated HSP
3443 * structures in order of decreasing raw score
3444 * -RMH-
3445 */
s_SortHspWrapRawScore(const void * x,const void * y)3446 static int s_SortHspWrapRawScore(const void *x, const void *y)
3447 {
3448 SHspWrap *wrap1 = (SHspWrap *)x;
3449 SHspWrap *wrap2 = (SHspWrap *)y;
3450 if (wrap1->hsp->score > wrap2->hsp->score)
3451 return -1;
3452 if (wrap1->hsp->score < wrap2->hsp->score)
3453 return 1;
3454
3455 return 0;
3456 }
3457
3458 // Masklevel filtering for rmblastn. -RMH-
Blast_HSPResultsApplyMasklevel(BlastHSPResults * results,const BlastQueryInfo * query_info,Int4 masklevel,Int4 query_length)3459 Int2 Blast_HSPResultsApplyMasklevel(BlastHSPResults *results,
3460 const BlastQueryInfo *query_info,
3461 Int4 masklevel, Int4 query_length)
3462 {
3463 Int4 i, j, k, m;
3464 Int4 hsp_count;
3465 SHspWrap *hsp_array;
3466 BlastIntervalTree *tree;
3467
3468 /* set up the interval tree; subject offsets are not needed */
3469
3470 tree = Blast_IntervalTreeInit(0, query_length + 1, 0, 0);
3471
3472 for (i = 0; i < results->num_queries; i++) {
3473 BlastHitList *hitlist = results->hitlist_array[i];
3474 if (hitlist == NULL)
3475 continue;
3476
3477 for (j = hsp_count = 0; j < hitlist->hsplist_count; j++) {
3478 BlastHSPList *hsplist = hitlist->hsplist_array[j];
3479 hsp_count += hsplist->hspcnt;
3480 }
3481
3482 /* empty each HSP into a combined HSP array, then
3483 sort the array by raw score */
3484
3485 hsp_array = (SHspWrap *)malloc(hsp_count * sizeof(SHspWrap));
3486
3487 for (j = k = 0; j < hitlist->hsplist_count; j++) {
3488 BlastHSPList *hsplist = hitlist->hsplist_array[j];
3489 for (m = 0; m < hsplist->hspcnt; k++, m++) {
3490 BlastHSP *hsp = hsplist->hsp_array[m];
3491 hsp_array[k].hsplist = hsplist;
3492 hsp_array[k].hsp = hsp;
3493 }
3494 hsplist->hspcnt = 0;
3495 }
3496
3497 qsort(hsp_array, hsp_count, sizeof(SHspWrap), s_SortHspWrapRawScore);
3498
3499 /* Starting with the best HSP, use the interval tree to
3500 check that the query range of each HSP in the list has
3501 not already been enveloped by too many higher-scoring
3502 HSPs. If this is not the case, add the HSP back into results */
3503
3504 Blast_IntervalTreeReset(tree);
3505
3506 for (j = 0; j < hsp_count; j++) {
3507 BlastHSPList *hsplist = hsp_array[j].hsplist;
3508 BlastHSP *hsp = hsp_array[j].hsp;
3509
3510 if (BlastIntervalTreeMasksHSP(tree,
3511 hsp, query_info, 0, masklevel)) {
3512 Blast_HSPFree(hsp);
3513 }
3514 else {
3515 BlastIntervalTreeAddHSP(hsp, tree, query_info, eQueryOnlyStrandIndifferent);
3516 Blast_HSPListSaveHSP(hsplist, hsp);
3517
3518 /* the first HSP added back into an HSPList
3519 automatically has the best e-value */
3520 // RMH: hmmmmmmm
3521 if (hsplist->hspcnt == 1)
3522 hsplist->best_evalue = hsp->evalue;
3523 }
3524 }
3525 sfree(hsp_array);
3526
3527 /* remove any HSPLists that are still empty after the
3528 culling process. Sort any remaining lists by score */
3529 for (j = 0; j < hitlist->hsplist_count; j++) {
3530 BlastHSPList *hsplist = hitlist->hsplist_array[j];
3531 if (hsplist->hspcnt == 0) {
3532 hitlist->hsplist_array[j] =
3533 Blast_HSPListFree(hitlist->hsplist_array[j]);
3534 }
3535 else {
3536 Blast_HSPListSortByScore(hitlist->hsplist_array[j]);
3537 }
3538 }
3539 Blast_HitListPurgeNullHSPLists(hitlist);
3540 }
3541
3542 tree = Blast_IntervalTreeFree(tree);
3543 return 0;
3544 }
3545
Blast_HSPResultsInsertHSPList(BlastHSPResults * results,BlastHSPList * hsp_list,Int4 hitlist_size)3546 Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results,
3547 BlastHSPList* hsp_list, Int4 hitlist_size)
3548 {
3549 if (!hsp_list || hsp_list->hspcnt == 0)
3550 return 0;
3551
3552 ASSERT(hsp_list->query_index < results->num_queries);
3553
3554 if (!results->hitlist_array[hsp_list->query_index]) {
3555 results->hitlist_array[hsp_list->query_index] =
3556 Blast_HitListNew(hitlist_size);
3557 }
3558 Blast_HitListUpdate(results->hitlist_array[hsp_list->query_index],
3559 hsp_list);
3560 return 0;
3561 }
3562
3563 BlastHSPResults**
PHIBlast_HSPResultsSplit(const BlastHSPResults * results,const SPHIQueryInfo * pattern_info)3564 PHIBlast_HSPResultsSplit(const BlastHSPResults* results,
3565 const SPHIQueryInfo* pattern_info)
3566 {
3567 BlastHSPResults* *phi_results = NULL;
3568 int pattern_index, hit_index;
3569 int num_patterns;
3570 BlastHitList* hit_list = NULL;
3571 BlastHSPList** hsplist_array; /* Temporary per-pattern HSP lists. */
3572
3573 if (!pattern_info || pattern_info->num_patterns == 0)
3574 return NULL;
3575
3576 num_patterns = pattern_info->num_patterns;
3577
3578 phi_results =
3579 (BlastHSPResults**) calloc(num_patterns, sizeof(BlastHSPResults*));
3580
3581 if (!results || !results->hitlist_array[0])
3582 return phi_results; /* An empty results set is expected if no hits. */
3583
3584 hsplist_array = (BlastHSPList**) calloc(num_patterns, sizeof(BlastHSPList*));
3585 hit_list = results->hitlist_array[0];
3586
3587 for (hit_index = 0; hit_index < hit_list->hsplist_count; ++hit_index) {
3588 BlastHSPList* hsp_list = hit_list->hsplist_array[hit_index];
3589 int hsp_index;
3590 /* Copy HSPs corresponding to different pattern occurrences into
3591 separate HSP lists. */
3592 for (hsp_index = 0; hsp_index < hsp_list->hspcnt; ++hsp_index) {
3593 BlastHSP* hsp = s_BlastHSPCopy(hsp_list->hsp_array[hsp_index]);
3594 pattern_index = hsp->pat_info->index;
3595 if (!hsplist_array[pattern_index])
3596 hsplist_array[pattern_index] = Blast_HSPListNew(0);
3597 hsplist_array[pattern_index]->oid = hsp_list->oid;
3598 Blast_HSPListSaveHSP(hsplist_array[pattern_index], hsp);
3599 }
3600
3601 /* Save HSP lists corresponding to different pattern occurrences
3602 in separate results structures. */
3603 for (pattern_index = 0; pattern_index < num_patterns;
3604 ++pattern_index) {
3605 if (hsplist_array[pattern_index]) {
3606 if (!phi_results[pattern_index])
3607 phi_results[pattern_index] = Blast_HSPResultsNew(1);
3608 Blast_HSPResultsInsertHSPList(phi_results[pattern_index],
3609 hsplist_array[pattern_index],
3610 hit_list->hsplist_max);
3611 hsplist_array[pattern_index] = NULL;
3612 }
3613 }
3614 }
3615
3616 sfree(hsplist_array);
3617
3618 /* Sort HSPLists in each of the results structures by e-value. */
3619 for (pattern_index = 0; pattern_index < num_patterns; ++pattern_index) {
3620 Blast_HSPResultsSortByEvalue(phi_results[pattern_index]);
3621 }
3622
3623 return phi_results;
3624 }
3625
3626 BlastHSPResults*
Blast_HSPResultsFromHSPStream(BlastHSPStream * hsp_stream,size_t num_queries,SBlastHitsParameters * bhp)3627 Blast_HSPResultsFromHSPStream(BlastHSPStream* hsp_stream,
3628 size_t num_queries,
3629 SBlastHitsParameters* bhp)
3630 {
3631 BlastHSPResults* retval = NULL;
3632 BlastHSPList* hsp_list = NULL;
3633
3634 retval = Blast_HSPResultsNew((Int4) num_queries);
3635
3636 while (BlastHSPStreamRead(hsp_stream, &hsp_list) != kBlastHSPStream_Eof) {
3637 Blast_HSPResultsInsertHSPList(retval, hsp_list,
3638 bhp->prelim_hitlist_size);
3639 }
3640 SBlastHitsParametersFree(bhp);
3641 return retval;
3642 }
3643
3644 /** Comparison function for sorting HSP lists in increasing order of the
3645 * number of HSPs in a hit. Needed for s_TrimResultsByTotalHSPLimit below.
3646 * @param v1 Pointer to the first HSP list [in]
3647 * @param v2 Pointer to the second HSP list [in]
3648 */
3649 static int
s_CompareHsplistHspcnt(const void * v1,const void * v2)3650 s_CompareHsplistHspcnt(const void* v1, const void* v2)
3651 {
3652 BlastHSPList* r1 = *((BlastHSPList**) v1);
3653 BlastHSPList* r2 = *((BlastHSPList**) v2);
3654
3655 if (r1->hspcnt < r2->hspcnt)
3656 return -1;
3657 else if (r1->hspcnt > r2->hspcnt)
3658 return 1;
3659 else
3660 return 0;
3661 }
3662
3663 /** Removes extra results if a limit is imposed on the total number of HSPs
3664 * returned. If the search involves multiple query sequences, the total HSP
3665 * limit is applied separately to each query.
3666 * The trimming algorithm makes sure that at least 1 HSP is returned for each
3667 * database sequence hit. Suppose results for a given query consist of HSP
3668 * lists for N database sequences, and the limit is T. HSP lists are sorted in
3669 * order of increasing number of HSPs in each list. Then the algorithm proceeds
3670 * by leaving at most i*T/N HSPs for the first i HSP lists, for every
3671 * i = 1, 2, ..., N.
3672 * @param results Results after preliminary stage of a BLAST search [in|out]
3673 * @param total_hsp_limit Limit on total number of HSPs [in]
3674 * @return TRUE if HSP limit has been exceeded, FALSE otherwise.
3675 */
3676 static Boolean
s_TrimResultsByTotalHSPLimit(BlastHSPResults * results,Uint4 total_hsp_limit)3677 s_TrimResultsByTotalHSPLimit(BlastHSPResults* results, Uint4 total_hsp_limit)
3678 {
3679 int query_index;
3680 Boolean hsp_limit_exceeded = FALSE;
3681
3682 if (total_hsp_limit == 0) {
3683 return hsp_limit_exceeded;
3684 }
3685
3686 for (query_index = 0; query_index < results->num_queries; ++query_index) {
3687 BlastHitList* hit_list = NULL;
3688 BlastHSPList** hsplist_array = NULL;
3689 Int4 hsplist_count = 0;
3690 int subj_index;
3691
3692 if ( !(hit_list = results->hitlist_array[query_index]) )
3693 continue;
3694 /* The count of HSPs is separate for each query. */
3695 hsplist_count = hit_list->hsplist_count;
3696
3697 hsplist_array = (BlastHSPList**)
3698 malloc(hsplist_count*sizeof(BlastHSPList*));
3699
3700 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3701 hsplist_array[subj_index] = hit_list->hsplist_array[subj_index];
3702 }
3703
3704 qsort((void*)hsplist_array, hsplist_count,
3705 sizeof(BlastHSPList*), s_CompareHsplistHspcnt);
3706
3707 {
3708 Int4 tot_hsps = 0; /* total number of HSPs */
3709 Uint4 hsp_per_seq = MAX(1, total_hsp_limit/hsplist_count);
3710 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3711 Int4 allowed_hsp_num = ((subj_index+1)*hsp_per_seq) - tot_hsps;
3712 BlastHSPList* hsp_list = hsplist_array[subj_index];
3713 if (hsp_list->hspcnt > allowed_hsp_num) {
3714 /* Free the extra HSPs */
3715 int hsp_index;
3716 for (hsp_index = allowed_hsp_num;
3717 hsp_index < hsp_list->hspcnt; ++hsp_index) {
3718 Blast_HSPFree(hsp_list->hsp_array[hsp_index]);
3719 }
3720 hsp_list->hspcnt = allowed_hsp_num;
3721 hsp_limit_exceeded = TRUE;
3722 }
3723 tot_hsps += hsp_list->hspcnt;
3724 }
3725 }
3726 sfree(hsplist_array);
3727 }
3728
3729 return hsp_limit_exceeded;
3730 }
3731
3732 typedef struct BlastHSPwOid {
3733 BlastHSP * hsp;
3734 Int4 oid;
3735 } BlastHSPwOid;
3736
3737 static int
s_CompareScoreHSPwOid(const void * v1,const void * v2)3738 s_CompareScoreHSPwOid(const void* v1, const void* v2)
3739 {
3740 BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3741 BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3742 return (s_EvalueCompareHSPs(&(r1->hsp), &(r2->hsp)));
3743 }
3744
3745 static int
s_CompareOidHSPwOid(const void * v1,const void * v2)3746 s_CompareOidHSPwOid(const void* v1, const void* v2)
3747 {
3748 BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3749 BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3750 return (r1->oid > r2->oid);
3751 }
3752
3753
3754 /* extended version of the above function. Provides information about query number
3755 * which exceeded number of HSP.
3756 * The hsp_limit_exceeded is of results->num_queries size guarantied.
3757 */
3758 static Boolean
s_TrimResultsByTotalHSPLimitEx(BlastHSPResults * results,Uint4 total_hsp_limit,Boolean * hsp_limit_exceeded)3759 s_TrimResultsByTotalHSPLimitEx(BlastHSPResults* results,
3760 Uint4 total_hsp_limit,
3761 Boolean *hsp_limit_exceeded)
3762 {
3763 int query_index;
3764 Boolean any_hsp_limit_exceeded = FALSE;
3765
3766 if (total_hsp_limit == 0) {
3767 return any_hsp_limit_exceeded;
3768 }
3769
3770 for (query_index = 0; query_index < results->num_queries; ++query_index) {
3771 BlastHitList* hit_list = NULL;
3772 Int4 hsplist_count = 0;
3773 int subj_index;
3774 Int4 total_hsps = 0; /* total number of HSPs */
3775
3776 if( hsp_limit_exceeded) hsp_limit_exceeded[query_index] = FALSE;
3777
3778 if ( !(hit_list = results->hitlist_array[query_index]) )
3779 continue;
3780 /* The count of HSPs is separate for each query. */
3781 hsplist_count = hit_list->hsplist_count;
3782
3783 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3784 total_hsps += hit_list->hsplist_array[subj_index]->hspcnt;
3785 }
3786
3787 if(total_hsps > total_hsp_limit)
3788 {
3789 BlastHSPwOid * everything_list = (BlastHSPwOid *) malloc(total_hsps * sizeof(BlastHSPwOid));
3790 BlastHSPList * subj_list;
3791 int hsp_counter = 0;
3792 int max_hit_list_size = hit_list->hsplist_max;
3793 if( hsp_limit_exceeded) {
3794 hsp_limit_exceeded[query_index] = TRUE;
3795 any_hsp_limit_exceeded = TRUE;
3796 }
3797 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3798 int subj_hsp;
3799 BlastHSP ** hsps_per_subj;
3800 subj_list = hit_list->hsplist_array[subj_index];
3801 hsps_per_subj = subj_list->hsp_array;
3802 for(subj_hsp=0; subj_hsp < subj_list->hspcnt; ++subj_hsp) {
3803 everything_list[hsp_counter].hsp = hsps_per_subj[subj_hsp];
3804 everything_list[hsp_counter].oid = subj_list->oid;
3805 hsps_per_subj[subj_hsp] = NULL;
3806 hsp_counter ++;
3807 }
3808
3809 }
3810 results->hitlist_array[query_index] = Blast_HitListFree(hit_list);
3811 qsort((void*)everything_list, total_hsps, sizeof(BlastHSPwOid), s_CompareScoreHSPwOid);
3812
3813 for(hsp_counter = total_hsp_limit; hsp_counter < total_hsps ; ++hsp_counter) {
3814 everything_list[hsp_counter].hsp = Blast_HSPFree(everything_list[hsp_counter].hsp);
3815 everything_list[hsp_counter].oid = 0x7fffff;
3816 }
3817
3818 qsort((void*)everything_list, total_hsp_limit, sizeof(BlastHSPwOid), s_CompareOidHSPwOid);
3819 subj_list = NULL;
3820 for(hsp_counter = 0; hsp_counter < total_hsp_limit; ++ hsp_counter)
3821 {
3822 int hsp_counter_start = hsp_counter;
3823 int hspcnt;
3824 int num_hsp;
3825
3826 while ((everything_list[hsp_counter].oid == everything_list[hsp_counter+1].oid) &&
3827 (hsp_counter + 1 < total_hsp_limit)) {
3828 hsp_counter ++;
3829 }
3830 num_hsp = hsp_counter -hsp_counter_start + 1;
3831 subj_list = Blast_HSPListNew(num_hsp);
3832 subj_list->oid = everything_list[hsp_counter].oid;
3833 subj_list->query_index = query_index;
3834
3835 for(hspcnt = 0; hspcnt < num_hsp; ++hspcnt) {
3836 Blast_HSPListSaveHSP(subj_list, everything_list[hsp_counter_start + hspcnt].hsp);
3837 }
3838
3839 Blast_HSPResultsInsertHSPList(results,subj_list, max_hit_list_size );
3840 }
3841 free(everything_list);
3842 }
3843 }
3844
3845 return any_hsp_limit_exceeded;
3846 }
3847
3848 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3849 Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream* hsp_stream,
3850 Uint4 num_queries,
3851 SBlastHitsParameters* hit_param,
3852 Uint4 max_num_hsps,
3853 Boolean* removed_hsps)
3854 {
3855 Boolean rm_hsps = FALSE;
3856 BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3857 num_queries,
3858 hit_param);
3859
3860 rm_hsps = s_TrimResultsByTotalHSPLimit(retval, max_num_hsps);
3861 if (removed_hsps) {
3862 *removed_hsps = rm_hsps;
3863 }
3864 return retval;
3865 }
3866 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3867 Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream* hsp_stream,
3868 Uint4 num_queries,
3869 SBlastHitsParameters* hit_param,
3870 Uint4 max_num_hsps,
3871 Boolean *removed_hsps)
3872 {
3873 Boolean rm_hsps = FALSE;
3874 BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3875 num_queries,
3876 hit_param);
3877
3878 rm_hsps = s_TrimResultsByTotalHSPLimitEx(retval, max_num_hsps,removed_hsps);
3879 if (removed_hsps) {
3880 *removed_hsps = rm_hsps;
3881 }
3882 return retval;
3883 }
3884