1 /* $Id: blast_hits.c,v 1.283 2016/09/13 14:49:13 fukanchi Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 *
27 */
28
29 /** @file blast_hits.c
30 * BLAST functions for saving hits after the (preliminary) gapped alignment
31 */
32
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_hits.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_def.h>
37 #include <algo/blast/core/blast_hspstream.h>
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41
42 NCBI_XBLAST_EXPORT
SBlastHitsParametersNew(const BlastHitSavingOptions * hit_options,const BlastExtensionOptions * ext_options,const BlastScoringOptions * scoring_options,SBlastHitsParameters ** retval)43 Int2 SBlastHitsParametersNew(const BlastHitSavingOptions* hit_options,
44 const BlastExtensionOptions* ext_options,
45 const BlastScoringOptions* scoring_options,
46 SBlastHitsParameters* *retval)
47 {
48 Int4 prelim_hitlist_size;
49 ASSERT(retval);
50 *retval = NULL;
51
52 if (hit_options == NULL ||
53 ext_options == NULL ||
54 scoring_options == NULL)
55 return 1;
56
57 *retval = (SBlastHitsParameters*) malloc(sizeof(SBlastHitsParameters));
58 if (*retval == NULL)
59 return 2;
60
61 prelim_hitlist_size = hit_options->hitlist_size;
62 if (ext_options->compositionBasedStats)
63 prelim_hitlist_size = prelim_hitlist_size * 2 + 50;
64 else if (scoring_options->gapped_calculation)
65 prelim_hitlist_size = MIN(2 * prelim_hitlist_size,
66 prelim_hitlist_size + 50);
67
68 (*retval)->prelim_hitlist_size = MAX(prelim_hitlist_size, 10);
69
70 (*retval)->hsp_num_max = BlastHspNumMax(scoring_options->gapped_calculation, hit_options);
71
72 return 0;
73 }
74
75 SBlastHitsParameters*
SBlastHitsParametersDup(const SBlastHitsParameters * hit_params)76 SBlastHitsParametersDup(const SBlastHitsParameters* hit_params)
77 {
78 SBlastHitsParameters* retval = (SBlastHitsParameters*)
79 malloc(sizeof(SBlastHitsParameters));
80
81 if ( !retval ) {
82 return NULL;
83 }
84
85 memcpy((void*)retval, (void*) hit_params, sizeof(SBlastHitsParameters));
86 return retval;
87 }
88
89 NCBI_XBLAST_EXPORT
SBlastHitsParametersFree(SBlastHitsParameters * param)90 SBlastHitsParameters* SBlastHitsParametersFree(SBlastHitsParameters* param)
91 {
92 if (param)
93 {
94 sfree(param);
95 }
96 return NULL;
97 }
98
99
100
101 /********************************************************************************
102 Functions manipulating BlastHSP's
103 ********************************************************************************/
104
Blast_HSPFree(BlastHSP * hsp)105 BlastHSP* Blast_HSPFree(BlastHSP* hsp)
106 {
107 if (!hsp)
108 return NULL;
109 hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
110 hsp->map_info = BlastHSPMappingInfoFree(hsp->map_info);
111 sfree(hsp->pat_info);
112 sfree(hsp);
113 return NULL;
114 }
115
Blast_HSPNew(void)116 BlastHSP* Blast_HSPNew(void)
117 {
118 BlastHSP* new_hsp = (BlastHSP*) calloc(1, sizeof(BlastHSP));
119 return new_hsp;
120 }
121
122 /*
123 Comments in blast_hits.h
124 */
125 Int2
Blast_HSPInit(Int4 query_start,Int4 query_end,Int4 subject_start,Int4 subject_end,Int4 query_gapped_start,Int4 subject_gapped_start,Int4 query_context,Int2 query_frame,Int2 subject_frame,Int4 score,GapEditScript ** gap_edit,BlastHSP ** ret_hsp)126 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start,
127 Int4 subject_end, Int4 query_gapped_start,
128 Int4 subject_gapped_start, Int4 query_context,
129 Int2 query_frame, Int2 subject_frame, Int4 score,
130 GapEditScript* *gap_edit, BlastHSP* *ret_hsp)
131 {
132 BlastHSP* new_hsp = NULL;
133
134 if (!ret_hsp)
135 return -1;
136
137 new_hsp = Blast_HSPNew();
138
139 *ret_hsp = NULL;
140
141 if (new_hsp == NULL)
142 return BLASTERR_MEMORY;
143
144
145 new_hsp->query.offset = query_start;
146 new_hsp->subject.offset = subject_start;
147 new_hsp->query.end = query_end;
148 new_hsp->subject.end = subject_end;
149 new_hsp->query.gapped_start = query_gapped_start;
150 new_hsp->subject.gapped_start = subject_gapped_start;
151 new_hsp->context = query_context;
152 new_hsp->query.frame = query_frame;
153 new_hsp->subject.frame = subject_frame;
154 new_hsp->score = score;
155 if (gap_edit && *gap_edit)
156 { /* If this is non-NULL transfer ownership. */
157 new_hsp->gap_info = *gap_edit;
158 *gap_edit = NULL;
159 }
160
161 *ret_hsp = new_hsp;
162
163 return 0;
164 }
165
166
BlastHSPMappingInfoFree(BlastHSPMappingInfo * info)167 BlastHSPMappingInfo* BlastHSPMappingInfoFree(BlastHSPMappingInfo* info)
168 {
169 if (!info) {
170 return NULL;
171 }
172
173 info->edits = JumperEditsBlockFree(info->edits);
174 if (info->subject_overhangs) {
175 SequenceOverhangsFree(info->subject_overhangs);
176 }
177 sfree(info);
178
179 return NULL;
180 }
181
BlastHSPMappingInfoNew(void)182 BlastHSPMappingInfo* BlastHSPMappingInfoNew(void)
183 {
184 BlastHSPMappingInfo* retval = calloc(1, sizeof(BlastHSPMappingInfo));
185 return retval;
186 }
187
BlastHspNumMax(Boolean gapped_calculation,const BlastHitSavingOptions * options)188 Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions* options)
189 {
190 Int4 retval=0;
191
192 /* per-subject HSP limits do not apply to gapped searches; JIRA SB-616 */
193 if (options->hsp_num_max <= 0)
194 {
195 retval = INT4_MAX;
196 }
197 else
198 {
199 retval = options->hsp_num_max;
200 }
201
202 return retval;
203 }
204
205 /** Copies all contents of a BlastHSP structure. Used in PHI BLAST for splitting
206 * results corresponding to different pattern occurrences in query.
207 * @param hsp Original HSP [in]
208 * @return New HSP, copied from the original.
209 */
210 static BlastHSP*
s_BlastHSPCopy(const BlastHSP * hsp)211 s_BlastHSPCopy(const BlastHSP* hsp)
212 {
213 BlastHSP* new_hsp = NULL;
214 /* Do not pass the edit script, because we don't want to tranfer
215 ownership. */
216 Blast_HSPInit(hsp->query.offset, hsp->query.end, hsp->subject.offset,
217 hsp->subject.end, hsp->query.gapped_start,
218 hsp->subject.gapped_start, hsp->context,
219 hsp->query.frame, hsp->subject.frame, hsp->score,
220 NULL, &new_hsp);
221 new_hsp->evalue = hsp->evalue;
222 new_hsp->num = hsp->num;
223 new_hsp->num_ident = hsp->num_ident;
224 new_hsp->bit_score = hsp->bit_score;
225 new_hsp->comp_adjustment_method = hsp->comp_adjustment_method;
226 if (hsp->gap_info) {
227 new_hsp->gap_info = GapEditScriptDup(hsp->gap_info);
228 }
229
230 if (hsp->pat_info) {
231 /* Copy this HSP's pattern data. */
232 new_hsp->pat_info =
233 (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
234 }
235 return new_hsp;
236 }
237
238 /* Make a deep copy of an HSP */
Blast_HSPClone(const BlastHSP * hsp)239 BlastHSP* Blast_HSPClone(const BlastHSP* hsp)
240 {
241 BlastHSP* retval = NULL;
242
243 if (!hsp) {
244 return NULL;
245 }
246
247 retval = Blast_HSPNew();
248 if (retval) {
249 retval->score = hsp->score;
250 retval->num_ident = hsp->num_ident;
251 memcpy(&retval->query, &hsp->query, sizeof(BlastSeg));
252 memcpy(&retval->subject, &hsp->subject, sizeof(BlastSeg));
253 retval->context = hsp->context;
254 retval->evalue = hsp->evalue;
255 retval->bit_score = hsp->bit_score;
256 retval->num = hsp->num;
257 retval->comp_adjustment_method = hsp->comp_adjustment_method;
258 retval->num_positives = hsp->num_positives;
259
260 /* copy gapped traceback */
261 if (hsp->gap_info) {
262 retval->gap_info = GapEditScriptDup(hsp->gap_info);
263 if (!retval->gap_info) {
264 Blast_HSPFree(retval);
265 return NULL;
266 }
267 }
268
269 /* copy short read mapping data */
270 if (hsp->map_info) {
271 retval->map_info = BlastHSPMappingInfoNew();
272 if (!retval->map_info) {
273 Blast_HSPFree(retval);
274 return NULL;
275 }
276 retval->map_info->edits =
277 JumperEditsBlockDup(hsp->map_info->edits);
278 if (!retval->map_info->edits) {
279 Blast_HSPFree(retval);
280 return NULL;
281 }
282 retval->map_info->left_edge = hsp->map_info->left_edge;
283 retval->map_info->right_edge = hsp->map_info->right_edge;
284
285 if (hsp->map_info->subject_overhangs) {
286 SequenceOverhangs* old = hsp->map_info->subject_overhangs;
287 SequenceOverhangs* copy = calloc(1, sizeof(SequenceOverhangs));
288 if (!copy) {
289 Blast_HSPFree(retval);
290 return NULL;
291 }
292
293 if (old->left && old->left_len > 0) {
294 copy->left_len = old->left_len;
295 copy->left = malloc(copy->left_len * sizeof(Uint1));
296 if (!copy->left) {
297 SequenceOverhangsFree(copy);
298 Blast_HSPFree(retval);
299 return NULL;
300 }
301 memcpy(copy->left, old->left,
302 copy->left_len * sizeof(Uint1));
303 }
304
305 if (old->right && old->right_len > 0) {
306 copy->right_len = old->right_len;
307 copy->right = malloc(copy->right_len * sizeof(Uint1));
308 if (!copy->right) {
309 SequenceOverhangsFree(copy);
310 Blast_HSPFree(retval);
311 }
312 memcpy(copy->right, old->right,
313 copy->right_len * sizeof(Uint1));
314 }
315
316 retval->map_info->subject_overhangs = copy;
317 }
318 }
319
320 /* copy phi-blast pattern data */
321 if (hsp->pat_info) {
322 retval->pat_info =
323 (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
324 }
325 }
326
327 return retval;
328 }
329
330 /** Count the number of occurrences of pattern in sequence, which
331 * do not overlap by more than half the pattern match length.
332 * @param query_info Query information structure, containing pattern info. [in]
333 */
334 Int4
PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo * query_info)335 PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo *query_info)
336 {
337 Int4 index; /*loop index*/
338 Int4 lastEffectiveOccurrence; /*last nonoverlapping occurrence*/
339 Int4 count; /* Count of effective (nonoverlapping) occurrences */
340 Int4 min_pattern_length;
341 SPHIQueryInfo* pat_info;
342
343 ASSERT(query_info && query_info->pattern_info && query_info->contexts);
344
345 pat_info = query_info->pattern_info;
346
347 if (pat_info->num_patterns <= 1)
348 return pat_info->num_patterns;
349
350 /* Minimal length of a pattern is saved in the length adjustment field. */
351 min_pattern_length = query_info->contexts[0].length_adjustment;
352
353 count = 1;
354 lastEffectiveOccurrence = pat_info->occurrences[0].offset;
355 for(index = 1; index < pat_info->num_patterns; ++index) {
356 if (((pat_info->occurrences[index].offset - lastEffectiveOccurrence) * 2)
357 > min_pattern_length) {
358 lastEffectiveOccurrence = pat_info->occurrences[index].offset;
359 ++count;
360 }
361 }
362
363 return count;
364 }
365
366
367 /** Calculate e-value for an HSP found by PHI BLAST.
368 * @param hsp An HSP found by PHI BLAST [in]
369 * @param sbp Scoring block with statistical parameters [in]
370 * @param query_info Structure containing information about pattern counts [in]
371 * @param pattern_blk Structure containing counts of PHI pattern hits [in]
372 */
373 static void
s_HSPPHIGetEvalue(BlastHSP * hsp,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)374 s_HSPPHIGetEvalue(BlastHSP* hsp, BlastScoreBlk* sbp,
375 const BlastQueryInfo* query_info,
376 const SPHIPatternSearchBlk* pattern_blk)
377 {
378 double paramC;
379 double Lambda;
380
381 ASSERT(query_info && hsp && sbp && pattern_blk);
382
383 paramC = sbp->kbp[0]->paramC;
384 Lambda = sbp->kbp[0]->Lambda;
385
386 /* We have the actual number of occurrences of pattern in db. */
387 hsp->evalue = paramC*(1+Lambda*hsp->score)*
388 PhiBlastGetEffectiveNumberOfPatterns(query_info)*
389 pattern_blk->num_patterns_db*
390 exp(-Lambda*hsp->score);
391 }
392
393 /** Update HSP data after reevaluation with ambiguities. In particular this
394 * function calculates number of identities and checks if the percent identity
395 * criterion is satisfied.
396 * @param hsp HSP to update [in] [out]
397 * @param gapped Is this a gapped search? [in]
398 * @param cutoff_score Cutoff score for saving the HSP [in]
399 * @param score New score [in]
400 * @param query_start Start of query sequence [in]
401 * @param subject_start Start of subject sequence [in]
402 * @param best_q_start Pointer to start of the new alignment in query [in]
403 * @param best_q_end Pointer to end of the new alignment in query [in]
404 * @param best_s_start Pointer to start of the new alignment in subject [in]
405 * @param best_s_end Pointer to end of the new alignment in subject [in]
406 * @param best_start_esp_index index of the edit script array where the new alignment
407 * starts. [in]
408 * @param best_end_esp_index index in the edit script array where the new alignment
409 * ends. [in]
410 * @param best_end_esp_num Number of edit operations in the last edit script,
411 * that are included in the alignment. [in]
412 * @return TRUE if HSP is scheduled to be deleted.
413 */
414 static Boolean
s_UpdateReevaluatedHSP(BlastHSP * hsp,Boolean gapped,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end,int best_start_esp_index,int best_end_esp_index,int best_end_esp_num)415 s_UpdateReevaluatedHSP(BlastHSP* hsp, Boolean gapped,
416 Int4 cutoff_score,
417 Int4 score, const Uint1* query_start, const Uint1* subject_start,
418 const Uint1* best_q_start, const Uint1* best_q_end,
419 const Uint1* best_s_start, const Uint1* best_s_end,
420 int best_start_esp_index,
421 int best_end_esp_index,
422 int best_end_esp_num)
423 {
424 Boolean delete_hsp = TRUE;
425
426 hsp->score = score;
427
428 if (hsp->score >= cutoff_score) {
429 /* Update all HSP offsets. */
430 hsp->query.offset = best_q_start - query_start;
431 hsp->query.end = hsp->query.offset + best_q_end - best_q_start;
432 hsp->subject.offset = best_s_start - subject_start;
433 hsp->subject.end = hsp->subject.offset + best_s_end - best_s_start;
434
435 if (gapped) {
436 int last_num=hsp->gap_info->size - 1;
437 if (best_end_esp_index != last_num|| best_start_esp_index > 0)
438 {
439 GapEditScript* esp_temp = GapEditScriptNew(best_end_esp_index-best_start_esp_index+1);
440 GapEditScriptPartialCopy(esp_temp, 0, hsp->gap_info, best_start_esp_index, best_end_esp_index);
441 hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
442 hsp->gap_info = esp_temp;
443 }
444 last_num = hsp->gap_info->size - 1;
445 hsp->gap_info->num[last_num] = best_end_esp_num;
446 ASSERT(best_end_esp_num >= 0);
447 }
448 delete_hsp = FALSE;
449 }
450
451 return delete_hsp;
452 }
453
Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP * hsp,const Uint1 * q,const Int4 qlen,const Uint1 * s,const Int4 slen,const BlastHitSavingParameters * hit_params,const BlastScoringParameters * score_params,const BlastScoreBlk * sbp)454 Boolean Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP* hsp,
455 const Uint1* q, const Int4 qlen,
456 const Uint1* s, const Int4 slen,
457 const BlastHitSavingParameters* hit_params,
458 const BlastScoringParameters* score_params,
459 const BlastScoreBlk* sbp)
460 {
461 Int4 sum, score, gap_open, gap_extend;
462 Int4 index; /* loop index */
463 Int4 qp, sp, ext;
464
465 int best_start_esp_index = 0;
466 int best_end_esp_index = 0;
467 int current_start_esp_index = 0;
468 int best_end_esp_num = 0;
469 GapEditScript* esp; /* Used to hold GapEditScript of hsp->gap_info */
470
471 const Uint1* best_q_start; /* Start of the best scoring part in query. */
472 const Uint1* best_s_start; /* Start of the best scoring part in subject. */
473 const Uint1* best_q_end; /* End of the best scoring part in query. */
474 const Uint1* best_s_end; /* End of the best scoring part in subject. */
475
476
477 const Uint1* current_q_start; /* Start of the current part of the alignment in
478 query. */
479 const Uint1* current_s_start; /* Start of the current part of the alignment in
480 subject. */
481
482 const Uint1* query,* subject;
483 Int4** matrix;
484 Int2 factor = 1;
485 const Uint1 kResidueMask = 0x0f;
486 Int4 cutoff_score = hit_params->cutoffs[hsp->context].cutoff_score;
487
488 matrix = sbp->matrix->data;
489
490 /* For a non-affine greedy case, calculate the real value of the gap
491 extension penalty. Multiply all scores by 2 if it is not integer. */
492 if (score_params->gap_open == 0 && score_params->gap_extend == 0) {
493 if (score_params->reward % 2 == 1)
494 factor = 2;
495 gap_open = 0;
496 gap_extend =
497 (score_params->reward - 2*score_params->penalty) * factor / 2;
498 } else {
499 gap_open = score_params->gap_open;
500 gap_extend = score_params->gap_extend;
501 }
502
503 query = q + hsp->query.offset;
504 subject = s + hsp->subject.offset;
505 score = 0;
506 sum = 0;
507
508 /* Point all pointers to the beginning of the alignment. */
509 best_q_start = best_q_end = current_q_start = query;
510 best_s_start = best_s_end = current_s_start = subject;
511 /* There are no previous edit scripts at the beginning. */
512
513 best_end_esp_num = -1;
514 esp = hsp->gap_info;
515 if (!esp) return TRUE;
516 for (index=0; index<esp->size; index++)
517 {
518 int op_index = 0; /* Index of an operation within a single edit script. */
519 for (op_index=0; op_index<esp->num[index]; )
520 {
521 /* Process substitutions one operation at a time, full gaps in one step. */
522 if (esp->op_type[index] == eGapAlignSub) {
523 sum += factor*matrix[*query & kResidueMask][*subject];
524 query++;
525 subject++;
526 op_index++;
527 } else if (esp->op_type[index] == eGapAlignDel) {
528 sum -= gap_open + gap_extend * esp->num[index];
529 subject += esp->num[index];
530 op_index += esp->num[index];
531 } else if (esp->op_type[index] == eGapAlignIns) {
532 sum -= gap_open + gap_extend * esp->num[index];
533 query += esp->num[index];
534 op_index += esp->num[index];
535 }
536
537 if (sum < 0) {
538 /* Point current edit script chain start to the new place.
539 If we are in the middle of an edit script, reduce its length and
540 point operation index to the beginning of a modified edit script;
541 if we are at the end, move to the next edit script. */
542 if (op_index < esp->num[index]) {
543 esp->num[index] -= op_index;
544 current_start_esp_index = index;
545 op_index = 0;
546 } else {
547 current_start_esp_index = index + 1;
548 }
549 /* Set sum to 0, to start a fresh count. */
550 sum = 0;
551 /* Set current starting positions in sequences to the new start. */
552 current_q_start = query;
553 current_s_start = subject;
554
555 /* If score has passed the cutoff at some point, leave the best score
556 and edit scripts positions information untouched, otherwise reset
557 the best score to 0 and point the best edit script positions to
558 the new start. */
559 if (score < cutoff_score) {
560 /* Start from new offset; discard all previous information. */
561 best_q_start = query;
562 best_s_start = subject;
563 score = 0;
564
565 /* Set best start and end edit script pointers to new start. */
566 best_start_esp_index = current_start_esp_index;
567 best_end_esp_index = current_start_esp_index;
568 }
569 /* break; */ /* start on next GapEditScript. */
570 } else if (sum > score) {
571 /* Remember this point as the best scoring end point, and the current
572 start of the alignment as the start of the best alignment. */
573 score = sum;
574
575 best_q_start = current_q_start;
576 best_s_start = current_s_start;
577 best_q_end = query;
578 best_s_end = subject;
579
580 best_start_esp_index = current_start_esp_index;
581 best_end_esp_index = index;
582 best_end_esp_num = op_index;
583 }
584 }
585 } /* loop on edit scripts */
586
587 score /= factor;
588
589 if (best_start_esp_index < esp->size && best_end_esp_index < esp->size) {
590 /* post processing: try to extend further */
591 ASSERT(esp->op_type[best_start_esp_index] == eGapAlignSub);
592 ASSERT(esp->op_type[best_end_esp_index] == eGapAlignSub);
593
594 qp = best_q_start - q;
595 sp = best_s_start - s;
596 ext = 0;
597 while(qp > 0 && sp > 0 && (q[--qp] == s[--sp]) && q[qp]<4) ext++;
598 best_q_start -= ext;
599 best_s_start -= ext;
600 esp->num[best_start_esp_index] += ext;
601 if (best_end_esp_index == best_start_esp_index) best_end_esp_num += ext;
602 score += ext * score_params->reward;
603
604 qp = best_q_end - q;
605 sp = best_s_end - s;
606 ext = 0;
607 while(qp < qlen && sp < slen && q[qp]<4 && (q[qp++] == s[sp++])) ext++;
608 best_q_end += ext;
609 best_s_end += ext;
610 esp->num[best_end_esp_index] += ext;
611 best_end_esp_num += ext;
612 score += ext * score_params->reward;
613 }
614
615 /* Update HSP data. */
616 return
617 s_UpdateReevaluatedHSP(hsp, TRUE, cutoff_score,
618 score, q, s, best_q_start,
619 best_q_end, best_s_start, best_s_end,
620 best_start_esp_index, best_end_esp_index,
621 best_end_esp_num);
622 }
623
624 /** Update HSP data after reevaluation with ambiguities for an ungapped search.
625 * In particular this function calculates number of identities and checks if the
626 * percent identity criterion is satisfied.
627 * @param hsp HSP to update [in] [out]
628 * @param cutoff_score Cutoff score for saving the HSP [in]
629 * @param score New score [in]
630 * @param query_start Start of query sequence [in]
631 * @param subject_start Start of subject sequence [in]
632 * @param best_q_start Pointer to start of the new alignment in query [in]
633 * @param best_q_end Pointer to end of the new alignment in query [in]
634 * @param best_s_start Pointer to start of the new alignment in subject [in]
635 * @param best_s_end Pointer to end of the new alignment in subject [in]
636 * @return TRUE if HSP is scheduled to be deleted.
637 */
638 static Boolean
s_UpdateReevaluatedHSPUngapped(BlastHSP * hsp,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end)639 s_UpdateReevaluatedHSPUngapped(BlastHSP* hsp, Int4 cutoff_score, Int4 score,
640 const Uint1* query_start, const Uint1* subject_start,
641 const Uint1* best_q_start, const Uint1* best_q_end,
642 const Uint1* best_s_start, const Uint1* best_s_end)
643 {
644 return
645 s_UpdateReevaluatedHSP(hsp, FALSE, cutoff_score, score, query_start,
646 subject_start, best_q_start, best_q_end,
647 best_s_start, best_s_end, 0, 0, 0);
648 }
649
650 Boolean
Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP * hsp,const Uint1 * query_start,const Uint1 * subject_start,const BlastInitialWordParameters * word_params,BlastScoreBlk * sbp,Boolean translated)651 Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP* hsp, const Uint1* query_start,
652 const Uint1* subject_start, const BlastInitialWordParameters* word_params,
653 BlastScoreBlk* sbp, Boolean translated)
654 {
655 Int4 sum, score;
656 Int4** matrix;
657 const Uint1* query,* subject;
658 const Uint1* best_q_start,* best_s_start,* best_q_end,* best_s_end;
659 const Uint1* current_q_start, * current_s_start;
660 Int4 index;
661 const Uint1 kResidueMask = (translated ? 0xff : 0x0f);
662 Int4 hsp_length = hsp->query.end - hsp->query.offset;
663 Int4 cutoff_score = word_params->cutoffs[hsp->context].cutoff_score;
664
665 matrix = sbp->matrix->data;
666
667 query = query_start + hsp->query.offset;
668 subject = subject_start + hsp->subject.offset;
669 score = 0;
670 sum = 0;
671 best_q_start = best_q_end = current_q_start = query;
672 best_s_start = best_s_end = current_s_start = subject;
673
674 for (index = 0; index < hsp_length; ++index) {
675 sum += matrix[*query & kResidueMask][*subject];
676 query++;
677 subject++;
678 if (sum < 0) {
679 /* Start from new offset */
680 sum = 0;
681 current_q_start = query;
682 current_s_start = subject;
683 /* If previous top score never reached the cutoff, discard the front
684 part of the alignment completely. Otherwise keep pointer to the
685 top-scoring front part. */
686 if (score < cutoff_score) {
687 best_q_start = best_q_end = query;
688 best_s_start = best_s_end = subject;
689 score = 0;
690 }
691 } else if (sum > score) {
692 /* Remember this point as the best scoring end point */
693 score = sum;
694 best_q_end = query;
695 best_s_end = subject;
696 /* Set start of alignment to the current start, dismissing the
697 previous top-scoring piece. */
698 best_q_start = current_q_start;
699 best_s_start = current_s_start;
700 }
701 }
702
703 /* Update HSP data. */
704 return
705 s_UpdateReevaluatedHSPUngapped(hsp, cutoff_score, score,
706 query_start, subject_start, best_q_start,
707 best_q_end, best_s_start, best_s_end);
708 }
709
710 /** Calculate number of identities in a regular HSP.
711 * @param query The query sequence [in]
712 * @param subject The uncompressed subject sequence [in]
713 * @param hsp All information about the HSP [in]
714 * @param num_ident_ptr Number of identities [out]
715 * @param align_length_ptr The alignment length, including gaps [out]
716 * @param sbp Blast score blk [in]
717 * @param num_pos_ptr Number of Positives [out]
718 * @return 0 on success, -1 on invalid parameters or error
719 */
720 static Int2
s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)721 s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query, const Uint1* subject,
722 const BlastHSP* hsp, Int4* num_ident_ptr,
723 Int4* align_length_ptr, const BlastScoreBlk* sbp,
724 Int4* num_pos_ptr)
725 {
726 Int4 i, num_ident, align_length, q_off, s_off;
727 Uint1* q,* s;
728 Int4 q_length = hsp->query.end - hsp->query.offset;
729 Int4 s_length = hsp->subject.end - hsp->subject.offset;
730 Int4** matrix = NULL;
731 Int4 num_pos = 0;
732
733 q_off = hsp->query.offset;
734 s_off = hsp->subject.offset;
735
736 if ( !subject || !query || !hsp )
737 return -1;
738
739 q = (Uint1*) &query[q_off];
740 s = (Uint1*) &subject[s_off];
741
742 num_ident = 0;
743 align_length = 0;
744
745 if(NULL != sbp)
746 {
747 if(sbp->protein_alphabet)
748 matrix = sbp->matrix->data;
749 }
750
751 if (!hsp->gap_info) {
752 /* Ungapped case. Check that lengths are the same in query and subject,
753 then count number of matches. */
754 if (q_length != s_length)
755 return -1;
756 align_length = q_length;
757 for (i=0; i<align_length; i++) {
758 if (*q == *s)
759 num_ident++;
760 else if (NULL != matrix) {
761 if (matrix[*q][*s] > 0)
762 num_pos ++;
763 }
764 q++;
765 s++;
766 }
767 }
768 else {
769 Int4 index;
770 GapEditScript* esp = hsp->gap_info;
771 for (index=0; index<esp->size; index++)
772 {
773 align_length += esp->num[index];
774 switch (esp->op_type[index]) {
775 case eGapAlignSub:
776 for (i=0; i<esp->num[index]; i++) {
777 if (*q == *s) {
778 num_ident++;
779 }
780 else if (NULL != matrix) {
781 if (matrix[*q][*s] > 0)
782 num_pos ++;
783 }
784 q++;
785 s++;
786 }
787 break;
788 case eGapAlignDel:
789 s += esp->num[index];
790 break;
791 case eGapAlignIns:
792 q += esp->num[index];
793 break;
794 default:
795 s += esp->num[index];
796 q += esp->num[index];
797 break;
798 }
799 }
800 }
801
802 if (align_length_ptr) {
803 *align_length_ptr = align_length;
804 }
805 *num_ident_ptr = num_ident;
806
807 if(NULL != matrix)
808 *num_pos_ptr = num_pos + num_ident;
809
810 return 0;
811 }
812
813 /** Calculate number of identities in an HSP for an out-of-frame alignment.
814 * @param query The query sequence [in]
815 * @param subject The uncompressed subject sequence [in]
816 * @param hsp All information about the HSP [in]
817 * @param program BLAST program (blastx or tblastn) [in]
818 * @param num_ident_ptr Number of identities [out]
819 * @param align_length_ptr The alignment length, including gaps [out]
820 * @param sbp Blast score blk [in]
821 * @param num_pos_ptr Number of Positives [out]
822 * @return 0 on success, -1 on invalid parameters or error
823 */
824 static Int2
s_Blast_HSPGetOOFNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,EBlastProgramType program,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)825 s_Blast_HSPGetOOFNumIdentitiesAndPositives(
826 const Uint1* query, const Uint1* subject,
827 const BlastHSP* hsp, EBlastProgramType program,
828 Int4* num_ident_ptr, Int4* align_length_ptr,
829 const BlastScoreBlk* sbp, Int4* num_pos_ptr)
830 {
831 Int4 num_ident, align_length;
832 Int4 index;
833 Uint1* q,* s;
834 GapEditScript* esp;
835 Int4 ** matrix = NULL;
836 Int4 num_pos = 0;
837
838 if (!hsp->gap_info || !subject || !query)
839 return -1;
840
841 if(NULL != sbp)
842 {
843 if(sbp->protein_alphabet)
844 matrix = sbp->matrix->data;
845 }
846
847 if (program == eBlastTypeTblastn ||
848 program == eBlastTypeRpsTblastn) {
849 q = (Uint1*) &query[hsp->query.offset];
850 s = (Uint1*) &subject[hsp->subject.offset];
851 } else {
852 s = (Uint1*) &query[hsp->query.offset];
853 q = (Uint1*) &subject[hsp->subject.offset];
854 }
855 num_ident = 0;
856 align_length = 0;
857
858 esp = hsp->gap_info;
859 for (index=0; index<esp->size; index++)
860 {
861 int i;
862 switch (esp->op_type[index]) {
863 case eGapAlignSub: /* Substitution */
864 align_length += esp->num[index];
865 for (i=0; i<esp->num[index]; i++) {
866 if (*q == *s)
867 num_ident++;
868 else if (NULL != matrix) {
869 if (matrix[*q][*s] > 0)
870 num_pos ++;
871 }
872 ++q;
873 s += CODON_LENGTH;
874 }
875 break;
876 case eGapAlignIns: /* Insertion */
877 align_length += esp->num[index];
878 s += esp->num[index] * CODON_LENGTH;
879 break;
880 case eGapAlignDel: /* Deletion */
881 align_length += esp->num[index];
882 q += esp->num[index];
883 break;
884 case eGapAlignDel2: /* Gap of two nucleotides. */
885 s -= 2;
886 break;
887 case eGapAlignDel1: /* Gap of one nucleotide. */
888 s -= 1;
889 break;
890 case eGapAlignIns1: /* Insertion of one nucleotide. */
891 s += 1;
892 break;
893 case eGapAlignIns2: /* Insertion of two nucleotides. */
894 s += 2;
895 break;
896 default:
897 s += esp->num[index] * CODON_LENGTH;
898 q += esp->num[index];
899 break;
900 }
901 }
902
903 if (align_length_ptr) {
904 *align_length_ptr = align_length;
905 }
906 *num_ident_ptr = num_ident;
907
908 if(NULL != matrix)
909 *num_pos_ptr = num_pos + num_ident;
910
911 return 0;
912 }
913
914 Int2
Blast_HSPGetNumIdentities(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr)915 Blast_HSPGetNumIdentities(const Uint1* query,
916 const Uint1* subject,
917 BlastHSP* hsp,
918 const BlastScoringOptions* score_options,
919 Int4* align_length_ptr)
920 {
921 Int2 retval = 0;
922
923 /* Calculate alignment length and number of identical letters.
924 Do not get the number of identities if the query is not available */
925 if (score_options->is_ooframe) {
926 retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
927 score_options->program_number,
928 &hsp->num_ident,
929 align_length_ptr,
930 NULL, NULL);
931 } else {
932 retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
933 &hsp->num_ident,
934 align_length_ptr,
935 NULL, NULL);
936 }
937
938 return retval;
939 }
940 Int2
Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr,const BlastScoreBlk * sbp)941 Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query,
942 const Uint1* subject,
943 BlastHSP* hsp,
944 const BlastScoringOptions* score_options,
945 Int4* align_length_ptr,
946 const BlastScoreBlk * sbp)
947 {
948 Int2 retval = 0;
949
950 /* Calculate alignment length and number of identical letters.
951 Do not get the number of identities if the query is not available */
952 if (score_options->is_ooframe) {
953 retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
954 score_options->program_number,
955 &hsp->num_ident,
956 align_length_ptr,
957 sbp, &hsp->num_positives);
958 } else {
959 retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
960 &hsp->num_ident,
961 align_length_ptr,
962 sbp, &hsp->num_positives);
963 }
964
965 return retval;
966 }
967
s_HSPTest(const BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)968 static Boolean s_HSPTest(const BlastHSP* hsp,
969 const BlastHitSavingOptions* hit_options,
970 Int4 align_length)
971 {
972 return ((hsp->num_ident * 100.0 <
973 align_length * hit_options->percent_identity) ||
974 align_length < hit_options->min_hit_length) ;
975
976 }
977
978 Boolean
Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,BlastHSP * hsp,const Uint1 * query,const Uint1 * subject,const BlastScoringOptions * score_options,const BlastHitSavingOptions * hit_options)979 Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,
980 BlastHSP* hsp, const Uint1* query, const Uint1* subject,
981 const BlastScoringOptions* score_options,
982 const BlastHitSavingOptions* hit_options)
983 {
984 Int4 align_length = 0;
985 Boolean delete_hsp = FALSE;
986 Int2 status = 0;
987
988 ASSERT(hsp && query && subject && score_options && hit_options);
989
990 status = Blast_HSPGetNumIdentities(query, subject, hsp, score_options,
991 &align_length);
992 ASSERT(status == 0);
993 (void)status; /* to pacify compiler warning */
994
995 /* Check whether this HSP passes the percent identity and minimal hit
996 length criteria, and delete it if it does not. */
997 delete_hsp = s_HSPTest(hsp, hit_options, align_length);
998
999 return delete_hsp;
1000 }
1001
Blast_HSPTest(BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)1002 Boolean Blast_HSPTest(BlastHSP* hsp,
1003 const BlastHitSavingOptions* hit_options,
1004 Int4 align_length)
1005 {
1006 return s_HSPTest(hsp, hit_options, align_length);
1007 }
1008
Blast_HSPGetQueryCoverage(const BlastHSP * hsp,Int4 query_length)1009 double Blast_HSPGetQueryCoverage(const BlastHSP* hsp, Int4 query_length)
1010 {
1011 double pct = 0;
1012 if(query_length > 0) {
1013 pct = 100.0 * (double) (hsp->query.end - hsp->query.offset)/ (double) query_length;
1014 if(pct < 99)
1015 pct +=0.5;
1016 }
1017 return pct;
1018 }
1019
Blast_HSPQueryCoverageTest(BlastHSP * hsp,double min_query_coverage_pct,Int4 query_length)1020 Boolean Blast_HSPQueryCoverageTest(BlastHSP* hsp,
1021 double min_query_coverage_pct,
1022 Int4 query_length)
1023 {
1024 double hsp_coverage = Blast_HSPGetQueryCoverage( hsp, query_length);
1025 return (hsp_coverage < min_query_coverage_pct);
1026 }
1027
1028
1029 void
Blast_HSPCalcLengthAndGaps(const BlastHSP * hsp,Int4 * length_out,Int4 * gaps_out,Int4 * gap_opens_out)1030 Blast_HSPCalcLengthAndGaps(const BlastHSP* hsp, Int4* length_out,
1031 Int4* gaps_out, Int4* gap_opens_out)
1032 {
1033 Int4 length = hsp->query.end - hsp->query.offset;
1034 Int4 s_length = hsp->subject.end - hsp->subject.offset;
1035 Int4 gap_opens = 0, gaps = 0;
1036
1037 if (hsp->gap_info) {
1038 GapEditScript* esp = hsp->gap_info;
1039 Int4 index;
1040 for (index=0; index<esp->size; index++) {
1041 if (esp->op_type[index] == eGapAlignDel) {
1042 length += esp->num[index];
1043 gaps += esp->num[index];
1044 ++gap_opens;
1045 } else if (esp->op_type[index] == eGapAlignIns) {
1046 ++gap_opens;
1047 gaps += esp->num[index];
1048 }
1049 }
1050 } else if (s_length > length) {
1051 length = s_length;
1052 }
1053
1054 *length_out = length;
1055 *gap_opens_out = gap_opens;
1056 *gaps_out = gaps;
1057 }
1058
1059 /** Adjust start and end of an HSP in a translated sequence segment.
1060 * @param segment BlastSeg structure (part of BlastHSP) [in]
1061 * @param seq_length Length of the full sequence [in]
1062 * @param start Start of the alignment in this segment in nucleotide
1063 * coordinates, 1-offset [out]
1064 * @param end End of the alignment in this segment in nucleotide
1065 * coordinates, 1-offset [out]
1066 */
1067 static void
s_BlastSegGetTranslatedOffsets(const BlastSeg * segment,Int4 seq_length,Int4 * start,Int4 * end)1068 s_BlastSegGetTranslatedOffsets(const BlastSeg* segment, Int4 seq_length,
1069 Int4* start, Int4* end)
1070 {
1071 if (segment->frame < 0) {
1072 *start = seq_length - CODON_LENGTH*segment->offset + segment->frame;
1073 *end = seq_length - CODON_LENGTH*segment->end + segment->frame + 1;
1074 } else if (segment->frame > 0) {
1075 *start = CODON_LENGTH*(segment->offset) + segment->frame - 1;
1076 *end = CODON_LENGTH*segment->end + segment->frame - 2;
1077 } else {
1078 *start = segment->offset + 1;
1079 *end = segment->end;
1080 }
1081 }
1082
1083 void
Blast_HSPGetAdjustedOffsets(EBlastProgramType program,BlastHSP * hsp,Int4 query_length,Int4 subject_length,Int4 * q_start,Int4 * q_end,Int4 * s_start,Int4 * s_end)1084 Blast_HSPGetAdjustedOffsets(EBlastProgramType program, BlastHSP* hsp,
1085 Int4 query_length, Int4 subject_length,
1086 Int4* q_start, Int4* q_end,
1087 Int4* s_start, Int4* s_end)
1088 {
1089 if (!hsp->gap_info) {
1090 *q_start = hsp->query.offset + 1;
1091 *q_end = hsp->query.end;
1092 *s_start = hsp->subject.offset + 1;
1093 *s_end = hsp->subject.end;
1094 return;
1095 }
1096
1097 if (!Blast_QueryIsTranslated(program) &&
1098 !Blast_SubjectIsTranslated(program)) {
1099 if (hsp->query.frame != hsp->subject.frame) {
1100 /* Blastn: if different strands, flip offsets in query; leave
1101 offsets in subject as they are, but change order for correct
1102 correspondence. */
1103 *q_end = query_length - hsp->query.offset;
1104 *q_start = *q_end - hsp->query.end + hsp->query.offset + 1;
1105 *s_end = hsp->subject.offset + 1;
1106 *s_start = hsp->subject.end;
1107 } else {
1108 *q_start = hsp->query.offset + 1;
1109 *q_end = hsp->query.end;
1110 *s_start = hsp->subject.offset + 1;
1111 *s_end = hsp->subject.end;
1112 }
1113 } else {
1114 s_BlastSegGetTranslatedOffsets(&hsp->query, query_length, q_start, q_end);
1115 s_BlastSegGetTranslatedOffsets(&hsp->subject, subject_length,
1116 q_start, s_end);
1117 }
1118 }
1119
1120
1121 const Uint1*
Blast_HSPGetTargetTranslation(SBlastTargetTranslation * target_t,const BlastHSP * hsp,Int4 * translated_length)1122 Blast_HSPGetTargetTranslation(SBlastTargetTranslation* target_t,
1123 const BlastHSP* hsp, Int4* translated_length)
1124 {
1125 Int4 context = -1;
1126 Int4 start, stop;
1127
1128 ASSERT(target_t != NULL);
1129
1130 if (hsp == NULL)
1131 return NULL;
1132
1133 context = BLAST_FrameToContext(hsp->subject.frame, target_t->program_number);
1134 start = target_t->range[2*context];
1135 stop = target_t->range[2*context+1];
1136
1137 /* skip translation if full translation has already been done */
1138 if (target_t->partial && (start ||
1139 (stop < target_t->subject_blk->length / CODON_LENGTH -3)))
1140 {
1141 const int kMaxTranslation = 99; /* Needs to be divisible by three (?) */
1142 Int4 nucl_length = 0;
1143 Int4 translation_length = 0;
1144 Int4 nucl_start = 0;
1145 Int4 nucl_end = 0;
1146 Int4 nucl_shift = 0;
1147 Int4 start_shift = 0;
1148
1149 /* HSP coordinates are in terms of protein sequences. */
1150 if (hsp->subject.offset < 0) {
1151 nucl_start = 0;
1152 nucl_end = target_t->subject_blk->length;
1153 } else {
1154 nucl_start = MAX(0, 3*hsp->subject.offset - kMaxTranslation);
1155 nucl_end = MIN(target_t->subject_blk->length, 3*hsp->subject.end + kMaxTranslation);
1156 /* extend to the end of the sequence if close */
1157 if (target_t->subject_blk->length - nucl_end <= 21) {
1158 nucl_end = target_t->subject_blk->length;
1159 }
1160 }
1161
1162 nucl_length = nucl_end - nucl_start;
1163
1164 translation_length = 1+nucl_length/CODON_LENGTH;
1165 start_shift = nucl_start/CODON_LENGTH;
1166
1167 if (hsp->subject.frame < 0)
1168 nucl_shift = target_t->subject_blk->length - nucl_start - nucl_length;
1169 else
1170 nucl_shift = nucl_start;
1171
1172 if (start_shift < start || start_shift+translation_length > stop) {
1173 /* needs re-translation */
1174 Int4 length = 0; /* actual translation length */
1175 Uint1* nucl_seq = target_t->subject_blk->sequence + nucl_shift;
1176 Uint1* nucl_seq_rev = NULL;
1177
1178 target_t->range[2*context] = start_shift;
1179
1180 if (translation_length > stop-start) {
1181 /* needs re-allocation */
1182 sfree(target_t->translations[context]);
1183 target_t->translations[context] = (Uint1*) malloc(translation_length+2);
1184 }
1185
1186 if (hsp->subject.frame < 0) {
1187 /* needs reverse sequence */
1188 GetReverseNuclSequence(nucl_seq, nucl_length, &nucl_seq_rev);
1189 }
1190
1191 length = BLAST_GetTranslation(nucl_seq, nucl_seq_rev,
1192 nucl_length, hsp->subject.frame, target_t->translations[context],
1193 target_t->gen_code_string);
1194
1195 target_t->range[2*context+1] = start_shift + length;
1196
1197 sfree(nucl_seq_rev);
1198
1199 /* partial translation needs to be fenced */
1200 if(hsp->subject.offset >= 0) {
1201 target_t->translations[context][0] = FENCE_SENTRY;
1202 target_t->translations[context][length+1] = FENCE_SENTRY;
1203 }
1204 }
1205 }
1206 if (translated_length)
1207 *translated_length = target_t->range[2*context+1];
1208
1209 /* +1 as the first byte is a sentinel. */
1210 return target_t->translations[context] - target_t->range[2*context] + 1;
1211 }
1212
1213 Int2
Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk * subject_blk,BlastHSP * hsp,Boolean is_ooframe,const Uint1 * gen_code_string,Uint1 ** translation_buffer_ptr,Uint1 ** subject_ptr,Int4 * subject_length_ptr,Int4 * start_shift_ptr)1214 Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk* subject_blk,
1215 BlastHSP* hsp,
1216 Boolean is_ooframe,
1217 const Uint1* gen_code_string,
1218 Uint1** translation_buffer_ptr,
1219 Uint1** subject_ptr,
1220 Int4* subject_length_ptr,
1221 Int4* start_shift_ptr)
1222 {
1223 Int4 translation_length;
1224 Uint1* translation_buffer;
1225 Uint1* subject;
1226 Int4 start_shift;
1227 Int4 nucl_shift;
1228 Int2 status = 0;
1229
1230 ASSERT(subject_blk && hsp && gen_code_string && translation_buffer_ptr &&
1231 subject_ptr && subject_length_ptr && start_shift_ptr);
1232
1233 translation_buffer = *translation_buffer_ptr;
1234 sfree(translation_buffer);
1235
1236 if (!is_ooframe) {
1237 start_shift =
1238 MAX(0, 3*hsp->subject.offset - MAX_FULL_TRANSLATION);
1239 translation_length =
1240 MIN(3*hsp->subject.end + MAX_FULL_TRANSLATION,
1241 subject_blk->length) - start_shift;
1242 if (hsp->subject.frame > 0) {
1243 nucl_shift = start_shift;
1244 } else {
1245 nucl_shift = subject_blk->length - start_shift - translation_length;
1246 }
1247 status = (Int2)
1248 Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1249 translation_length, hsp->subject.frame,
1250 gen_code_string, &translation_buffer,
1251 subject_length_ptr, NULL);
1252 /* Below, the start_shift will be used for the protein
1253 coordinates, so need to divide it by 3 */
1254 start_shift /= CODON_LENGTH;
1255 } else {
1256 Int4 oof_end;
1257 oof_end = subject_blk->length;
1258
1259 start_shift =
1260 MAX(0, hsp->subject.offset - MAX_FULL_TRANSLATION);
1261 translation_length =
1262 MIN(hsp->subject.end + MAX_FULL_TRANSLATION, oof_end) - start_shift;
1263 if (hsp->subject.frame > 0) {
1264 nucl_shift = start_shift;
1265 } else {
1266 nucl_shift = oof_end - start_shift - translation_length;
1267 }
1268 status = (Int2)
1269 Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1270 translation_length, hsp->subject.frame,
1271 gen_code_string, NULL,
1272 subject_length_ptr, &translation_buffer);
1273 }
1274 hsp->subject.offset -= start_shift;
1275 hsp->subject.end -= start_shift;
1276 hsp->subject.gapped_start -= start_shift;
1277 *translation_buffer_ptr = translation_buffer;
1278 *start_shift_ptr = start_shift;
1279
1280 if (!is_ooframe) {
1281 subject = translation_buffer + 1;
1282 } else {
1283 subject = translation_buffer + CODON_LENGTH;
1284 }
1285 *subject_ptr = subject;
1286
1287 return status;
1288 }
1289
1290 void
Blast_HSPAdjustSubjectOffset(BlastHSP * hsp,Int4 start_shift)1291 Blast_HSPAdjustSubjectOffset(BlastHSP* hsp, Int4 start_shift)
1292 {
1293 /* Adjust subject offsets if shifted (partial) sequence was used
1294 for extension */
1295 if (start_shift > 0) {
1296 hsp->subject.offset += start_shift;
1297 hsp->subject.end += start_shift;
1298 hsp->subject.gapped_start += start_shift;
1299 }
1300
1301 return;
1302 }
1303
1304 int
ScoreCompareHSPs(const void * h1,const void * h2)1305 ScoreCompareHSPs(const void* h1, const void* h2)
1306 {
1307 BlastHSP* hsp1,* hsp2; /* the HSPs to be compared */
1308 int result = 0; /* the result of the comparison */
1309
1310 hsp1 = *((BlastHSP**) h1);
1311 hsp2 = *((BlastHSP**) h2);
1312
1313 /* Null HSPs are "greater" than any non-null ones, so they go to the end
1314 of a sorted list. */
1315 if (!hsp1 && !hsp2)
1316 return 0;
1317 else if (!hsp1)
1318 return 1;
1319 else if (!hsp2)
1320 return -1;
1321
1322 if (0 == (result = BLAST_CMP(hsp2->score, hsp1->score)) &&
1323 0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
1324 0 == (result = BLAST_CMP(hsp2->subject.end, hsp1->subject.end)) &&
1325 0 == (result = BLAST_CMP(hsp1->query .offset, hsp2->query .offset))) {
1326 /* if all other test can't distinguish the HSPs, then the final
1327 test is the result */
1328 result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
1329 }
1330 return result;
1331 }
1332
Blast_HSPListIsSortedByScore(const BlastHSPList * hsp_list)1333 Boolean Blast_HSPListIsSortedByScore(const BlastHSPList* hsp_list)
1334 {
1335 Int4 index;
1336
1337 if (!hsp_list || hsp_list->hspcnt <= 1)
1338 return TRUE;
1339
1340 for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1341 if (ScoreCompareHSPs(&hsp_list->hsp_array[index],
1342 &hsp_list->hsp_array[index+1]) > 0) {
1343 return FALSE;
1344 }
1345 }
1346 return TRUE;
1347 }
1348
Blast_HSPListSortByScore(BlastHSPList * hsp_list)1349 void Blast_HSPListSortByScore(BlastHSPList* hsp_list)
1350 {
1351 if (!hsp_list || hsp_list->hspcnt <= 1)
1352 return;
1353
1354 if (!Blast_HSPListIsSortedByScore(hsp_list)) {
1355 qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1356 ScoreCompareHSPs);
1357 }
1358 }
1359
1360 /** Compares 2 evalues, consider them equal if both are close enough to zero.
1361 * @param evalue1 First evalue [in]
1362 * @param evalue2 Second evalue [in]
1363 */
1364 static int
s_EvalueComp(double evalue1,double evalue2)1365 s_EvalueComp(double evalue1, double evalue2)
1366 {
1367 const double epsilon = 1.0e-180;
1368 if (evalue1 < epsilon && evalue2 < epsilon) {
1369 return 0;
1370 }
1371
1372 if (evalue1 < evalue2) {
1373 return -1;
1374 } else if (evalue1 > evalue2) {
1375 return 1;
1376 } else {
1377 return 0;
1378 }
1379 }
1380
1381 /** Comparison callback function for sorting HSPs by e-value and score, before
1382 * saving BlastHSPList in a BlastHitList. E-value has priority over score,
1383 * because lower scoring HSPs might have lower e-values, if they are linked
1384 * with sum statistics.
1385 * E-values are compared only up to a certain precision.
1386 * @param v1 Pointer to first HSP [in]
1387 * @param v2 Pointer to second HSP [in]
1388 */
1389 static int
s_EvalueCompareHSPs(const void * v1,const void * v2)1390 s_EvalueCompareHSPs(const void* v1, const void* v2)
1391 {
1392 BlastHSP* h1,* h2;
1393 int retval = 0;
1394
1395 h1 = *((BlastHSP**) v1);
1396 h2 = *((BlastHSP**) v2);
1397
1398 /* Check if one or both of these are null. Those HSPs should go to the end */
1399 if (!h1 && !h2)
1400 return 0;
1401 else if (!h1)
1402 return 1;
1403 else if (!h2)
1404 return -1;
1405
1406 if ((retval = s_EvalueComp(h1->evalue, h2->evalue)) != 0)
1407 return retval;
1408
1409 return ScoreCompareHSPs(v1, v2);
1410 }
1411
Blast_HSPListSortByEvalue(BlastHSPList * hsp_list)1412 void Blast_HSPListSortByEvalue(BlastHSPList* hsp_list)
1413 {
1414 if (!hsp_list)
1415 return;
1416
1417 if (hsp_list->hspcnt > 1) {
1418 Int4 index;
1419 BlastHSP** hsp_array = hsp_list->hsp_array;
1420 /* First check if HSP array is already sorted. */
1421 for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1422 if (s_EvalueCompareHSPs(&hsp_array[index], &hsp_array[index+1]) > 0) {
1423 break;
1424 }
1425 }
1426 /* Sort the HSP array if it is not sorted yet. */
1427 if (index < hsp_list->hspcnt - 1) {
1428 qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1429 s_EvalueCompareHSPs);
1430 }
1431 }
1432 }
1433
1434
1435 /** Retrieve the starting diagonal of an HSP
1436 * @param hsp The target HSP
1437 * @return The starting diagonal
1438 */
1439 static Int4
s_HSPStartDiag(const BlastHSP * hsp)1440 s_HSPStartDiag(const BlastHSP *hsp)
1441 {
1442 return hsp->query.offset - hsp->subject.offset;
1443 }
1444
1445 /** Retrieve the ending diagonal of an HSP
1446 * @param hsp The target HSP
1447 * @return The ending diagonal
1448 */
1449 static Int4
s_HSPEndDiag(const BlastHSP * hsp)1450 s_HSPEndDiag(const BlastHSP *hsp)
1451 {
1452 return hsp->query.end - hsp->subject.end;
1453 }
1454
1455 /** Given two hits, check if the hits can be merged and do
1456 * the merge if so. Hits must not contain traceback
1457 * @param hsp1 The first hit. If merging happens, this hit is
1458 * overwritten with the merged version [in][out]
1459 * @param hsp2 The second hit [in]
1460 * @return TRUE if a merge was performed, FALSE if not
1461 */
1462 static Boolean
s_BlastMergeTwoHSPs(BlastHSP * hsp1,BlastHSP * hsp2,Boolean allow_gap)1463 s_BlastMergeTwoHSPs(BlastHSP* hsp1, BlastHSP* hsp2, Boolean allow_gap)
1464 {
1465 ASSERT(!hsp1->gap_info || !hsp2->gap_info);
1466
1467 /* do not merge off-diagonal hsps for ungapped search */
1468 if (!allow_gap &&
1469 hsp1->subject.offset - hsp2->subject.offset -hsp1->query.offset + hsp2->query.offset)
1470 {
1471 return FALSE;
1472 }
1473
1474 if(hsp1->subject.frame != hsp2->subject.frame)
1475 return FALSE;
1476
1477 /* combine the boundaries of the two HSPs,
1478 assuming they intersect at all */
1479 if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1480 hsp2->query.offset,
1481 hsp1->subject.offset, hsp1->subject.end,
1482 hsp2->subject.offset) ||
1483 CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1484 hsp2->query.end,
1485 hsp1->subject.offset, hsp1->subject.end,
1486 hsp2->subject.end)) {
1487
1488 double score_density = (hsp1->score + hsp2->score) *(1.0) /
1489 ((hsp1->query.end - hsp1->query.offset) +
1490 (hsp2->query.end - hsp2->query.offset));
1491 hsp1->query.offset = MIN(hsp1->query.offset, hsp2->query.offset);
1492 hsp1->subject.offset = MIN(hsp1->subject.offset, hsp2->subject.offset);
1493 hsp1->query.end = MAX(hsp1->query.end, hsp2->query.end);
1494 hsp1->subject.end = MAX(hsp1->subject.end, hsp2->subject.end);
1495 if (hsp2->score > hsp1->score) {
1496 hsp1->query.gapped_start = hsp2->query.gapped_start;
1497 hsp1->subject.gapped_start = hsp2->subject.gapped_start;
1498 hsp1->score = hsp2->score;
1499 }
1500
1501 hsp1->score = MAX((int) (score_density *(hsp1->query.end - hsp1->query.offset)), hsp1->score);
1502 return TRUE;
1503 }
1504
1505 return FALSE;
1506 }
1507
1508 /** Maximal diagonal distance between HSP starting offsets, within which HSPs
1509 * from search of different chunks of subject sequence are considered for
1510 * merging.
1511 */
1512 #define OVERLAP_DIAG_CLOSE 10
1513 /********************************************************************************
1514 Functions manipulating BlastHSPList's
1515 ********************************************************************************/
1516
Blast_HSPListFree(BlastHSPList * hsp_list)1517 BlastHSPList* Blast_HSPListFree(BlastHSPList* hsp_list)
1518 {
1519 Int4 index;
1520
1521 if (!hsp_list)
1522 return hsp_list;
1523
1524 for (index = 0; index < hsp_list->hspcnt; ++index) {
1525 Blast_HSPFree(hsp_list->hsp_array[index]);
1526 }
1527 sfree(hsp_list->hsp_array);
1528
1529 sfree(hsp_list);
1530 return NULL;
1531 }
1532
Blast_HSPListNew(Int4 hsp_max)1533 BlastHSPList* Blast_HSPListNew(Int4 hsp_max)
1534 {
1535 BlastHSPList* hsp_list = (BlastHSPList*) calloc(1, sizeof(BlastHSPList));
1536 const Int4 kDefaultAllocated=100;
1537
1538 /* hsp_max is max number of HSP's allowed in an HSP list;
1539 INT4_MAX taken as infinity. */
1540 hsp_list->hsp_max = INT4_MAX;
1541 if (hsp_max > 0)
1542 hsp_list->hsp_max = hsp_max;
1543
1544 hsp_list->allocated = MIN(kDefaultAllocated, hsp_list->hsp_max);
1545
1546 hsp_list->hsp_array = (BlastHSP**)
1547 calloc(hsp_list->allocated, sizeof(BlastHSP*));
1548
1549 return hsp_list;
1550 }
1551
1552 Boolean
Blast_HSPList_IsEmpty(const BlastHSPList * hsp_list)1553 Blast_HSPList_IsEmpty(const BlastHSPList* hsp_list)
1554 {
1555 return (hsp_list && hsp_list->hspcnt == 0) ? TRUE : FALSE;
1556 }
1557
BlastHSPListDup(const BlastHSPList * hsp_list)1558 BlastHSPList* BlastHSPListDup(const BlastHSPList* hsp_list)
1559 {
1560 BlastHSPList * rv = 0;
1561
1562 if (hsp_list) {
1563 int index = 0;
1564 int num = hsp_list->hspcnt;
1565
1566 rv = malloc(sizeof(BlastHSPList));
1567 *rv = *hsp_list;
1568
1569 if (num) {
1570 rv->hsp_array = malloc(sizeof(BlastHSP*) * num);
1571
1572 for(index = 0; index < hsp_list->hspcnt; ++index) {
1573 BlastHSP * h = hsp_list->hsp_array[index];
1574 BlastHSP ** h2 = & rv->hsp_array[index];
1575
1576 if (h) {
1577 *h2 = malloc(sizeof(BlastHSP));
1578 **h2 = *h;
1579 } else {
1580 *h2 = 0;
1581 }
1582 }
1583 }
1584 }
1585
1586 return rv;
1587 }
1588
Blast_HSPListSwap(BlastHSPList * list1,BlastHSPList * list2)1589 void Blast_HSPListSwap(BlastHSPList* list1, BlastHSPList* list2)
1590 {
1591 BlastHSPList tmp;
1592
1593 tmp = *list1;
1594 *list1 = *list2;
1595 *list2 = tmp;
1596 }
1597
1598 /** This is a copy of a static function from ncbimisc.c.
1599 * Turns array into a heap with respect to a given comparison function.
1600 */
1601 static void
s_Heapify(char * base0,char * base,char * lim,char * last,size_t width,int (* compar)(const void *,const void *))1602 s_Heapify (char* base0, char* base, char* lim, char* last, size_t width, int (*compar )(const void*, const void* ))
1603 {
1604 size_t i;
1605 char ch;
1606 char* left_son,* large_son;
1607
1608 left_son = base0 + 2*(base-base0) + width;
1609 while (base <= lim) {
1610 if (left_son == last)
1611 large_son = left_son;
1612 else
1613 large_son = (*compar)(left_son, left_son+width) >= 0 ?
1614 left_son : left_son+width;
1615 if ((*compar)(base, large_son) < 0) {
1616 for (i=0; i<width; ++i) {
1617 ch = base[i];
1618 base[i] = large_son[i];
1619 large_son[i] = ch;
1620 }
1621 base = large_son;
1622 left_son = base0 + 2*(base-base0) + width;
1623 } else
1624 break;
1625 }
1626 }
1627
1628 /** Creates a heap of elements based on a comparison function.
1629 * @param b An array [in] [out]
1630 * @param nel Number of elements in b [in]
1631 * @param width The size of each element [in]
1632 * @param compar Callback to compare two heap elements [in]
1633 */
1634 static void
s_CreateHeap(void * b,size_t nel,size_t width,int (* compar)(const void *,const void *))1635 s_CreateHeap (void* b, size_t nel, size_t width,
1636 int (*compar )(const void*, const void* ))
1637 {
1638 char* base = (char*)b;
1639 size_t i;
1640 char* base0 = (char*)base,* lim,* basef;
1641
1642 if (nel < 2)
1643 return;
1644
1645 lim = &base[((nel-2)/2)*width];
1646 basef = &base[(nel-1)*width];
1647 i = nel/2;
1648 for (base = &base0[(i - 1)*width]; i > 0; base = base - width) {
1649 s_Heapify(base0, base, lim, basef, width, compar);
1650 i--;
1651 }
1652 }
1653
1654 /** Given a BlastHSPList* with a heapified HSP array, check whether the
1655 * new HSP is better than the worst scoring. If it is, then remove the
1656 * worst scoring and insert, otherwise free the new one.
1657 * HSP and insert the new HSP in the heap.
1658 * @param hsp_list Contains all HSPs for a given subject. [in] [out]
1659 * @param hsp A pointer to new HSP to be inserted into the HSP list [in] [out]
1660 */
1661 static void
s_BlastHSPListInsertHSPInHeap(BlastHSPList * hsp_list,BlastHSP ** hsp)1662 s_BlastHSPListInsertHSPInHeap(BlastHSPList* hsp_list,
1663 BlastHSP** hsp)
1664 {
1665 BlastHSP** hsp_array = hsp_list->hsp_array;
1666 if (ScoreCompareHSPs(hsp, &hsp_array[0]) > 0)
1667 {
1668 Blast_HSPFree(*hsp);
1669 return;
1670 }
1671 else
1672 Blast_HSPFree(hsp_array[0]);
1673
1674 hsp_array[0] = *hsp;
1675 if (hsp_list->hspcnt >= 2) {
1676 s_Heapify((char*)hsp_array, (char*)hsp_array,
1677 (char*)&hsp_array[hsp_list->hspcnt/2 - 1],
1678 (char*)&hsp_array[hsp_list->hspcnt-1],
1679 sizeof(BlastHSP*), ScoreCompareHSPs);
1680 }
1681 }
1682
1683 #ifndef NDEBUG
1684 /** Verifies that the best_evalue field on the BlastHSPList is correct.
1685 * @param hsp_list object to check [in]
1686 * @return TRUE if OK, FALSE otherwise.
1687 */
1688 static Boolean
s_BlastCheckBestEvalue(const BlastHSPList * hsp_list)1689 s_BlastCheckBestEvalue(const BlastHSPList* hsp_list)
1690 {
1691 int index = 0;
1692 double best_evalue = (double) INT4_MAX;
1693 const double kDelta = 1.0e-200;
1694
1695 /* There are no HSP's here. */
1696 if (hsp_list->hspcnt == 0)
1697 return TRUE;
1698
1699 for (index=0; index<hsp_list->hspcnt; index++)
1700 best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1701
1702 /* check that it's within 1%. */
1703 if (ABS(best_evalue-hsp_list->best_evalue)/(best_evalue+kDelta) > 0.01)
1704 return FALSE;
1705
1706 return TRUE;
1707 }
1708 #endif /* _DEBUG */
1709
1710 /** Gets the best (lowest) evalue from the BlastHSPList.
1711 * @param hsp_list object containing the evalues [in]
1712 * @return TRUE if OK, FALSE otherwise.
1713 */
1714 static double
s_BlastGetBestEvalue(const BlastHSPList * hsp_list)1715 s_BlastGetBestEvalue(const BlastHSPList* hsp_list)
1716 {
1717 int index = 0;
1718 double best_evalue = (double) INT4_MAX;
1719
1720 for (index=0; index<hsp_list->hspcnt; index++)
1721 best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1722
1723 return best_evalue;
1724 }
1725
1726 /* Comments in blast_hits.h
1727 */
1728 Int2
Blast_HSPListSaveHSP(BlastHSPList * hsp_list,BlastHSP * new_hsp)1729 Blast_HSPListSaveHSP(BlastHSPList* hsp_list, BlastHSP* new_hsp)
1730 {
1731 BlastHSP** hsp_array;
1732 Int4 hspcnt;
1733 Int4 hsp_allocated; /* how many hsps are in the array. */
1734 Int2 status = 0;
1735
1736 hspcnt = hsp_list->hspcnt;
1737 hsp_allocated = hsp_list->allocated;
1738 hsp_array = hsp_list->hsp_array;
1739
1740
1741 /* Check if list is already full, then reallocate. */
1742 if (hspcnt >= hsp_allocated && hsp_list->do_not_reallocate == FALSE)
1743 {
1744 Int4 new_allocated = MIN(2*hsp_list->allocated, hsp_list->hsp_max);
1745 if (new_allocated > hsp_list->allocated) {
1746 hsp_array = (BlastHSP**)
1747 realloc(hsp_array, new_allocated*sizeof(BlastHSP*));
1748 if (hsp_array == NULL)
1749 {
1750 hsp_list->do_not_reallocate = TRUE;
1751 hsp_array = hsp_list->hsp_array;
1752 /** Return a non-zero status, because restriction on number
1753 of HSPs here is a result of memory allocation failure. */
1754 status = -1;
1755 } else {
1756 hsp_list->hsp_array = hsp_array;
1757 hsp_list->allocated = new_allocated;
1758 hsp_allocated = new_allocated;
1759 }
1760 } else {
1761 hsp_list->do_not_reallocate = TRUE;
1762 }
1763 /* If it is the first time when the HSP array is filled to capacity,
1764 create a heap now. */
1765 if (hsp_list->do_not_reallocate) {
1766 s_CreateHeap(hsp_array, hspcnt, sizeof(BlastHSP*), ScoreCompareHSPs);
1767 }
1768 }
1769
1770 /* If there is space in the allocated HSP array, simply save the new HSP.
1771 Othewise, if the new HSP has lower score than the worst HSP in the heap,
1772 then delete it, else insert it in the heap. */
1773 if (hspcnt < hsp_allocated)
1774 {
1775 hsp_array[hsp_list->hspcnt] = new_hsp;
1776 (hsp_list->hspcnt)++;
1777 return status;
1778 } else {
1779 /* Insert the new HSP in heap. */
1780 s_BlastHSPListInsertHSPInHeap(hsp_list, &new_hsp);
1781 }
1782
1783 return status;
1784 }
1785
Blast_HSPListGetEvalues(EBlastProgramType program_number,const BlastQueryInfo * query_info,Int4 subject_length,BlastHSPList * hsp_list,Boolean gapped_calculation,Boolean RPS_prelim,const BlastScoreBlk * sbp,double gap_decay_rate,double scaling_factor)1786 Int2 Blast_HSPListGetEvalues(EBlastProgramType program_number,
1787 const BlastQueryInfo* query_info,
1788 Int4 subject_length,
1789 BlastHSPList* hsp_list,
1790 Boolean gapped_calculation,
1791 Boolean RPS_prelim,
1792 const BlastScoreBlk* sbp, double gap_decay_rate,
1793 double scaling_factor)
1794 {
1795 BlastHSP* hsp;
1796 BlastHSP** hsp_array;
1797 Blast_KarlinBlk** kbp;
1798 Int4 hsp_cnt;
1799 Int4 index;
1800 Int4 kbp_context;
1801 double gap_decay_divisor = 1.;
1802 Boolean isRPS = Blast_ProgramIsRpsBlast(program_number);
1803
1804 if (hsp_list == NULL || hsp_list->hspcnt == 0)
1805 return 0;
1806
1807 kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1808 hsp_cnt = hsp_list->hspcnt;
1809 hsp_array = hsp_list->hsp_array;
1810
1811 if (gap_decay_rate != 0.)
1812 gap_decay_divisor = BLAST_GapDecayDivisor(gap_decay_rate, 1);
1813
1814 for (index=0; index<hsp_cnt; index++) {
1815 hsp = hsp_array[index];
1816
1817 ASSERT(hsp != NULL);
1818 ASSERT(scaling_factor != 0.0);
1819 ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1820
1821 /* Divide Lambda by the scaling factor, so e-value is
1822 calculated correctly from a scaled score. This is needed only
1823 for RPS BLAST, where scores are scaled, but Lambda is not. */
1824 kbp_context = hsp->context;
1825 if (RPS_prelim) {
1826 /* All kbp in preliminary stage are equivalent. However, some
1827 may be invalid. Search for the first populated kbp */
1828 int i;
1829 for (i=0; i < sbp->number_of_contexts; ++i) {
1830 if (kbp[i]) break;
1831 }
1832 kbp_context = i;
1833 }
1834 ASSERT(kbp[kbp_context]);
1835 kbp[kbp_context]->Lambda /= scaling_factor;
1836
1837 if (sbp->gbp) {
1838 /* Only try Spouge's method if gumbel parameters are available */
1839 if (!isRPS) {
1840 hsp->evalue =
1841 BLAST_SpougeStoE(hsp->score, kbp[kbp_context], sbp->gbp,
1842 query_info->contexts[hsp->context].query_length,
1843 subject_length);
1844 } else {
1845 /* for RPS blast, query and subject is swapped */
1846 hsp->evalue =
1847 BLAST_SpougeStoE(hsp->score, kbp[kbp_context], sbp->gbp,
1848 subject_length,
1849 query_info->contexts[hsp->context].query_length);
1850 }
1851 } else {
1852 /* Get effective search space from the query information block */
1853 hsp->evalue =
1854 BLAST_KarlinStoE_simple(hsp->score, kbp[kbp_context],
1855 query_info->contexts[hsp->context].eff_searchsp);
1856 }
1857
1858 hsp->evalue /= gap_decay_divisor;
1859 /* Put back the unscaled value of Lambda. */
1860 kbp[kbp_context]->Lambda *= scaling_factor;
1861 }
1862
1863 /* Assign the best e-value field. Here the best e-value will always be
1864 attained for the first HSP in the list. Check that the incoming
1865 HSP list is properly sorted by score. */
1866 ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1867 hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1868
1869 return 0;
1870 }
1871
Blast_HSPListGetBitScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)1872 Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list,
1873 Boolean gapped_calculation,
1874 const BlastScoreBlk* sbp)
1875 {
1876 BlastHSP* hsp;
1877 Blast_KarlinBlk** kbp;
1878 Int4 index;
1879
1880 if (hsp_list == NULL)
1881 return 1;
1882
1883 kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1884
1885 for (index=0; index<hsp_list->hspcnt; index++) {
1886 hsp = hsp_list->hsp_array[index];
1887 ASSERT(hsp != NULL);
1888 ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1889 hsp->bit_score =
1890 (hsp->score*kbp[hsp->context]->Lambda - kbp[hsp->context]->logK) /
1891 NCBIMATH_LN2;
1892 }
1893
1894 return 0;
1895 }
1896
Blast_HSPListPHIGetBitScores(BlastHSPList * hsp_list,BlastScoreBlk * sbp)1897 void Blast_HSPListPHIGetBitScores(BlastHSPList* hsp_list, BlastScoreBlk* sbp)
1898 {
1899 Int4 index;
1900
1901 double lambda, logC;
1902
1903 ASSERT(sbp && sbp->kbp_gap && sbp->kbp_gap[0]);
1904
1905 lambda = sbp->kbp_gap[0]->Lambda;
1906 logC = log(sbp->kbp_gap[0]->paramC);
1907
1908 for (index=0; index<hsp_list->hspcnt; index++) {
1909 BlastHSP* hsp = hsp_list->hsp_array[index];
1910 ASSERT(hsp != NULL);
1911 hsp->bit_score =
1912 (hsp->score*lambda - logC - log(1.0 + hsp->score*lambda))
1913 / NCBIMATH_LN2;
1914 }
1915 }
1916
1917 void
Blast_HSPListPHIGetEvalues(BlastHSPList * hsp_list,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)1918 Blast_HSPListPHIGetEvalues(BlastHSPList* hsp_list, BlastScoreBlk* sbp,
1919 const BlastQueryInfo* query_info,
1920 const SPHIPatternSearchBlk* pattern_blk)
1921 {
1922 Int4 index;
1923 BlastHSP* hsp;
1924
1925 if (!hsp_list || hsp_list->hspcnt == 0)
1926 return;
1927
1928 for (index = 0; index < hsp_list->hspcnt; ++index) {
1929 hsp = hsp_list->hsp_array[index];
1930 s_HSPPHIGetEvalue(hsp, sbp, query_info, pattern_blk);
1931 }
1932 /* The best e-value is the one for the highest scoring HSP, which
1933 must be the first in the list. Check that HSPs are sorted by score
1934 to make sure this assumption is correct. */
1935 ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1936 hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1937 }
1938
Blast_HSPListReapByEvalue(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)1939 Int2 Blast_HSPListReapByEvalue(BlastHSPList* hsp_list,
1940 const BlastHitSavingOptions* hit_options)
1941 {
1942 BlastHSP* hsp;
1943 BlastHSP** hsp_array;
1944 Int4 hsp_cnt = 0;
1945 Int4 index;
1946 double cutoff;
1947
1948 if (hsp_list == NULL)
1949 return 0;
1950
1951 cutoff = hit_options->expect_value;
1952
1953 hsp_array = hsp_list->hsp_array;
1954 for (index = 0; index < hsp_list->hspcnt; index++) {
1955 hsp = hsp_array[index];
1956
1957 ASSERT(hsp != NULL);
1958
1959 if (hsp->evalue > cutoff) {
1960 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1961 } else {
1962 if (index > hsp_cnt)
1963 hsp_array[hsp_cnt] = hsp_array[index];
1964 hsp_cnt++;
1965 }
1966 }
1967
1968 hsp_list->hspcnt = hsp_cnt;
1969
1970 return 0;
1971 }
1972
Blast_HSPListReapByQueryCoverage(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options,const BlastQueryInfo * query_info,EBlastProgramType program_number)1973 Int2 Blast_HSPListReapByQueryCoverage(BlastHSPList* hsp_list,
1974 const BlastHitSavingOptions* hit_options,
1975 const BlastQueryInfo* query_info,
1976 EBlastProgramType program_number)
1977
1978 {
1979 BlastHSP* hsp;
1980 BlastHSP** hsp_array;
1981 Int4 hsp_cnt = 0;
1982 Int4 index;
1983
1984 if ((hsp_list == NULL) || (hsp_list->hspcnt == 0) || (hit_options->query_cov_hsp_perc == 0))
1985 return 0;
1986
1987 hsp_array = hsp_list->hsp_array;
1988 for (index = 0; index < hsp_list->hspcnt; index++) {
1989 hsp = hsp_array[index];
1990 ASSERT(hsp != NULL);
1991 if ( Blast_HSPQueryCoverageTest(hsp, hit_options->query_cov_hsp_perc,
1992 query_info->contexts[hsp->context].query_length)) {
1993 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1994 } else {
1995 if (index > hsp_cnt)
1996 hsp_array[hsp_cnt] = hsp_array[index];
1997 hsp_cnt++;
1998 }
1999 }
2000
2001 hsp_list->hspcnt = hsp_cnt;
2002
2003 return 0;
2004 }
2005
Blast_TrimHSPListByMaxHsps(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2006 Int2 Blast_TrimHSPListByMaxHsps(BlastHSPList* hsp_list,
2007 const BlastHitSavingOptions* hit_options)
2008 {
2009 BlastHSP** hsp_array;
2010 Int4 index;
2011 Int4 hsp_max;
2012
2013 if ((hsp_list == NULL) ||
2014 (hit_options->max_hsps_per_subject == 0) ||
2015 (hsp_list->hspcnt <= hit_options->max_hsps_per_subject))
2016 return 0;
2017
2018 hsp_max = hit_options->max_hsps_per_subject;
2019 hsp_array = hsp_list->hsp_array;
2020 for (index = hsp_max; index < hsp_list->hspcnt; index++) {
2021 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2022 }
2023
2024 hsp_list->hspcnt = hsp_max;
2025 return 0;
2026 }
2027
2028
2029 /** Same as Blast_HSPListReapByEvalue() except that it uses
2030 * the raw score of the hit and the HitSavingOptions->cutoff_score
2031 * to filter out hits. -RMH-
2032 */
Blast_HSPListReapByRawScore(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2033 Int2 Blast_HSPListReapByRawScore(BlastHSPList* hsp_list,
2034 const BlastHitSavingOptions* hit_options)
2035 {
2036 BlastHSP* hsp;
2037 BlastHSP** hsp_array;
2038 Int4 hsp_cnt = 0;
2039 Int4 index;
2040
2041 if (hsp_list == NULL)
2042 return 0;
2043
2044 hsp_array = hsp_list->hsp_array;
2045 for (index = 0; index < hsp_list->hspcnt; index++) {
2046 hsp = hsp_array[index];
2047
2048 ASSERT(hsp != NULL);
2049
2050 if ( hsp->score < hit_options->cutoff_score ) {
2051 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2052 } else {
2053 if (index > hsp_cnt)
2054 hsp_array[hsp_cnt] = hsp_array[index];
2055 hsp_cnt++;
2056 }
2057 }
2058
2059 hsp_list->hspcnt = hsp_cnt;
2060
2061 return 0;
2062 }
2063
2064 /** callback used to sort HSP lists in order of increasing OID
2065 * @param x First HSP list [in]
2066 * @param y Second HSP list [in]
2067 * @return compare result
2068 */
s_SortHSPListByOid(const void * x,const void * y)2069 static int s_SortHSPListByOid(const void *x, const void *y)
2070 {
2071 BlastHSPList **xx = (BlastHSPList **)x;
2072 BlastHSPList **yy = (BlastHSPList **)y;
2073 return (*xx)->oid - (*yy)->oid;
2074 }
2075
Blast_HitListMerge(BlastHitList ** old_hit_list_ptr,BlastHitList ** combined_hit_list_ptr,Int4 contexts_per_query,Int4 * split_offsets,Int4 chunk_overlap_size,Boolean allow_gap)2076 Int2 Blast_HitListMerge(BlastHitList** old_hit_list_ptr,
2077 BlastHitList** combined_hit_list_ptr,
2078 Int4 contexts_per_query, Int4 *split_offsets,
2079 Int4 chunk_overlap_size, Boolean allow_gap)
2080 {
2081 Int4 i, j;
2082 Boolean query_is_split;
2083 BlastHitList* hitlist1 = *old_hit_list_ptr;
2084 BlastHitList* hitlist2 = *combined_hit_list_ptr;
2085 BlastHitList* new_hitlist;
2086 Int4 num_hsplists1;
2087 Int4 num_hsplists2;
2088
2089 if (hitlist1 == NULL)
2090 return 0;
2091 if (hitlist2 == NULL) {
2092 *combined_hit_list_ptr = hitlist1;
2093 *old_hit_list_ptr = NULL;
2094 return 0;
2095 }
2096 num_hsplists1 = hitlist1->hsplist_count;
2097 num_hsplists2 = hitlist2->hsplist_count;
2098 new_hitlist = Blast_HitListNew(hitlist1->hsplist_max);
2099
2100 /* sort the lists of HSPs by oid */
2101
2102 if (num_hsplists1 > 1) {
2103 qsort(hitlist1->hsplist_array, num_hsplists1,
2104 sizeof(BlastHSPList*), s_SortHSPListByOid);
2105 }
2106 if (num_hsplists2 > 1) {
2107 qsort(hitlist2->hsplist_array, num_hsplists2,
2108 sizeof(BlastHSPList*), s_SortHSPListByOid);
2109 }
2110
2111 /* find out if the two hitlists contain hits for a single
2112 (split) query sequence */
2113
2114 query_is_split = FALSE;
2115 for (i = 0; i < contexts_per_query; i++) {
2116 if (split_offsets[i] > 0) {
2117 query_is_split = TRUE;
2118 break;
2119 }
2120 }
2121 ASSERT(chunk_overlap_size != 0);
2122
2123 /* merge the HSPlists of the two HitLists */
2124
2125 i = j = 0;
2126 while (i < num_hsplists1 && j < num_hsplists2) {
2127 BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2128 BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2129
2130 if (hsplist1->oid < hsplist2->oid) {
2131 Blast_HitListUpdate(new_hitlist, hsplist1);
2132 i++;
2133 }
2134 else if (hsplist1->oid > hsplist2->oid) {
2135 Blast_HitListUpdate(new_hitlist, hsplist2);
2136 j++;
2137 }
2138 else {
2139 /* the old and new Hitlists contain hits to the same
2140 DB sequence, and these must be merged. */
2141
2142 if (query_is_split) {
2143 Blast_HSPListsMerge(hitlist1->hsplist_array + i,
2144 hitlist2->hsplist_array + j,
2145 hsplist2->hsp_max, split_offsets,
2146 contexts_per_query,
2147 chunk_overlap_size,
2148 allow_gap, FALSE);
2149 }
2150 else {
2151 Blast_HSPListAppend(hitlist1->hsplist_array + i,
2152 hitlist2->hsplist_array + j,
2153 hsplist2->hsp_max);
2154 }
2155 Blast_HitListUpdate(new_hitlist, hitlist2->hsplist_array[j]);
2156 i++;
2157 j++;
2158 }
2159 }
2160 for (; i < num_hsplists1; i++) {
2161 BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2162 Blast_HitListUpdate(new_hitlist, hsplist1);
2163 }
2164 for (; j < num_hsplists2; j++) {
2165 BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2166 Blast_HitListUpdate(new_hitlist, hsplist2);
2167 }
2168 hitlist1->hsplist_count = 0;
2169 Blast_HitListFree(hitlist1);
2170 hitlist2->hsplist_count = 0;
2171 Blast_HitListFree(hitlist2);
2172
2173 *old_hit_list_ptr = NULL;
2174 *combined_hit_list_ptr = new_hitlist;
2175 return 0;
2176 }
2177
2178
2179 /* See description in blast_hits.h */
2180
2181 Int2
Blast_HSPListPurgeNullHSPs(BlastHSPList * hsp_list)2182 Blast_HSPListPurgeNullHSPs(BlastHSPList* hsp_list)
2183
2184 {
2185 Int4 index, index1; /* loop indices. */
2186 Int4 hspcnt; /* total number of HSP's to iterate over. */
2187 BlastHSP** hsp_array; /* hsp_array to purge. */
2188
2189 if (hsp_list == NULL || hsp_list->hspcnt == 0)
2190 return 0;
2191
2192 hsp_array = hsp_list->hsp_array;
2193 hspcnt = hsp_list->hspcnt;
2194
2195 index1 = 0;
2196 for (index=0; index<hspcnt; index++)
2197 {
2198 if (hsp_array[index] != NULL)
2199 {
2200 hsp_array[index1] = hsp_array[index];
2201 index1++;
2202 }
2203 }
2204
2205 for (index=index1; index<hspcnt; index++)
2206 {
2207 hsp_array[index] = NULL;
2208 }
2209
2210 hsp_list->hspcnt = index1;
2211
2212 return 0;
2213 }
2214
2215 /** Callback for sorting HSPs by starting offset in query. Sorting is by
2216 * increasing context, then increasing query start offset, then increasing
2217 * subject start offset, then decreasing score, then increasing query end
2218 * offset, then increasing subject end offset. Null HSPs are moved to the
2219 * end of the array.
2220 * @param v1 pointer to first HSP [in]
2221 * @param v2 pointer to second HSP [in]
2222 * @return Result of comparison.
2223 */
2224 static int
s_QueryOffsetCompareHSPs(const void * v1,const void * v2)2225 s_QueryOffsetCompareHSPs(const void* v1, const void* v2)
2226 {
2227 BlastHSP* h1,* h2;
2228 BlastHSP** hp1,** hp2;
2229
2230 hp1 = (BlastHSP**) v1;
2231 hp2 = (BlastHSP**) v2;
2232 h1 = *hp1;
2233 h2 = *hp2;
2234
2235 if (!h1 && !h2)
2236 return 0;
2237 else if (!h1)
2238 return 1;
2239 else if (!h2)
2240 return -1;
2241
2242 /* If these are from different contexts, don't compare offsets */
2243 if (h1->context < h2->context)
2244 return -1;
2245 if (h1->context > h2->context)
2246 return 1;
2247
2248 if (h1->query.offset < h2->query.offset)
2249 return -1;
2250 if (h1->query.offset > h2->query.offset)
2251 return 1;
2252
2253 if (h1->subject.offset < h2->subject.offset)
2254 return -1;
2255 if (h1->subject.offset > h2->subject.offset)
2256 return 1;
2257
2258 /* tie breakers: sort by decreasing score, then
2259 by increasing size of query range, then by
2260 increasing subject range. */
2261
2262 if (h1->score < h2->score)
2263 return 1;
2264 if (h1->score > h2->score)
2265 return -1;
2266
2267 if (h1->query.end < h2->query.end)
2268 return 1;
2269 if (h1->query.end > h2->query.end)
2270 return -1;
2271
2272 if (h1->subject.end < h2->subject.end)
2273 return 1;
2274 if (h1->subject.end > h2->subject.end)
2275 return -1;
2276
2277 return 0;
2278 }
2279
2280 /** Callback for sorting HSPs by ending offset in query. Sorting is by
2281 * increasing context, then increasing query end offset, then increasing
2282 * subject end offset, then decreasing score, then decreasing query start
2283 * offset, then decreasing subject start offset. Null HSPs are moved to the
2284 * end of the array.
2285 * @param v1 pointer to first HSP [in]
2286 * @param v2 pointer to second HSP [in]
2287 * @return Result of comparison.
2288 */
2289 static int
s_QueryEndCompareHSPs(const void * v1,const void * v2)2290 s_QueryEndCompareHSPs(const void* v1, const void* v2)
2291 {
2292 BlastHSP* h1,* h2;
2293 BlastHSP** hp1,** hp2;
2294
2295 hp1 = (BlastHSP**) v1;
2296 hp2 = (BlastHSP**) v2;
2297 h1 = *hp1;
2298 h2 = *hp2;
2299
2300 if (!h1 && !h2)
2301 return 0;
2302 else if (!h1)
2303 return 1;
2304 else if (!h2)
2305 return -1;
2306
2307 /* If these are from different contexts, don't compare offsets */
2308 if (h1->context < h2->context)
2309 return -1;
2310 if (h1->context > h2->context)
2311 return 1;
2312
2313 if (h1->query.end < h2->query.end)
2314 return -1;
2315 if (h1->query.end > h2->query.end)
2316 return 1;
2317
2318 if (h1->subject.end < h2->subject.end)
2319 return -1;
2320 if (h1->subject.end > h2->subject.end)
2321 return 1;
2322
2323 /* tie breakers: sort by decreasing score, then
2324 by increasing size of query range, then by
2325 increasing size of subject range. The shortest range
2326 means the *largest* sequence offset must come
2327 first */
2328 if (h1->score < h2->score)
2329 return 1;
2330 if (h1->score > h2->score)
2331 return -1;
2332
2333 if (h1->query.offset < h2->query.offset)
2334 return 1;
2335 if (h1->query.offset > h2->query.offset)
2336 return -1;
2337
2338 if (h1->subject.offset < h2->subject.offset)
2339 return 1;
2340 if (h1->subject.offset > h2->subject.offset)
2341 return -1;
2342
2343 return 0;
2344 }
2345
2346
2347 /* cut off the GapEditScript according to hsp offset and end */
2348 static void
s_CutOffGapEditScript(BlastHSP * hsp,Int4 q_cut,Int4 s_cut,Boolean cut_begin)2349 s_CutOffGapEditScript(BlastHSP* hsp, Int4 q_cut, Int4 s_cut, Boolean cut_begin)
2350 {
2351 int index, opid, qid, sid;
2352 Boolean found = FALSE;
2353 GapEditScript *esp = hsp->gap_info;
2354 qid = 0;
2355 sid = 0;
2356 opid = 0;
2357 q_cut -= hsp->query.offset;
2358 s_cut -= hsp->subject.offset;
2359 for (index=0; index < esp->size; index++) {
2360 for(opid=0; opid < esp->num[index];){
2361 if (esp->op_type[index] == eGapAlignSub) {
2362 qid++;
2363 sid++;
2364 opid++;
2365 } else if (esp->op_type[index] == eGapAlignDel) {
2366 sid+=esp->num[index];
2367 opid+=esp->num[index];
2368 } else if (esp->op_type[index] == eGapAlignIns) {
2369 qid+=esp->num[index];
2370 opid+=esp->num[index];
2371 }
2372 if (qid >= q_cut && sid >= s_cut) found = TRUE;
2373 if (found) break;
2374 }
2375 if (found) break;
2376 }
2377
2378 // RMH: Unless both cut sites where found the following
2379 // block would access memory outside the GapEditScript
2380 // array.
2381 if ( found )
2382 {
2383 if (cut_begin) {
2384 int new_index = 0;
2385 if (opid < esp->num[index]) {
2386 ASSERT(esp->op_type[index] == eGapAlignSub);
2387 esp->op_type[0] = esp->op_type[index];
2388 esp->num[0] = esp->num[index] - opid;
2389 new_index++;
2390 }
2391 ++index;
2392 for (; index < esp->size; index++, new_index++) {
2393 esp->op_type[new_index] = esp->op_type[index];
2394 esp->num[new_index] = esp->num[index];
2395 }
2396 esp->size = new_index;
2397 hsp->query.offset += qid;
2398 hsp->subject.offset += sid;
2399 } else {
2400 if (opid < esp->num[index]) {
2401 ASSERT(esp->op_type[index] == eGapAlignSub);
2402 esp->num[index] = opid;
2403 }
2404 esp->size = index+1;
2405 hsp->query.end = hsp->query.offset + qid;
2406 hsp->subject.end = hsp->subject.offset + sid;
2407 }
2408 }
2409 }
2410
2411 Int4
Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,BlastHSPList * hsp_list,Boolean purge)2412 Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,
2413 BlastHSPList* hsp_list,
2414 Boolean purge)
2415
2416 {
2417 BlastHSP** hsp_array; /* hsp_array to purge. */
2418 BlastHSP* hsp;
2419 Int4 i, j, k;
2420 Int4 hsp_count;
2421 purge |= (program != eBlastTypeBlastn);
2422
2423 /* If HSP list is empty, return immediately. */
2424 if (hsp_list == NULL || hsp_list->hspcnt == 0)
2425 return 0;
2426
2427 /* Do nothing for PHI BLAST, because HSPs corresponding to different pattern
2428 occurrences may have common end points, but should all be kept. */
2429 if (Blast_ProgramIsPhiBlast(program))
2430 return hsp_list->hspcnt;
2431
2432 hsp_array = hsp_list->hsp_array;
2433 hsp_count = hsp_list->hspcnt;
2434
2435 qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryOffsetCompareHSPs);
2436 i = 0;
2437 while (i < hsp_count) {
2438 j = 1;
2439 while (i+j < hsp_count &&
2440 hsp_array[i] && hsp_array[i+j] &&
2441 hsp_array[i]->context == hsp_array[i+j]->context &&
2442 hsp_array[i]->query.offset == hsp_array[i+j]->query.offset &&
2443 hsp_array[i]->subject.offset == hsp_array[i+j]->subject.offset) {
2444 hsp_count--;
2445 hsp = hsp_array[i+j];
2446 if (!purge && (hsp->query.end > hsp_array[i]->query.end)) {
2447 s_CutOffGapEditScript(hsp, hsp_array[i]->query.end,
2448 hsp_array[i]->subject.end, TRUE);
2449 } else {
2450 hsp = Blast_HSPFree(hsp);
2451 }
2452 for (k=i+j; k<hsp_count; k++) {
2453 hsp_array[k] = hsp_array[k+1];
2454 }
2455 hsp_array[hsp_count] = hsp;
2456 }
2457 i += j;
2458 }
2459
2460 qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryEndCompareHSPs);
2461 i = 0;
2462 while (i < hsp_count) {
2463 j = 1;
2464 while (i+j < hsp_count &&
2465 hsp_array[i] && hsp_array[i+j] &&
2466 hsp_array[i]->context == hsp_array[i+j]->context &&
2467 hsp_array[i]->query.end == hsp_array[i+j]->query.end &&
2468 hsp_array[i]->subject.end == hsp_array[i+j]->subject.end) {
2469 hsp_count--;
2470 hsp = hsp_array[i+j];
2471 if (!purge && (hsp->query.offset < hsp_array[i]->query.offset)) {
2472 s_CutOffGapEditScript(hsp, hsp_array[i]->query.offset,
2473 hsp_array[i]->subject.offset, FALSE);
2474 } else {
2475 hsp = Blast_HSPFree(hsp);
2476 }
2477 for (k=i+j; k<hsp_count; k++) {
2478 hsp_array[k] = hsp_array[k+1];
2479 }
2480 hsp_array[hsp_count] = hsp;
2481 }
2482 i += j;
2483 }
2484
2485 if (purge) {
2486 Blast_HSPListPurgeNullHSPs(hsp_list);
2487 }
2488
2489 return hsp_count;
2490 }
2491
2492 Int2
Blast_HSPListReevaluateUngapped(EBlastProgramType program,BlastHSPList * hsp_list,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,const BlastInitialWordParameters * word_params,const BlastHitSavingParameters * hit_params,const BlastQueryInfo * query_info,BlastScoreBlk * sbp,const BlastScoringParameters * score_params,const BlastSeqSrc * seq_src,const Uint1 * gen_code_string)2493 Blast_HSPListReevaluateUngapped(EBlastProgramType program,
2494 BlastHSPList* hsp_list, BLAST_SequenceBlk* query_blk,
2495 BLAST_SequenceBlk* subject_blk,
2496 const BlastInitialWordParameters* word_params,
2497 const BlastHitSavingParameters* hit_params, const BlastQueryInfo* query_info,
2498 BlastScoreBlk* sbp, const BlastScoringParameters* score_params,
2499 const BlastSeqSrc* seq_src, const Uint1* gen_code_string)
2500 {
2501 BlastHSP** hsp_array,* hsp;
2502 const Uint1* subject_start = NULL;
2503 Uint1* query_start;
2504 Int4 index, context, hspcnt;
2505 Boolean purge;
2506 Int2 status = 0;
2507 const Boolean kTranslateSubject = Blast_SubjectIsTranslated(program);
2508 const Boolean kNucleotideSubject = Blast_SubjectIsNucleotide(program);
2509 SBlastTargetTranslation* target_t = NULL;
2510
2511 ASSERT(!score_params->options->gapped_calculation);
2512
2513 if (!hsp_list)
2514 return status;
2515
2516 hspcnt = hsp_list->hspcnt;
2517 hsp_array = hsp_list->hsp_array;
2518
2519 if (hsp_list->hspcnt == 0)
2520 /* All HSPs have been deleted */
2521 return status;
2522
2523 /* Retrieve the unpacked subject sequence and save it in the
2524 sequence_start element of the subject structure.
2525 NB: for the BLAST 2 Sequences case, the uncompressed sequence must
2526 already be there */
2527 if (seq_src) {
2528 /* Wrap subject sequence block into a BlastSeqSrcGetSeqArg structure, which is
2529 needed by the BlastSeqSrc API. */
2530 /* If this was a protein subject, leave as is. */
2531 if (kNucleotideSubject)
2532 {
2533 BlastSeqSrcGetSeqArg seq_arg;
2534 memset((void*) &seq_arg, 0, sizeof(seq_arg));
2535 seq_arg.oid = subject_blk->oid;
2536 seq_arg.encoding =
2537 (kTranslateSubject ? eBlastEncodingNcbi4na : eBlastEncodingNucleotide);
2538 seq_arg.check_oid_exclusion = TRUE;
2539 seq_arg.seq = subject_blk;
2540 /* Return the packed sequence to the database */
2541 BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
2542 /* Get the unpacked sequence */
2543 if (( BLAST_SEQSRC_EXCLUDED == BlastSeqSrcGetSequence(seq_src, &seq_arg))) {
2544 for (index = 0; index < hspcnt; ++index) {
2545 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2546 }
2547 Blast_HSPListPurgeNullHSPs(hsp_list);
2548 return 0;
2549 }
2550 else if (status < 0){
2551 return status;
2552 }
2553 }
2554 }
2555
2556 if (kTranslateSubject) {
2557 if (!gen_code_string)
2558 return -1;
2559 /* Get the translation buffer */
2560 BlastTargetTranslationNew(subject_blk, gen_code_string, program,
2561 score_params->options->is_ooframe, &target_t);
2562 } else {
2563 /* Store sequence in blastna encoding in sequence_start */
2564 if (subject_blk->sequence_start)
2565 subject_start = subject_blk->sequence_start + 1;
2566 else
2567 subject_start = subject_blk->sequence;
2568 }
2569
2570 purge = FALSE;
2571 for (index = 0; index < hspcnt; ++index) {
2572 Boolean delete_hsp = FALSE;
2573 if (hsp_array[index] == NULL)
2574 continue;
2575 else
2576 hsp = hsp_array[index];
2577
2578 context = hsp->context;
2579
2580 query_start = query_blk->sequence +
2581 query_info->contexts[context].query_offset;
2582
2583 if (kTranslateSubject)
2584 subject_start = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
2585
2586 if (kNucleotideSubject) {
2587 delete_hsp =
2588 Blast_HSPReevaluateWithAmbiguitiesUngapped(hsp, query_start,
2589 subject_start, word_params, sbp, kTranslateSubject);
2590 }
2591
2592 if (!delete_hsp) {
2593 const Uint1* query_nomask = query_blk->sequence_nomask +
2594 query_info->contexts[context].query_offset;
2595 Int4 align_length = 0;
2596 Blast_HSPGetNumIdentitiesAndPositives(query_nomask,
2597 subject_start,
2598 hsp,
2599 score_params->options,
2600 &align_length,
2601 sbp);
2602
2603 delete_hsp = Blast_HSPTest(hsp, hit_params->options, align_length);
2604 }
2605 if (delete_hsp) { /* This HSP is now below the cutoff */
2606 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2607 purge = TRUE;
2608 }
2609 }
2610
2611 if (target_t)
2612 target_t = BlastTargetTranslationFree(target_t);
2613
2614 if (purge)
2615 Blast_HSPListPurgeNullHSPs(hsp_list);
2616
2617 /* Sort the HSP array by score (scores may have changed!) */
2618 Blast_HSPListSortByScore(hsp_list);
2619 Blast_HSPListAdjustOddBlastnScores(hsp_list, FALSE, sbp);
2620 return 0;
2621 }
2622
2623 /** Combine two HSP lists, without altering the individual HSPs, and without
2624 * reallocating the HSP array.
2625 * @param hsp_list New HSP list [in]
2626 * @param combined_hsp_list Old HSP list, to which new HSPs are added [in] [out]
2627 * @param new_hspcnt How many HSPs to save in the combined list? The extra ones
2628 * are freed. The best scoring HSPs are saved. This argument
2629 * cannot be greater than the allocated size of the
2630 * combined list's HSP array. [in]
2631 */
2632 static void
s_BlastHSPListsCombineByScore(BlastHSPList * hsp_list,BlastHSPList * combined_hsp_list,Int4 new_hspcnt)2633 s_BlastHSPListsCombineByScore(BlastHSPList* hsp_list,
2634 BlastHSPList* combined_hsp_list,
2635 Int4 new_hspcnt)
2636 {
2637 Int4 index, index1, index2;
2638 BlastHSP** new_hsp_array;
2639
2640 ASSERT(new_hspcnt <= combined_hsp_list->allocated);
2641
2642 if (new_hspcnt >= hsp_list->hspcnt + combined_hsp_list->hspcnt) {
2643 /* All HSPs from both arrays are saved */
2644 for (index=combined_hsp_list->hspcnt, index1=0;
2645 index1<hsp_list->hspcnt; index1++) {
2646 if (hsp_list->hsp_array[index1] != NULL)
2647 combined_hsp_list->hsp_array[index++] = hsp_list->hsp_array[index1];
2648 }
2649 combined_hsp_list->hspcnt = new_hspcnt;
2650 Blast_HSPListSortByScore(combined_hsp_list);
2651 } else {
2652 /* Not all HSPs are be saved; sort both arrays by score and save only
2653 the new_hspcnt best ones.
2654 For the merged set of HSPs, allocate array the same size as in the
2655 old HSP list. */
2656 new_hsp_array = (BlastHSP**)
2657 malloc(combined_hsp_list->allocated*sizeof(BlastHSP*));
2658
2659 Blast_HSPListSortByScore(combined_hsp_list);
2660 Blast_HSPListSortByScore(hsp_list);
2661 index1 = index2 = 0;
2662 for (index = 0; index < new_hspcnt; ++index) {
2663 if (index1 < combined_hsp_list->hspcnt &&
2664 (index2 >= hsp_list->hspcnt ||
2665 ScoreCompareHSPs(&combined_hsp_list->hsp_array[index1],
2666 &hsp_list->hsp_array[index2]) <= 0)) {
2667 new_hsp_array[index] = combined_hsp_list->hsp_array[index1];
2668 ++index1;
2669 } else {
2670 new_hsp_array[index] = hsp_list->hsp_array[index2];
2671 ++index2;
2672 }
2673 }
2674 /* Free the extra HSPs that could not be saved */
2675 for ( ; index1 < combined_hsp_list->hspcnt; ++index1) {
2676 combined_hsp_list->hsp_array[index1] =
2677 Blast_HSPFree(combined_hsp_list->hsp_array[index1]);
2678 }
2679 for ( ; index2 < hsp_list->hspcnt; ++index2) {
2680 hsp_list->hsp_array[index2] =
2681 Blast_HSPFree(hsp_list->hsp_array[index2]);
2682 }
2683 /* Point combined_hsp_list's HSP array to the new one */
2684 sfree(combined_hsp_list->hsp_array);
2685 combined_hsp_list->hsp_array = new_hsp_array;
2686 combined_hsp_list->hspcnt = new_hspcnt;
2687 }
2688
2689 /* Second HSP list now does not own any HSPs */
2690 hsp_list->hspcnt = 0;
2691 }
2692
Blast_HSPListAppend(BlastHSPList ** old_hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max)2693 Int2 Blast_HSPListAppend(BlastHSPList** old_hsp_list_ptr,
2694 BlastHSPList** combined_hsp_list_ptr, Int4 hsp_num_max)
2695 {
2696 BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2697 BlastHSPList* hsp_list = *old_hsp_list_ptr;
2698 Int4 new_hspcnt;
2699
2700 if (!hsp_list || hsp_list->hspcnt == 0)
2701 return 0;
2702
2703 /* If no previous HSP list, return a pointer to the old one */
2704 if (!combined_hsp_list) {
2705 *combined_hsp_list_ptr = hsp_list;
2706 *old_hsp_list_ptr = NULL;
2707 return 0;
2708 }
2709
2710 /* Just append new list to the end of the old list, in case of
2711 multiple frames of the subject sequence */
2712 new_hspcnt = MIN(combined_hsp_list->hspcnt + hsp_list->hspcnt,
2713 hsp_num_max);
2714 if (new_hspcnt > combined_hsp_list->allocated &&
2715 !combined_hsp_list->do_not_reallocate) {
2716 Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
2717 BlastHSP** new_hsp_array;
2718 new_hsp_array = (BlastHSP**)
2719 realloc(combined_hsp_list->hsp_array,
2720 new_allocated*sizeof(BlastHSP*));
2721
2722 if (new_hsp_array) {
2723 combined_hsp_list->allocated = new_allocated;
2724 combined_hsp_list->hsp_array = new_hsp_array;
2725 } else {
2726 combined_hsp_list->do_not_reallocate = TRUE;
2727 new_hspcnt = combined_hsp_list->allocated;
2728 }
2729 }
2730 if (combined_hsp_list->allocated == hsp_num_max)
2731 combined_hsp_list->do_not_reallocate = TRUE;
2732
2733 s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
2734
2735 hsp_list = Blast_HSPListFree(hsp_list);
2736 *old_hsp_list_ptr = NULL;
2737
2738 return 0;
2739 }
2740
Blast_HSPListsMerge(BlastHSPList ** hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max,Int4 * split_offsets,Int4 contexts_per_query,Int4 chunk_overlap_size,Boolean allow_gap,Boolean short_reads)2741 Int2 Blast_HSPListsMerge(BlastHSPList** hsp_list_ptr,
2742 BlastHSPList** combined_hsp_list_ptr,
2743 Int4 hsp_num_max, Int4 *split_offsets,
2744 Int4 contexts_per_query, Int4 chunk_overlap_size,
2745 Boolean allow_gap, Boolean short_reads)
2746 {
2747 BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2748 BlastHSPList* hsp_list = *hsp_list_ptr;
2749 BlastHSP* hsp1, *hsp2, *hsp_var;
2750 BlastHSP** hspp1,** hspp2;
2751 Int4 index1, index2;
2752 Int4 hspcnt1, hspcnt2, new_hspcnt = 0;
2753 Int4 start_diag, end_diag;
2754 Int4 offset_idx;
2755 BlastHSP** new_hsp_array;
2756
2757 if (!hsp_list || hsp_list->hspcnt == 0)
2758 return 0;
2759
2760 /* If no previous HSP list, just return a copy of the new one. */
2761 if (!combined_hsp_list) {
2762 *combined_hsp_list_ptr = hsp_list;
2763 *hsp_list_ptr = NULL;
2764 return 0;
2765 }
2766
2767 /* Merge the two HSP lists for successive chunks of the subject sequence.
2768 First put all HSPs that intersect the overlap region at the front of
2769 the respective HSP arrays. Note that if the query sequence is
2770 assumed split, each context of the query sequence can have a
2771 different split point */
2772 hspcnt1 = hspcnt2 = 0;
2773
2774 if (contexts_per_query < 0) { /* subject seq is split */
2775 for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2776 hsp1 = combined_hsp_list->hsp_array[index1];
2777 if (hsp1->subject.end > split_offsets[0]) {
2778 /* At least part of this HSP lies in the overlap strip. */
2779 hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2780 combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2781 combined_hsp_list->hsp_array[index1] = hsp_var;
2782 ++hspcnt1;
2783 }
2784 }
2785 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2786 hsp2 = hsp_list->hsp_array[index2];
2787 if (hsp2->subject.offset < split_offsets[0] + chunk_overlap_size) {
2788 /* At least part of this HSP lies in the overlap strip. */
2789 hsp_var = hsp_list->hsp_array[hspcnt2];
2790 hsp_list->hsp_array[hspcnt2] = hsp2;
2791 hsp_list->hsp_array[index2] = hsp_var;
2792 ++hspcnt2;
2793 }
2794 }
2795 }
2796 else { /* query seq is split */
2797
2798 /* An HSP can be a candidate for merging if it lies in the
2799 overlap region. Whether this is true depends on whether the
2800 HSP starts to the left of the split point, or ends to the
2801 right of the overlap region. A complication is that 'left'
2802 and 'right' have opposite meaning when the HSP is on the
2803 minus strand of the query sequence */
2804
2805 for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2806 hsp1 = combined_hsp_list->hsp_array[index1];
2807 offset_idx = hsp1->context % contexts_per_query;
2808 if (split_offsets[offset_idx] < 0) continue;
2809 if ((hsp1->query.frame >= 0 && hsp1->query.end >
2810 split_offsets[offset_idx]) ||
2811 (hsp1->query.frame < 0 && hsp1->query.offset <
2812 split_offsets[offset_idx] + chunk_overlap_size)) {
2813 /* At least part of this HSP lies in the overlap strip. */
2814 hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2815 combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2816 combined_hsp_list->hsp_array[index1] = hsp_var;
2817 ++hspcnt1;
2818 }
2819 }
2820 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2821 hsp2 = hsp_list->hsp_array[index2];
2822 offset_idx = hsp2->context % contexts_per_query;
2823 if (split_offsets[offset_idx] < 0) continue;
2824 if ((hsp2->query.frame < 0 && hsp2->query.end >
2825 split_offsets[offset_idx]) ||
2826 (hsp2->query.frame >= 0 && hsp2->query.offset <
2827 split_offsets[offset_idx] + chunk_overlap_size)) {
2828 /* At least part of this HSP lies in the overlap strip. */
2829 hsp_var = hsp_list->hsp_array[hspcnt2];
2830 hsp_list->hsp_array[hspcnt2] = hsp2;
2831 hsp_list->hsp_array[index2] = hsp_var;
2832 ++hspcnt2;
2833 }
2834 }
2835 }
2836
2837 /* the merge process is independent of whether merging happens
2838 between query chunks or subject chunks */
2839
2840 if (hspcnt1 > 0 && hspcnt2 > 0) {
2841 hspp1 = combined_hsp_list->hsp_array;
2842 hspp2 = hsp_list->hsp_array;
2843
2844 for (index1 = 0; index1 < hspcnt1; index1++) {
2845
2846 hsp1 = hspp1[index1];
2847
2848 for (index2 = 0; index2 < hspcnt2; index2++) {
2849
2850 hsp2 = hspp2[index2];
2851
2852 /* Skip already deleted HSPs, or HSPs from different contexts */
2853 if (!hsp2 || hsp1->context != hsp2->context)
2854 continue;
2855
2856 /* Short read qureies are shorter than the overlap region and may
2857 already have a traceback */
2858 if (short_reads) {
2859 hspp2[index2] = Blast_HSPFree(hsp2);
2860 continue;
2861 }
2862
2863 /* we have to determine the starting diagonal of one HSP
2864 and the ending diagonal of the other */
2865
2866 if (contexts_per_query < 0 || hsp1->query.frame >= 0) {
2867 end_diag = s_HSPEndDiag(hsp1);
2868 start_diag = s_HSPStartDiag(hsp2);
2869 }
2870 else {
2871 end_diag = s_HSPEndDiag(hsp2);
2872 start_diag = s_HSPStartDiag(hsp1);
2873 }
2874
2875 if (ABS(end_diag - start_diag) < OVERLAP_DIAG_CLOSE) {
2876 if (s_BlastMergeTwoHSPs(hsp1, hsp2, allow_gap)) {
2877 /* Free the second HSP. */
2878 hspp2[index2] = Blast_HSPFree(hsp2);
2879 }
2880 }
2881 }
2882 }
2883
2884 /* Purge the nulled out HSPs from the new HSP list */
2885 Blast_HSPListPurgeNullHSPs(hsp_list);
2886 }
2887
2888 /* The new number of HSPs is now the sum of the remaining counts in the
2889 two lists, but if there is a restriction on the number of HSPs to keep,
2890 it might have to be reduced. */
2891 new_hspcnt =
2892 MIN(hsp_list->hspcnt + combined_hsp_list->hspcnt, hsp_num_max);
2893
2894 if (new_hspcnt >= combined_hsp_list->allocated-1 &&
2895 combined_hsp_list->do_not_reallocate == FALSE) {
2896 Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
2897 if (new_allocated > combined_hsp_list->allocated) {
2898 new_hsp_array = (BlastHSP**)
2899 realloc(combined_hsp_list->hsp_array,
2900 new_allocated*sizeof(BlastHSP*));
2901 if (new_hsp_array == NULL) {
2902 combined_hsp_list->do_not_reallocate = TRUE;
2903 } else {
2904 combined_hsp_list->hsp_array = new_hsp_array;
2905 combined_hsp_list->allocated = new_allocated;
2906 }
2907 } else {
2908 combined_hsp_list->do_not_reallocate = TRUE;
2909 }
2910 new_hspcnt = MIN(new_hspcnt, combined_hsp_list->allocated);
2911 }
2912
2913 s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
2914
2915 hsp_list = Blast_HSPListFree(hsp_list);
2916 *hsp_list_ptr = NULL;
2917
2918 return 0;
2919 }
2920
Blast_HSPListAdjustOffsets(BlastHSPList * hsp_list,Int4 offset)2921 void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset)
2922 {
2923 Int4 index;
2924
2925 if (offset == 0) {
2926 return;
2927 }
2928
2929 for (index=0; index<hsp_list->hspcnt; index++) {
2930 BlastHSP* hsp = hsp_list->hsp_array[index];
2931 hsp->subject.offset += offset;
2932 hsp->subject.end += offset;
2933 hsp->subject.gapped_start += offset;
2934 }
2935 }
2936
Blast_HSPListAdjustOddBlastnScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)2937 void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list,
2938 Boolean gapped_calculation,
2939 const BlastScoreBlk* sbp)
2940 {
2941 int index;
2942
2943 if (!hsp_list || hsp_list->hspcnt == 0 ||
2944 gapped_calculation == FALSE ||
2945 sbp->round_down == FALSE)
2946 return;
2947
2948 for (index = 0; index < hsp_list->hspcnt; ++index)
2949 hsp_list->hsp_array[index]->score &= ~1;
2950
2951 /* Sort the HSPs again, since the order may have to be different now. */
2952 Blast_HSPListSortByScore(hsp_list);
2953 }
2954
2955 /** Callback for sorting hsp lists by their best evalue/score;
2956 * Evalues are compared with the condition that if both are close enough to
2957 * zero (currently < 1.0e-180), they are considered equal.
2958 * It is assumed that the HSP arrays in each hit list are already sorted by
2959 * e-value/score.
2960 */
2961 static int
s_EvalueCompareHSPLists(const void * v1,const void * v2)2962 s_EvalueCompareHSPLists(const void* v1, const void* v2)
2963 {
2964 BlastHSPList* h1,* h2;
2965 int retval = 0;
2966
2967 h1 = *(BlastHSPList**) v1;
2968 h2 = *(BlastHSPList**) v2;
2969
2970 /* If any of the HSP lists is empty, it is considered "worse" than the
2971 other, unless the other is also empty. */
2972 if (h1->hspcnt == 0 && h2->hspcnt == 0)
2973 return 0;
2974 else if (h1->hspcnt == 0)
2975 return 1;
2976 else if (h2->hspcnt == 0)
2977 return -1;
2978
2979 if ((retval = s_EvalueComp(h1->best_evalue,
2980 h2->best_evalue)) != 0)
2981 return retval;
2982
2983 if (h1->hsp_array[0]->score > h2->hsp_array[0]->score)
2984 return -1;
2985 if (h1->hsp_array[0]->score < h2->hsp_array[0]->score)
2986 return 1;
2987
2988 /* In case of equal best E-values and scores, order will be determined
2989 by ordinal ids of the subject sequences */
2990 return BLAST_CMP(h2->oid, h1->oid);
2991 }
2992
2993 /** Callback for sorting hsp lists by their best e-value/score, in
2994 * reverse order - from higher e-value to lower (lower score to higher).
2995 */
2996 static int
s_EvalueCompareHSPListsRev(const void * v1,const void * v2)2997 s_EvalueCompareHSPListsRev(const void* v1, const void* v2)
2998 {
2999 return s_EvalueCompareHSPLists(v2, v1);
3000 }
3001
3002 /********************************************************************************
3003 Functions manipulating BlastHitList's
3004 ********************************************************************************/
3005
3006 /*
3007 description in blast_hits.h
3008 */
Blast_HitListNew(Int4 hitlist_size)3009 BlastHitList* Blast_HitListNew(Int4 hitlist_size)
3010 {
3011 BlastHitList* new_hitlist = (BlastHitList*)
3012 calloc(1, sizeof(BlastHitList));
3013 new_hitlist->hsplist_max = hitlist_size;
3014 new_hitlist->low_score = INT4_MAX;
3015 new_hitlist->hsplist_count = 0;
3016 new_hitlist->hsplist_current = 0;
3017 return new_hitlist;
3018 }
3019
3020 /*
3021 description in blast_hits.h
3022 */
Blast_HitListFree(BlastHitList * hitlist)3023 BlastHitList* Blast_HitListFree(BlastHitList* hitlist)
3024 {
3025 if (!hitlist)
3026 return NULL;
3027
3028 Blast_HitListHSPListsFree(hitlist);
3029 sfree(hitlist);
3030 return NULL;
3031 }
3032
3033 /* description in blast_hits.h */
Blast_HitListHSPListsFree(BlastHitList * hitlist)3034 Int2 Blast_HitListHSPListsFree(BlastHitList* hitlist)
3035 {
3036 Int4 index;
3037
3038 if (!hitlist)
3039 return 0;
3040
3041 for (index = 0; index < hitlist->hsplist_count; ++index)
3042 hitlist->hsplist_array[index] =
3043 Blast_HSPListFree(hitlist->hsplist_array[index]);
3044
3045 sfree(hitlist->hsplist_array);
3046
3047 hitlist->hsplist_count = 0;
3048
3049 return 0;
3050 }
3051
3052 /** Purge a BlastHitList of empty HSP lists.
3053 * @param hit_list BLAST hit list structure. [in] [out]
3054 */
3055 static void
s_BlastHitListPurge(BlastHitList * hit_list)3056 s_BlastHitListPurge(BlastHitList* hit_list)
3057 {
3058 Int4 index, hsplist_count;
3059
3060 if (!hit_list)
3061 return;
3062
3063 hsplist_count = hit_list->hsplist_count;
3064 for (index = 0; index < hsplist_count &&
3065 hit_list->hsplist_array[index]->hspcnt > 0; ++index);
3066
3067 hit_list->hsplist_count = index;
3068 /* Free all empty HSP lists */
3069 for ( ; index < hsplist_count; ++index) {
3070 Blast_HSPListFree(hit_list->hsplist_array[index]);
3071 }
3072 }
3073
3074 /** Given a BlastHitList* with a heapified HSP list array, remove
3075 * the worst scoring HSP list and insert the new HSP list in the heap
3076 * @param hit_list Contains all HSP lists for a given query [in] [out]
3077 * @param hsp_list A new HSP list to be inserted into the hit list [in]
3078 */
3079 static void
s_BlastHitListInsertHSPListInHeap(BlastHitList * hit_list,BlastHSPList * hsp_list)3080 s_BlastHitListInsertHSPListInHeap(BlastHitList* hit_list,
3081 BlastHSPList* hsp_list)
3082 {
3083 Blast_HSPListFree(hit_list->hsplist_array[0]);
3084 hit_list->hsplist_array[0] = hsp_list;
3085 if (hit_list->hsplist_count >= 2) {
3086 s_Heapify((char*)hit_list->hsplist_array, (char*)hit_list->hsplist_array,
3087 (char*)&hit_list->hsplist_array[hit_list->hsplist_count/2 - 1],
3088 (char*)&hit_list->hsplist_array[hit_list->hsplist_count-1],
3089 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3090 }
3091 hit_list->worst_evalue =
3092 hit_list->hsplist_array[0]->best_evalue;
3093 hit_list->low_score = hit_list->hsplist_array[0]->hsp_array[0]->score;
3094 }
3095
3096 /** Given a BlastHitList pointer this function makes the
3097 * hsplist_array larger, up to a maximum size.
3098 * These incremental increases are mostly an issue for users who
3099 * put in a very large number for number of hits to save, but only save a few.
3100 * @param hit_list object containing the hsplist_array to grow [in]
3101 * @return zero on success, 1 if full already.
3102 */
s_Blast_HitListGrowHSPListArray(BlastHitList * hit_list)3103 static Int2 s_Blast_HitListGrowHSPListArray(BlastHitList* hit_list)
3104
3105 {
3106 const int kStartValue = 100; /* default number of hsplist_array to start with. */
3107
3108 ASSERT(hit_list);
3109
3110 if (hit_list->hsplist_current >= hit_list->hsplist_max)
3111 return 1;
3112
3113 if (hit_list->hsplist_current <= 0)
3114 hit_list->hsplist_current = kStartValue;
3115 else
3116 hit_list->hsplist_current = MIN(2*hit_list->hsplist_current, hit_list->hsplist_max);
3117
3118 hit_list->hsplist_array =
3119 (BlastHSPList**) realloc(hit_list->hsplist_array, hit_list->hsplist_current*sizeof(BlastHSPList*));
3120
3121 if (hit_list->hsplist_array == NULL)
3122 return BLASTERR_MEMORY;
3123
3124 return 0;
3125 }
3126
Blast_HitListUpdate(BlastHitList * hit_list,BlastHSPList * hsp_list)3127 Int2 Blast_HitListUpdate(BlastHitList* hit_list,
3128 BlastHSPList* hsp_list)
3129 {
3130 hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
3131
3132 #ifndef NDEBUG
3133 ASSERT(s_BlastCheckBestEvalue(hsp_list) == TRUE); /* NCBI_FAKE_WARNING */
3134 #endif /* _DEBUG */
3135
3136 if (hit_list->hsplist_count < hit_list->hsplist_max) {
3137 /* If the array of HSP lists for this query is not yet allocated,
3138 do it here */
3139 if (hit_list->hsplist_current == hit_list->hsplist_count)
3140 {
3141 Int2 status = s_Blast_HitListGrowHSPListArray(hit_list);
3142 if (status)
3143 return status;
3144 }
3145 /* Just add to the end; sort later */
3146 hit_list->hsplist_array[hit_list->hsplist_count++] = hsp_list;
3147 hit_list->worst_evalue =
3148 MAX(hsp_list->best_evalue, hit_list->worst_evalue);
3149 hit_list->low_score =
3150 MIN(hsp_list->hsp_array[0]->score, hit_list->low_score);
3151 } else {
3152 int evalue_order = 0;
3153 /* make sure the hsp_list is sorted. We actually do not need to sort
3154 the full list: all that we need is the best score. However, the
3155 following code assumes hsp_list->hsp_array[0] has the best score. */
3156 Blast_HSPListSortByEvalue(hsp_list);
3157 /* Compare e-values only with a certain precision */
3158 evalue_order = s_EvalueComp(hsp_list->best_evalue,
3159 hit_list->worst_evalue);
3160 if (evalue_order > 0 ||
3161 (evalue_order == 0 &&
3162 (hsp_list->hsp_array[0]->score < hit_list->low_score))) {
3163 /* This hit list is less significant than any of those already saved;
3164 discard it. Note that newer hits with score and e-value both equal
3165 to the current worst will be saved, at the expense of some older
3166 hit.
3167 */
3168 Blast_HSPListFree(hsp_list);
3169 } else {
3170 if (!hit_list->heapified) {
3171 /* make sure all hsp_list is sorted */
3172 int index;
3173 for (index =0; index < hit_list->hsplist_count; index++)
3174 Blast_HSPListSortByEvalue(hit_list->hsplist_array[index]);
3175
3176 s_CreateHeap(hit_list->hsplist_array, hit_list->hsplist_count,
3177 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3178 hit_list->heapified = TRUE;
3179 }
3180 s_BlastHitListInsertHSPListInHeap(hit_list, hsp_list);
3181 }
3182 }
3183 return 0;
3184 }
3185
3186 Int2
Blast_HitListPurgeNullHSPLists(BlastHitList * hit_list)3187 Blast_HitListPurgeNullHSPLists(BlastHitList* hit_list)
3188 {
3189 Int4 index, index1; /* loop indices. */
3190 Int4 hsplist_count; /* total number of HSPList's to iterate over. */
3191 BlastHSPList** hsplist_array; /* hsplist_array to purge. */
3192
3193 if (hit_list == NULL || hit_list->hsplist_count == 0)
3194 return 0;
3195
3196 hsplist_array = hit_list->hsplist_array;
3197 hsplist_count = hit_list->hsplist_count;
3198
3199 index1 = 0;
3200 for (index=0; index<hsplist_count; index++) {
3201 if (hsplist_array[index]) {
3202 hsplist_array[index1] = hsplist_array[index];
3203 index1++;
3204 }
3205 }
3206
3207 for (index=index1; index<hsplist_count; index++) {
3208 hsplist_array[index] = NULL;
3209 }
3210
3211 hit_list->hsplist_count = index1;
3212
3213 return 0;
3214 }
3215
Blast_HitListSortByEvalue(BlastHitList * hit_list)3216 Int2 Blast_HitListSortByEvalue(BlastHitList* hit_list)
3217 {
3218 if (hit_list && hit_list->hsplist_count > 1) {
3219 qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3220 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3221 }
3222 s_BlastHitListPurge(hit_list);
3223 return 0;
3224 }
3225
3226
3227 /********************************************************************************
3228 Functions manipulating BlastHSPResults
3229 ********************************************************************************/
3230
Blast_HSPResultsNew(Int4 num_queries)3231 BlastHSPResults* Blast_HSPResultsNew(Int4 num_queries)
3232 {
3233 BlastHSPResults* retval = NULL;
3234
3235 retval = (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3236 if ( !retval ) {
3237 return NULL;
3238 }
3239
3240 retval->num_queries = num_queries;
3241 retval->hitlist_array = (BlastHitList**)
3242 calloc(num_queries, sizeof(BlastHitList*));
3243
3244 if ( !retval->hitlist_array ) {
3245 return Blast_HSPResultsFree(retval);
3246 }
3247
3248 return retval;
3249 }
3250
Blast_HSPResultsFree(BlastHSPResults * results)3251 BlastHSPResults* Blast_HSPResultsFree(BlastHSPResults* results)
3252 {
3253 Int4 index;
3254
3255 if (!results)
3256 return NULL;
3257
3258 if (results->hitlist_array)
3259 {
3260 for (index = 0; index < results->num_queries; ++index)
3261 Blast_HitListFree(results->hitlist_array[index]);
3262 sfree(results->hitlist_array);
3263 }
3264 sfree(results);
3265 return NULL;
3266 }
3267
Blast_HSPResultsSortByEvalue(BlastHSPResults * results)3268 Int2 Blast_HSPResultsSortByEvalue(BlastHSPResults* results)
3269 {
3270 Int4 index;
3271 BlastHitList* hit_list;
3272
3273 if (!results)
3274 return 0;
3275
3276 for (index = 0; index < results->num_queries; ++index) {
3277 hit_list = results->hitlist_array[index];
3278 if (hit_list != NULL
3279 && hit_list->hsplist_count > 1
3280 && hit_list->hsplist_array != NULL) {
3281 qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3282 sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3283 }
3284 s_BlastHitListPurge(hit_list);
3285 }
3286 return 0;
3287 }
3288
Blast_HSPResultsReverseSort(BlastHSPResults * results)3289 Int2 Blast_HSPResultsReverseSort(BlastHSPResults* results)
3290 {
3291 Int4 index;
3292 BlastHitList* hit_list;
3293
3294 for (index = 0; index < results->num_queries; ++index) {
3295 hit_list = results->hitlist_array[index];
3296 if (hit_list && hit_list->hsplist_count > 1) {
3297 qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3298 sizeof(BlastHSPList*), s_EvalueCompareHSPListsRev);
3299 }
3300 s_BlastHitListPurge(hit_list);
3301 }
3302 return 0;
3303 }
3304
Blast_HSPResultsReverseOrder(BlastHSPResults * results)3305 Int2 Blast_HSPResultsReverseOrder(BlastHSPResults* results)
3306 {
3307 Int4 index;
3308 BlastHitList* hit_list;
3309
3310 for (index = 0; index < results->num_queries; ++index) {
3311 hit_list = results->hitlist_array[index];
3312 if (hit_list && hit_list->hsplist_count > 1) {
3313 BlastHSPList* hsplist;
3314 Int4 index1;
3315 /* Swap HSP lists: first with last; second with second from the end,
3316 etc. */
3317 for (index1 = 0; index1 < hit_list->hsplist_count/2; ++index1) {
3318 hsplist = hit_list->hsplist_array[index1];
3319 hit_list->hsplist_array[index1] =
3320 hit_list->hsplist_array[hit_list->hsplist_count-index1-1];
3321 hit_list->hsplist_array[hit_list->hsplist_count-index1-1] =
3322 hsplist;
3323 }
3324 }
3325 }
3326 return 0;
3327 }
3328
3329 /** Auxiliary structure for sorting HSPs */
3330 typedef struct SHspWrap {
3331 BlastHSPList *hsplist; /**< The HSPList to which this HSP belongs */
3332 BlastHSP *hsp; /**< HSP described by this structure */
3333 } SHspWrap;
3334
3335 /** callback used to sort a list of encapsulated HSP
3336 * structures in order of decreasing raw score
3337 * -RMH-
3338 */
s_SortHspWrapRawScore(const void * x,const void * y)3339 static int s_SortHspWrapRawScore(const void *x, const void *y)
3340 {
3341 SHspWrap *wrap1 = (SHspWrap *)x;
3342 SHspWrap *wrap2 = (SHspWrap *)y;
3343 if (wrap1->hsp->score > wrap2->hsp->score)
3344 return -1;
3345 if (wrap1->hsp->score < wrap2->hsp->score)
3346 return 1;
3347
3348 return 0;
3349 }
3350
3351 // Masklevel filtering for rmblastn. -RMH-
Blast_HSPResultsApplyMasklevel(BlastHSPResults * results,const BlastQueryInfo * query_info,Int4 masklevel,Int4 query_length)3352 Int2 Blast_HSPResultsApplyMasklevel(BlastHSPResults *results,
3353 const BlastQueryInfo *query_info,
3354 Int4 masklevel, Int4 query_length)
3355 {
3356 Int4 i, j, k, m;
3357 Int4 hsp_count;
3358 SHspWrap *hsp_array;
3359 BlastIntervalTree *tree;
3360
3361 /* set up the interval tree; subject offsets are not needed */
3362
3363 tree = Blast_IntervalTreeInit(0, query_length + 1, 0, 0);
3364
3365 for (i = 0; i < results->num_queries; i++) {
3366 BlastHitList *hitlist = results->hitlist_array[i];
3367 if (hitlist == NULL)
3368 continue;
3369
3370 for (j = hsp_count = 0; j < hitlist->hsplist_count; j++) {
3371 BlastHSPList *hsplist = hitlist->hsplist_array[j];
3372 hsp_count += hsplist->hspcnt;
3373 }
3374
3375 /* empty each HSP into a combined HSP array, then
3376 sort the array by raw score */
3377
3378 hsp_array = (SHspWrap *)malloc(hsp_count * sizeof(SHspWrap));
3379
3380 for (j = k = 0; j < hitlist->hsplist_count; j++) {
3381 BlastHSPList *hsplist = hitlist->hsplist_array[j];
3382 for (m = 0; m < hsplist->hspcnt; k++, m++) {
3383 BlastHSP *hsp = hsplist->hsp_array[m];
3384 hsp_array[k].hsplist = hsplist;
3385 hsp_array[k].hsp = hsp;
3386 }
3387 hsplist->hspcnt = 0;
3388 }
3389
3390 qsort(hsp_array, hsp_count, sizeof(SHspWrap), s_SortHspWrapRawScore);
3391
3392 /* Starting with the best HSP, use the interval tree to
3393 check that the query range of each HSP in the list has
3394 not already been enveloped by too many higher-scoring
3395 HSPs. If this is not the case, add the HSP back into results */
3396
3397 Blast_IntervalTreeReset(tree);
3398
3399 for (j = 0; j < hsp_count; j++) {
3400 BlastHSPList *hsplist = hsp_array[j].hsplist;
3401 BlastHSP *hsp = hsp_array[j].hsp;
3402
3403 if (BlastIntervalTreeMasksHSP(tree,
3404 hsp, query_info, 0, masklevel)) {
3405 Blast_HSPFree(hsp);
3406 }
3407 else {
3408 BlastIntervalTreeAddHSP(hsp, tree, query_info, eQueryOnlyStrandIndifferent);
3409 Blast_HSPListSaveHSP(hsplist, hsp);
3410
3411 /* the first HSP added back into an HSPList
3412 automatically has the best e-value */
3413 // RMH: hmmmmmmm
3414 if (hsplist->hspcnt == 1)
3415 hsplist->best_evalue = hsp->evalue;
3416 }
3417 }
3418 sfree(hsp_array);
3419
3420 /* remove any HSPLists that are still empty after the
3421 culling process. Sort any remaining lists by score */
3422 for (j = 0; j < hitlist->hsplist_count; j++) {
3423 BlastHSPList *hsplist = hitlist->hsplist_array[j];
3424 if (hsplist->hspcnt == 0) {
3425 hitlist->hsplist_array[j] =
3426 Blast_HSPListFree(hitlist->hsplist_array[j]);
3427 }
3428 else {
3429 Blast_HSPListSortByScore(hitlist->hsplist_array[j]);
3430 }
3431 }
3432 Blast_HitListPurgeNullHSPLists(hitlist);
3433 }
3434
3435 tree = Blast_IntervalTreeFree(tree);
3436 return 0;
3437 }
3438
Blast_HSPResultsInsertHSPList(BlastHSPResults * results,BlastHSPList * hsp_list,Int4 hitlist_size)3439 Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results,
3440 BlastHSPList* hsp_list, Int4 hitlist_size)
3441 {
3442 if (!hsp_list || hsp_list->hspcnt == 0)
3443 return 0;
3444
3445 ASSERT(hsp_list->query_index < results->num_queries);
3446
3447 if (!results->hitlist_array[hsp_list->query_index]) {
3448 results->hitlist_array[hsp_list->query_index] =
3449 Blast_HitListNew(hitlist_size);
3450 }
3451 Blast_HitListUpdate(results->hitlist_array[hsp_list->query_index],
3452 hsp_list);
3453 return 0;
3454 }
3455
3456 BlastHSPResults**
PHIBlast_HSPResultsSplit(const BlastHSPResults * results,const SPHIQueryInfo * pattern_info)3457 PHIBlast_HSPResultsSplit(const BlastHSPResults* results,
3458 const SPHIQueryInfo* pattern_info)
3459 {
3460 BlastHSPResults* *phi_results = NULL;
3461 int pattern_index, hit_index;
3462 int num_patterns;
3463 BlastHitList* hit_list = NULL;
3464 BlastHSPList** hsplist_array; /* Temporary per-pattern HSP lists. */
3465
3466 if (!pattern_info || pattern_info->num_patterns == 0)
3467 return NULL;
3468
3469 num_patterns = pattern_info->num_patterns;
3470
3471 phi_results =
3472 (BlastHSPResults**) calloc(num_patterns, sizeof(BlastHSPResults*));
3473
3474 if (!results || !results->hitlist_array[0])
3475 return phi_results; /* An empty results set is expected if no hits. */
3476
3477 hsplist_array = (BlastHSPList**) calloc(num_patterns, sizeof(BlastHSPList*));
3478 hit_list = results->hitlist_array[0];
3479
3480 for (hit_index = 0; hit_index < hit_list->hsplist_count; ++hit_index) {
3481 BlastHSPList* hsp_list = hit_list->hsplist_array[hit_index];
3482 int hsp_index;
3483 /* Copy HSPs corresponding to different pattern occurrences into
3484 separate HSP lists. */
3485 for (hsp_index = 0; hsp_index < hsp_list->hspcnt; ++hsp_index) {
3486 BlastHSP* hsp = s_BlastHSPCopy(hsp_list->hsp_array[hsp_index]);
3487 pattern_index = hsp->pat_info->index;
3488 if (!hsplist_array[pattern_index])
3489 hsplist_array[pattern_index] = Blast_HSPListNew(0);
3490 hsplist_array[pattern_index]->oid = hsp_list->oid;
3491 Blast_HSPListSaveHSP(hsplist_array[pattern_index], hsp);
3492 }
3493
3494 /* Save HSP lists corresponding to different pattern occurrences
3495 in separate results structures. */
3496 for (pattern_index = 0; pattern_index < num_patterns;
3497 ++pattern_index) {
3498 if (hsplist_array[pattern_index]) {
3499 if (!phi_results[pattern_index])
3500 phi_results[pattern_index] = Blast_HSPResultsNew(1);
3501 Blast_HSPResultsInsertHSPList(phi_results[pattern_index],
3502 hsplist_array[pattern_index],
3503 hit_list->hsplist_max);
3504 hsplist_array[pattern_index] = NULL;
3505 }
3506 }
3507 }
3508
3509 sfree(hsplist_array);
3510
3511 /* Sort HSPLists in each of the results structures by e-value. */
3512 for (pattern_index = 0; pattern_index < num_patterns; ++pattern_index) {
3513 Blast_HSPResultsSortByEvalue(phi_results[pattern_index]);
3514 }
3515
3516 return phi_results;
3517 }
3518
3519 BlastHSPResults*
Blast_HSPResultsFromHSPStream(BlastHSPStream * hsp_stream,size_t num_queries,SBlastHitsParameters * bhp)3520 Blast_HSPResultsFromHSPStream(BlastHSPStream* hsp_stream,
3521 size_t num_queries,
3522 SBlastHitsParameters* bhp)
3523 {
3524 BlastHSPResults* retval = NULL;
3525 BlastHSPList* hsp_list = NULL;
3526
3527 retval = Blast_HSPResultsNew((Int4) num_queries);
3528
3529 while (BlastHSPStreamRead(hsp_stream, &hsp_list) != kBlastHSPStream_Eof) {
3530 Blast_HSPResultsInsertHSPList(retval, hsp_list,
3531 bhp->prelim_hitlist_size);
3532 }
3533 SBlastHitsParametersFree(bhp);
3534 return retval;
3535 }
3536
3537 /** Comparison function for sorting HSP lists in increasing order of the
3538 * number of HSPs in a hit. Needed for s_TrimResultsByTotalHSPLimit below.
3539 * @param v1 Pointer to the first HSP list [in]
3540 * @param v2 Pointer to the second HSP list [in]
3541 */
3542 static int
s_CompareHsplistHspcnt(const void * v1,const void * v2)3543 s_CompareHsplistHspcnt(const void* v1, const void* v2)
3544 {
3545 BlastHSPList* r1 = *((BlastHSPList**) v1);
3546 BlastHSPList* r2 = *((BlastHSPList**) v2);
3547
3548 if (r1->hspcnt < r2->hspcnt)
3549 return -1;
3550 else if (r1->hspcnt > r2->hspcnt)
3551 return 1;
3552 else
3553 return 0;
3554 }
3555
3556 /** Removes extra results if a limit is imposed on the total number of HSPs
3557 * returned. If the search involves multiple query sequences, the total HSP
3558 * limit is applied separately to each query.
3559 * The trimming algorithm makes sure that at least 1 HSP is returned for each
3560 * database sequence hit. Suppose results for a given query consist of HSP
3561 * lists for N database sequences, and the limit is T. HSP lists are sorted in
3562 * order of increasing number of HSPs in each list. Then the algorithm proceeds
3563 * by leaving at most i*T/N HSPs for the first i HSP lists, for every
3564 * i = 1, 2, ..., N.
3565 * @param results Results after preliminary stage of a BLAST search [in|out]
3566 * @param total_hsp_limit Limit on total number of HSPs [in]
3567 * @return TRUE if HSP limit has been exceeded, FALSE otherwise.
3568 */
3569 static Boolean
s_TrimResultsByTotalHSPLimit(BlastHSPResults * results,Uint4 total_hsp_limit)3570 s_TrimResultsByTotalHSPLimit(BlastHSPResults* results, Uint4 total_hsp_limit)
3571 {
3572 int query_index;
3573 Boolean hsp_limit_exceeded = FALSE;
3574
3575 if (total_hsp_limit == 0) {
3576 return hsp_limit_exceeded;
3577 }
3578
3579 for (query_index = 0; query_index < results->num_queries; ++query_index) {
3580 BlastHitList* hit_list = NULL;
3581 BlastHSPList** hsplist_array = NULL;
3582 Int4 hsplist_count = 0;
3583 int subj_index;
3584
3585 if ( !(hit_list = results->hitlist_array[query_index]) )
3586 continue;
3587 /* The count of HSPs is separate for each query. */
3588 hsplist_count = hit_list->hsplist_count;
3589
3590 hsplist_array = (BlastHSPList**)
3591 malloc(hsplist_count*sizeof(BlastHSPList*));
3592
3593 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3594 hsplist_array[subj_index] = hit_list->hsplist_array[subj_index];
3595 }
3596
3597 qsort((void*)hsplist_array, hsplist_count,
3598 sizeof(BlastHSPList*), s_CompareHsplistHspcnt);
3599
3600 {
3601 Int4 tot_hsps = 0; /* total number of HSPs */
3602 Uint4 hsp_per_seq = MAX(1, total_hsp_limit/hsplist_count);
3603 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3604 Int4 allowed_hsp_num = ((subj_index+1)*hsp_per_seq) - tot_hsps;
3605 BlastHSPList* hsp_list = hsplist_array[subj_index];
3606 if (hsp_list->hspcnt > allowed_hsp_num) {
3607 /* Free the extra HSPs */
3608 int hsp_index;
3609 for (hsp_index = allowed_hsp_num;
3610 hsp_index < hsp_list->hspcnt; ++hsp_index) {
3611 Blast_HSPFree(hsp_list->hsp_array[hsp_index]);
3612 }
3613 hsp_list->hspcnt = allowed_hsp_num;
3614 hsp_limit_exceeded = TRUE;
3615 }
3616 tot_hsps += hsp_list->hspcnt;
3617 }
3618 }
3619 sfree(hsplist_array);
3620 }
3621
3622 return hsp_limit_exceeded;
3623 }
3624
3625 typedef struct BlastHSPwOid {
3626 BlastHSP * hsp;
3627 Int4 oid;
3628 } BlastHSPwOid;
3629
3630 static int
s_CompareScoreHSPwOid(const void * v1,const void * v2)3631 s_CompareScoreHSPwOid(const void* v1, const void* v2)
3632 {
3633 BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3634 BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3635 return (s_EvalueCompareHSPs(&(r1->hsp), &(r2->hsp)));
3636 }
3637
3638 static int
s_CompareOidHSPwOid(const void * v1,const void * v2)3639 s_CompareOidHSPwOid(const void* v1, const void* v2)
3640 {
3641 BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3642 BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3643 return (r1->oid > r2->oid);
3644 }
3645
3646
3647 /* extended version of the above function. Provides information about query number
3648 * which exceeded number of HSP.
3649 * The hsp_limit_exceeded is of results->num_queries size guarantied.
3650 */
3651 static Boolean
s_TrimResultsByTotalHSPLimitEx(BlastHSPResults * results,Uint4 total_hsp_limit,Boolean * hsp_limit_exceeded)3652 s_TrimResultsByTotalHSPLimitEx(BlastHSPResults* results,
3653 Uint4 total_hsp_limit,
3654 Boolean *hsp_limit_exceeded)
3655 {
3656 int query_index;
3657 Boolean any_hsp_limit_exceeded = FALSE;
3658
3659 if (total_hsp_limit == 0) {
3660 return any_hsp_limit_exceeded;
3661 }
3662
3663 for (query_index = 0; query_index < results->num_queries; ++query_index) {
3664 BlastHitList* hit_list = NULL;
3665 Int4 hsplist_count = 0;
3666 int subj_index;
3667 Int4 total_hsps = 0; /* total number of HSPs */
3668
3669 if( hsp_limit_exceeded) hsp_limit_exceeded[query_index] = FALSE;
3670
3671 if ( !(hit_list = results->hitlist_array[query_index]) )
3672 continue;
3673 /* The count of HSPs is separate for each query. */
3674 hsplist_count = hit_list->hsplist_count;
3675
3676 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3677 total_hsps += hit_list->hsplist_array[subj_index]->hspcnt;
3678 }
3679
3680 if(total_hsps > total_hsp_limit)
3681 {
3682 BlastHSPwOid * everything_list = (BlastHSPwOid *) malloc(total_hsps * sizeof(BlastHSPwOid));
3683 BlastHSPList * subj_list;
3684 int hsp_counter = 0;
3685 int max_hit_list_size = hit_list->hsplist_max;
3686 if( hsp_limit_exceeded) {
3687 hsp_limit_exceeded[query_index] = TRUE;
3688 any_hsp_limit_exceeded = TRUE;
3689 }
3690 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3691 int subj_hsp;
3692 BlastHSP ** hsps_per_subj;
3693 subj_list = hit_list->hsplist_array[subj_index];
3694 hsps_per_subj = subj_list->hsp_array;
3695 for(subj_hsp=0; subj_hsp < subj_list->hspcnt; ++subj_hsp) {
3696 everything_list[hsp_counter].hsp = hsps_per_subj[subj_hsp];
3697 everything_list[hsp_counter].oid = subj_list->oid;
3698 hsps_per_subj[subj_hsp] = NULL;
3699 hsp_counter ++;
3700 }
3701
3702 }
3703 results->hitlist_array[query_index] = Blast_HitListFree(hit_list);
3704 qsort((void*)everything_list, total_hsps, sizeof(BlastHSPwOid), s_CompareScoreHSPwOid);
3705
3706 for(hsp_counter = total_hsp_limit; hsp_counter < total_hsps ; ++hsp_counter) {
3707 everything_list[hsp_counter].hsp = Blast_HSPFree(everything_list[hsp_counter].hsp);
3708 everything_list[hsp_counter].oid = 0x7fffff;
3709 }
3710
3711 qsort((void*)everything_list, total_hsp_limit, sizeof(BlastHSPwOid), s_CompareOidHSPwOid);
3712 subj_list = NULL;
3713 for(hsp_counter = 0; hsp_counter < total_hsp_limit; ++ hsp_counter)
3714 {
3715 int hsp_counter_start = hsp_counter;
3716 int hspcnt;
3717 int num_hsp;
3718
3719 while ((everything_list[hsp_counter].oid == everything_list[hsp_counter+1].oid) &&
3720 (hsp_counter + 1 < total_hsp_limit)) {
3721 hsp_counter ++;
3722 }
3723 num_hsp = hsp_counter -hsp_counter_start + 1;
3724 subj_list = Blast_HSPListNew(num_hsp);
3725 subj_list->oid = everything_list[hsp_counter].oid;
3726 subj_list->query_index = query_index;
3727
3728 for(hspcnt = 0; hspcnt < num_hsp; ++hspcnt) {
3729 Blast_HSPListSaveHSP(subj_list, everything_list[hsp_counter_start + hspcnt].hsp);
3730 }
3731
3732 Blast_HSPResultsInsertHSPList(results,subj_list, max_hit_list_size );
3733 }
3734 free(everything_list);
3735 }
3736 }
3737
3738 return any_hsp_limit_exceeded;
3739 }
3740
3741 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3742 Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream* hsp_stream,
3743 Uint4 num_queries,
3744 SBlastHitsParameters* hit_param,
3745 Uint4 max_num_hsps,
3746 Boolean* removed_hsps)
3747 {
3748 Boolean rm_hsps = FALSE;
3749 BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3750 num_queries,
3751 hit_param);
3752
3753 rm_hsps = s_TrimResultsByTotalHSPLimit(retval, max_num_hsps);
3754 if (removed_hsps) {
3755 *removed_hsps = rm_hsps;
3756 }
3757 return retval;
3758 }
3759 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3760 Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream* hsp_stream,
3761 Uint4 num_queries,
3762 SBlastHitsParameters* hit_param,
3763 Uint4 max_num_hsps,
3764 Boolean *removed_hsps)
3765 {
3766 Boolean rm_hsps = FALSE;
3767 BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3768 num_queries,
3769 hit_param);
3770
3771 rm_hsps = s_TrimResultsByTotalHSPLimitEx(retval, max_num_hsps,removed_hsps);
3772 if (removed_hsps) {
3773 *removed_hsps = rm_hsps;
3774 }
3775 return retval;
3776 }
3777
3778
Blast_HSPChainNew(void)3779 BlastHSPChain* Blast_HSPChainNew(void)
3780 {
3781 BlastHSPChain* chain = calloc(1, sizeof(BlastHSPChain));
3782 return chain;
3783 }
3784
Blast_HSPChainFree(BlastHSPChain * ch)3785 BlastHSPChain* Blast_HSPChainFree(BlastHSPChain* ch)
3786 {
3787 Int4 i;
3788
3789 if (!ch) {
3790 return NULL;
3791 }
3792
3793 for (i = 0;i < ch->num_hsps;i++) {
3794 Blast_HSPFree(ch->hsp_array[i]);
3795 }
3796 sfree(ch->hsp_array);
3797 sfree(ch);
3798
3799 return NULL;
3800 }
3801
Blast_MappingResultsNew(void)3802 BlastMappingResults* Blast_MappingResultsNew(void)
3803 {
3804 BlastMappingResults* retval = calloc(1, sizeof(BlastMappingResults));
3805 return retval;
3806 }
3807
Blast_MappingResultsFree(BlastMappingResults * results)3808 BlastMappingResults* Blast_MappingResultsFree(BlastMappingResults* results)
3809 {
3810 if (!results) {
3811 return NULL;
3812 }
3813
3814 if (results->chain_array) {
3815 Int4 i;
3816 for (i = 0;i < results->num_results;i++) {
3817 Blast_HSPChainFree(results->chain_array[i]);
3818 }
3819 sfree(results->chain_array);
3820 }
3821 sfree(results);
3822
3823 return NULL;
3824 }
3825