1 /* $Id: blast_hits.c 594743 2019-10-09 11:00:47Z fongah2 $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  *
27  */
28 
29 /** @file blast_hits.c
30  * BLAST functions for saving hits after the (preliminary) gapped alignment
31  */
32 
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_hits.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_def.h>
37 #include <algo/blast/core/blast_hspstream.h>
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41 
42 
43 Int4
GetPrelimHitlistSize(Int4 hitlist_size,Int4 compositionBasedStats,Boolean gapped_calculation)44 GetPrelimHitlistSize(Int4 hitlist_size, Int4 compositionBasedStats, Boolean gapped_calculation)
45 {
46     Int4 prelim_hitlist_size = hitlist_size;
47     char * ADAPTIVE_CBS_ENV = getenv("ADAPTIVE_CBS");
48     if (compositionBasedStats) {
49     	if(ADAPTIVE_CBS_ENV != NULL) {
50     		if(hitlist_size < 1000) {
51     			prelim_hitlist_size = MAX(prelim_hitlist_size + 1000, 1500);
52     		}
53     		else {
54     			prelim_hitlist_size = prelim_hitlist_size*2 + 50;
55     		}
56     	}
57     	else {
58     		if(hitlist_size <= 500) {
59     			prelim_hitlist_size = 1050;
60     		}
61     		else {
62     			prelim_hitlist_size = prelim_hitlist_size*2 + 50;
63     		}
64 
65     	}
66     }
67     else if (gapped_calculation) {
68          prelim_hitlist_size = MIN(MAX(2 * prelim_hitlist_size, 10), prelim_hitlist_size + 50);
69     }
70     return prelim_hitlist_size;
71 }
72 
73 
74 NCBI_XBLAST_EXPORT
SBlastHitsParametersNew(const BlastHitSavingOptions * hit_options,const BlastExtensionOptions * ext_options,const BlastScoringOptions * scoring_options,SBlastHitsParameters ** retval)75 Int2 SBlastHitsParametersNew(const BlastHitSavingOptions* hit_options,
76                              const BlastExtensionOptions* ext_options,
77                              const BlastScoringOptions* scoring_options,
78                              SBlastHitsParameters* *retval)
79 {
80        ASSERT(retval);
81        *retval = NULL;
82 
83        if (hit_options == NULL ||
84            ext_options == NULL ||
85            scoring_options == NULL)
86            return 1;
87 
88        *retval = (SBlastHitsParameters*) malloc(sizeof(SBlastHitsParameters));
89        if (*retval == NULL)
90            return 2;
91 
92        (*retval)->prelim_hitlist_size = GetPrelimHitlistSize(hit_options->hitlist_size,
93     		   ext_options->compositionBasedStats, scoring_options->gapped_calculation);
94 
95        (*retval)->hsp_num_max = BlastHspNumMax(scoring_options->gapped_calculation, hit_options);
96 
97        return 0;
98 }
99 
100 SBlastHitsParameters*
SBlastHitsParametersDup(const SBlastHitsParameters * hit_params)101 SBlastHitsParametersDup(const SBlastHitsParameters* hit_params)
102 {
103     SBlastHitsParameters* retval = (SBlastHitsParameters*)
104         malloc(sizeof(SBlastHitsParameters));
105 
106     if ( !retval ) {
107         return NULL;
108     }
109 
110     memcpy((void*)retval, (void*) hit_params, sizeof(SBlastHitsParameters));
111     return retval;
112 }
113 
114 NCBI_XBLAST_EXPORT
SBlastHitsParametersFree(SBlastHitsParameters * param)115 SBlastHitsParameters* SBlastHitsParametersFree(SBlastHitsParameters* param)
116 {
117        if (param)
118        {
119                sfree(param);
120        }
121        return NULL;
122 }
123 
124 
125 
126 /********************************************************************************
127           Functions manipulating BlastHSP's
128 ********************************************************************************/
129 
Blast_HSPFree(BlastHSP * hsp)130 BlastHSP* Blast_HSPFree(BlastHSP* hsp)
131 {
132    if (!hsp)
133       return NULL;
134    hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
135    hsp->map_info = BlastHSPMappingInfoFree(hsp->map_info);
136    sfree(hsp->pat_info);
137    sfree(hsp);
138    return NULL;
139 }
140 
Blast_HSPNew(void)141 BlastHSP* Blast_HSPNew(void)
142 {
143      BlastHSP* new_hsp = (BlastHSP*) calloc(1, sizeof(BlastHSP));
144      return new_hsp;
145 }
146 
147 /*
148    Comments in blast_hits.h
149 */
150 Int2
Blast_HSPInit(Int4 query_start,Int4 query_end,Int4 subject_start,Int4 subject_end,Int4 query_gapped_start,Int4 subject_gapped_start,Int4 query_context,Int2 query_frame,Int2 subject_frame,Int4 score,GapEditScript ** gap_edit,BlastHSP ** ret_hsp)151 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start,
152               Int4 subject_end, Int4 query_gapped_start,
153               Int4 subject_gapped_start, Int4 query_context,
154               Int2 query_frame, Int2 subject_frame, Int4 score,
155               GapEditScript* *gap_edit, BlastHSP* *ret_hsp)
156 {
157    BlastHSP* new_hsp = NULL;
158 
159    if (!ret_hsp)
160       return -1;
161 
162    new_hsp = Blast_HSPNew();
163 
164    *ret_hsp = NULL;
165 
166    if (new_hsp == NULL)
167 	return BLASTERR_MEMORY;
168 
169 
170    new_hsp->query.offset = query_start;
171    new_hsp->subject.offset = subject_start;
172    new_hsp->query.end = query_end;
173    new_hsp->subject.end = subject_end;
174    new_hsp->query.gapped_start = query_gapped_start;
175    new_hsp->subject.gapped_start = subject_gapped_start;
176    new_hsp->context = query_context;
177    new_hsp->query.frame = query_frame;
178    new_hsp->subject.frame = subject_frame;
179    new_hsp->score = score;
180    if (gap_edit && *gap_edit)
181    { /* If this is non-NULL transfer ownership. */
182         new_hsp->gap_info = *gap_edit;
183         *gap_edit = NULL;
184    }
185 
186    *ret_hsp = new_hsp;
187 
188    return 0;
189 }
190 
191 
BlastHSPMappingInfoFree(BlastHSPMappingInfo * info)192 BlastHSPMappingInfo* BlastHSPMappingInfoFree(BlastHSPMappingInfo* info)
193 {
194    if (!info) {
195        return NULL;
196    }
197 
198    info->edits = JumperEditsBlockFree(info->edits);
199    if (info->subject_overhangs) {
200        SequenceOverhangsFree(info->subject_overhangs);
201    }
202    sfree(info);
203 
204    return NULL;
205 }
206 
BlastHSPMappingInfoNew(void)207 BlastHSPMappingInfo* BlastHSPMappingInfoNew(void)
208 {
209     BlastHSPMappingInfo* retval = calloc(1, sizeof(BlastHSPMappingInfo));
210     return retval;
211 }
212 
BlastHspNumMax(Boolean gapped_calculation,const BlastHitSavingOptions * options)213 Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions* options)
214 {
215    Int4 retval=0;
216 
217    /* per-subject HSP limits do not apply to gapped searches; JIRA SB-616 */
218    if (options->hsp_num_max <= 0)
219    {
220       retval = INT4_MAX;
221    }
222    else
223    {
224       retval = options->hsp_num_max;
225    }
226 
227    return retval;
228 }
229 
230 /** Copies all contents of a BlastHSP structure. Used in PHI BLAST for splitting
231  * results corresponding to different pattern occurrences in query.
232  * @param hsp Original HSP [in]
233  * @return New HSP, copied from the original.
234  */
235 static BlastHSP*
s_BlastHSPCopy(const BlastHSP * hsp)236 s_BlastHSPCopy(const BlastHSP* hsp)
237 {
238     BlastHSP* new_hsp = NULL;
239     /* Do not pass the edit script, because we don't want to tranfer
240        ownership. */
241     Blast_HSPInit(hsp->query.offset, hsp->query.end, hsp->subject.offset,
242                   hsp->subject.end, hsp->query.gapped_start,
243                   hsp->subject.gapped_start, hsp->context,
244                   hsp->query.frame, hsp->subject.frame, hsp->score,
245                   NULL, &new_hsp);
246     new_hsp->evalue = hsp->evalue;
247     new_hsp->num = hsp->num;
248     new_hsp->num_ident = hsp->num_ident;
249     new_hsp->bit_score = hsp->bit_score;
250     new_hsp->comp_adjustment_method = hsp->comp_adjustment_method;
251     if (hsp->gap_info) {
252         new_hsp->gap_info = GapEditScriptDup(hsp->gap_info);
253     }
254 
255     if (hsp->pat_info) {
256         /* Copy this HSP's pattern data. */
257         new_hsp->pat_info =
258             (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
259     }
260     return new_hsp;
261 }
262 
263 /* Make a deep copy of an HSP */
Blast_HSPClone(const BlastHSP * hsp)264 BlastHSP* Blast_HSPClone(const BlastHSP* hsp)
265 {
266     BlastHSP* retval = NULL;
267 
268     if (!hsp) {
269         return NULL;
270     }
271 
272     retval = Blast_HSPNew();
273     if (retval) {
274         retval->score = hsp->score;
275         retval->num_ident = hsp->num_ident;
276         memcpy(&retval->query, &hsp->query, sizeof(BlastSeg));
277         memcpy(&retval->subject, &hsp->subject, sizeof(BlastSeg));
278         retval->context = hsp->context;
279         retval->evalue = hsp->evalue;
280         retval->bit_score = hsp->bit_score;
281         retval->num = hsp->num;
282         retval->comp_adjustment_method = hsp->comp_adjustment_method;
283         retval->num_positives = hsp->num_positives;
284 
285         /* copy gapped traceback */
286         if (hsp->gap_info) {
287             retval->gap_info = GapEditScriptDup(hsp->gap_info);
288             if (!retval->gap_info) {
289                 Blast_HSPFree(retval);
290                 return NULL;
291             }
292         }
293 
294         /* copy short read mapping data */
295         if (hsp->map_info) {
296             retval->map_info = BlastHSPMappingInfoNew();
297             if (!retval->map_info) {
298                 Blast_HSPFree(retval);
299                 return NULL;
300             }
301             retval->map_info->edits =
302                 JumperEditsBlockDup(hsp->map_info->edits);
303             if (!retval->map_info->edits) {
304                 Blast_HSPFree(retval);
305                 return NULL;
306             }
307             retval->map_info->left_edge = hsp->map_info->left_edge;
308             retval->map_info->right_edge = hsp->map_info->right_edge;
309 
310             if (hsp->map_info->subject_overhangs) {
311                 SequenceOverhangs* old = hsp->map_info->subject_overhangs;
312                 SequenceOverhangs* copy = calloc(1, sizeof(SequenceOverhangs));
313                 if (!copy) {
314                     Blast_HSPFree(retval);
315                     return NULL;
316                 }
317 
318                 if (old->left && old->left_len > 0) {
319                     copy->left_len = old->left_len;
320                     copy->left = malloc(copy->left_len * sizeof(Uint1));
321                     if (!copy->left) {
322                         SequenceOverhangsFree(copy);
323                         Blast_HSPFree(retval);
324                         return NULL;
325                     }
326                     memcpy(copy->left, old->left,
327                            copy->left_len * sizeof(Uint1));
328                 }
329 
330                 if (old->right && old->right_len > 0) {
331                     copy->right_len = old->right_len;
332                     copy->right = malloc(copy->right_len * sizeof(Uint1));
333                     if (!copy->right) {
334                         SequenceOverhangsFree(copy);
335                         Blast_HSPFree(retval);
336                     }
337                     memcpy(copy->right, old->right,
338                            copy->right_len * sizeof(Uint1));
339                 }
340 
341                 retval->map_info->subject_overhangs = copy;
342             }
343         }
344 
345         /* copy phi-blast pattern data */
346         if (hsp->pat_info) {
347             retval->pat_info =
348                 (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
349         }
350     }
351 
352     return retval;
353 }
354 
355 /** Count the number of occurrences of pattern in sequence, which
356  * do not overlap by more than half the pattern match length.
357  * @param query_info Query information structure, containing pattern info. [in]
358  */
359 Int4
PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo * query_info)360 PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo *query_info)
361 {
362     Int4 index; /*loop index*/
363     Int4 lastEffectiveOccurrence; /*last nonoverlapping occurrence*/
364     Int4 count; /* Count of effective (nonoverlapping) occurrences */
365     Int4 min_pattern_length;
366     SPHIQueryInfo* pat_info;
367 
368     ASSERT(query_info && query_info->pattern_info && query_info->contexts);
369 
370     pat_info = query_info->pattern_info;
371 
372     if (pat_info->num_patterns <= 1)
373         return pat_info->num_patterns;
374 
375     /* Minimal length of a pattern is saved in the length adjustment field. */
376     min_pattern_length = query_info->contexts[0].length_adjustment;
377 
378     count = 1;
379     lastEffectiveOccurrence = pat_info->occurrences[0].offset;
380     for(index = 1; index < pat_info->num_patterns; ++index) {
381         if (((pat_info->occurrences[index].offset - lastEffectiveOccurrence) * 2)
382             > min_pattern_length) {
383             lastEffectiveOccurrence = pat_info->occurrences[index].offset;
384             ++count;
385         }
386     }
387 
388     return count;
389 }
390 
391 
392 /** Calculate e-value for an HSP found by PHI BLAST.
393  * @param hsp An HSP found by PHI BLAST [in]
394  * @param sbp Scoring block with statistical parameters [in]
395  * @param query_info Structure containing information about pattern counts [in]
396  * @param pattern_blk Structure containing counts of PHI pattern hits [in]
397  */
398 static void
s_HSPPHIGetEvalue(BlastHSP * hsp,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)399 s_HSPPHIGetEvalue(BlastHSP* hsp, BlastScoreBlk* sbp,
400                   const BlastQueryInfo* query_info,
401                   const SPHIPatternSearchBlk* pattern_blk)
402 {
403    double paramC;
404    double Lambda;
405 
406    ASSERT(query_info && hsp && sbp && pattern_blk);
407 
408    paramC = sbp->kbp[0]->paramC;
409    Lambda = sbp->kbp[0]->Lambda;
410 
411    /* We have the actual number of occurrences of pattern in db. */
412    hsp->evalue = paramC*(1+Lambda*hsp->score)*
413                 PhiBlastGetEffectiveNumberOfPatterns(query_info)*
414                 pattern_blk->num_patterns_db*
415                 exp(-Lambda*hsp->score);
416 }
417 
418 /** Update HSP data after reevaluation with ambiguities. In particular this
419  * function calculates number of identities and checks if the percent identity
420  * criterion is satisfied.
421  * @param hsp HSP to update [in] [out]
422  * @param gapped Is this a gapped search? [in]
423  * @param cutoff_score Cutoff score for saving the HSP [in]
424  * @param score New score [in]
425  * @param query_start Start of query sequence [in]
426  * @param subject_start Start of subject sequence [in]
427  * @param best_q_start Pointer to start of the new alignment in query [in]
428  * @param best_q_end Pointer to end of the new alignment in query [in]
429  * @param best_s_start Pointer to start of the new alignment in subject [in]
430  * @param best_s_end Pointer to end of the new alignment in subject [in]
431  * @param best_start_esp_index index of the edit script array where the new alignment
432  *                       starts. [in]
433  * @param best_end_esp_index index in the edit script array where the new alignment
434  *                     ends. [in]
435  * @param best_end_esp_num Number of edit operations in the last edit script,
436  *                         that are included in the alignment. [in]
437  * @return TRUE if HSP is scheduled to be deleted.
438  */
439 static Boolean
s_UpdateReevaluatedHSP(BlastHSP * hsp,Boolean gapped,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end,int best_start_esp_index,int best_end_esp_index,int best_end_esp_num)440 s_UpdateReevaluatedHSP(BlastHSP* hsp, Boolean gapped,
441                        Int4 cutoff_score,
442                        Int4 score, const Uint1* query_start, const Uint1* subject_start,
443                        const Uint1* best_q_start, const Uint1* best_q_end,
444                        const Uint1* best_s_start, const Uint1* best_s_end,
445                        int best_start_esp_index,
446                        int best_end_esp_index,
447                        int best_end_esp_num)
448 {
449     Boolean delete_hsp = TRUE;
450 
451     hsp->score = score;
452 
453     if (hsp->score >= cutoff_score) {
454         /* Update all HSP offsets. */
455         hsp->query.offset = best_q_start - query_start;
456         hsp->query.end = hsp->query.offset + best_q_end - best_q_start;
457         hsp->subject.offset = best_s_start - subject_start;
458         hsp->subject.end = hsp->subject.offset + best_s_end - best_s_start;
459 
460         if (gapped) {
461             int last_num=hsp->gap_info->size - 1;
462             if (best_end_esp_index != last_num|| best_start_esp_index > 0)
463             {
464                 GapEditScript* esp_temp = GapEditScriptNew(best_end_esp_index-best_start_esp_index+1);
465                 GapEditScriptPartialCopy(esp_temp, 0, hsp->gap_info, best_start_esp_index, best_end_esp_index);
466                 hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
467                 hsp->gap_info = esp_temp;
468             }
469             last_num = hsp->gap_info->size - 1;
470             hsp->gap_info->num[last_num] = best_end_esp_num;
471             ASSERT(best_end_esp_num >= 0);
472         }
473         delete_hsp = FALSE;
474     }
475 
476     return delete_hsp;
477 }
478 
Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP * hsp,const Uint1 * q,const Int4 qlen,const Uint1 * s,const Int4 slen,const BlastHitSavingParameters * hit_params,const BlastScoringParameters * score_params,const BlastScoreBlk * sbp)479 Boolean Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP* hsp,
480            const Uint1* q, const Int4 qlen,
481            const Uint1* s, const Int4 slen,
482            const BlastHitSavingParameters* hit_params,
483            const BlastScoringParameters* score_params,
484            const BlastScoreBlk* sbp)
485 {
486    Int4 sum, score, gap_open, gap_extend;
487    Int4 index; /* loop index */
488    Int4 qp, sp, ext;
489 
490    int best_start_esp_index = 0;
491    int best_end_esp_index = 0;
492    int current_start_esp_index = 0;
493    int best_end_esp_num = 0;
494    GapEditScript* esp;  /* Used to hold GapEditScript of hsp->gap_info */
495 
496    const Uint1* best_q_start; /* Start of the best scoring part in query. */
497    const Uint1* best_s_start; /* Start of the best scoring part in subject. */
498    const Uint1* best_q_end;   /* End of the best scoring part in query. */
499    const Uint1* best_s_end;   /* End of the best scoring part in subject. */
500 
501 
502    const Uint1* current_q_start; /* Start of the current part of the alignment in
503                            query. */
504    const Uint1* current_s_start; /* Start of the current part of the alignment in
505                            subject. */
506 
507    const Uint1* query,* subject;
508    Int4** matrix;
509    Int2 factor = 1;
510    const Uint1 kResidueMask = 0x0f;
511    Int4 cutoff_score = hit_params->cutoffs[hsp->context].cutoff_score;
512 
513    matrix = sbp->matrix->data;
514 
515    /* For a non-affine greedy case, calculate the real value of the gap
516       extension penalty. Multiply all scores by 2 if it is not integer. */
517    if (score_params->gap_open == 0 && score_params->gap_extend == 0) {
518       if (score_params->reward % 2 == 1)
519          factor = 2;
520       gap_open = 0;
521       gap_extend =
522          (score_params->reward - 2*score_params->penalty) * factor / 2;
523    } else {
524       gap_open = score_params->gap_open;
525       gap_extend = score_params->gap_extend;
526    }
527 
528    query = q + hsp->query.offset;
529    subject = s + hsp->subject.offset;
530    score = 0;
531    sum = 0;
532 
533    /* Point all pointers to the beginning of the alignment. */
534    best_q_start = best_q_end = current_q_start = query;
535    best_s_start = best_s_end = current_s_start = subject;
536    /* There are no previous edit scripts at the beginning. */
537 
538    best_end_esp_num = -1;
539    esp = hsp->gap_info;
540    if (!esp) return TRUE;
541    for (index=0; index<esp->size; index++)
542    {
543        int op_index = 0;  /* Index of an operation within a single edit script. */
544        for (op_index=0; op_index<esp->num[index]; )
545        {
546           /* Process substitutions one operation at a time, full gaps in one step. */
547           if (esp->op_type[index] == eGapAlignSub) {
548               sum += factor*matrix[*query & kResidueMask][*subject];
549               query++;
550               subject++;
551               op_index++;
552           } else if (esp->op_type[index] == eGapAlignDel) {
553               sum -= gap_open + gap_extend * esp->num[index];
554               subject += esp->num[index];
555               op_index += esp->num[index];
556           } else if (esp->op_type[index] == eGapAlignIns) {
557               sum -= gap_open + gap_extend * esp->num[index];
558               query += esp->num[index];
559               op_index += esp->num[index];
560           }
561 
562           if (sum < 0) {
563            /* Point current edit script chain start to the new place.
564               If we are in the middle of an edit script, reduce its length and
565               point operation index to the beginning of a modified edit script;
566               if we are at the end, move to the next edit script. */
567               if (op_index < esp->num[index]) {
568                   esp->num[index] -= op_index;
569                   current_start_esp_index = index;
570                   op_index = 0;
571               } else {
572                   current_start_esp_index = index + 1;
573               }
574               /* Set sum to 0, to start a fresh count. */
575               sum = 0;
576               /* Set current starting positions in sequences to the new start. */
577               current_q_start = query;
578               current_s_start = subject;
579 
580               /* If score has passed the cutoff at some point, leave the best score
581                  and edit scripts positions information untouched, otherwise reset
582                  the best score to 0 and point the best edit script positions to
583                  the new start. */
584               if (score < cutoff_score) {
585                   /* Start from new offset; discard all previous information. */
586                   best_q_start = query;
587                   best_s_start = subject;
588                   score = 0;
589 
590                   /* Set best start and end edit script pointers to new start. */
591                   best_start_esp_index = current_start_esp_index;
592                   best_end_esp_index = current_start_esp_index;
593               }
594              /* break; */ /* start on next GapEditScript. */
595           } else if (sum > score) {
596               /* Remember this point as the best scoring end point, and the current
597               start of the alignment as the start of the best alignment. */
598               score = sum;
599 
600               best_q_start = current_q_start;
601               best_s_start = current_s_start;
602               best_q_end = query;
603               best_s_end = subject;
604 
605               best_start_esp_index = current_start_esp_index;
606               best_end_esp_index = index;
607               best_end_esp_num = op_index;
608           }
609        }
610    } /* loop on edit scripts */
611 
612    score /= factor;
613 
614    if (best_start_esp_index < esp->size && best_end_esp_index < esp->size) {
615       /* post processing: try to extend further */
616       ASSERT(esp->op_type[best_start_esp_index] == eGapAlignSub);
617       ASSERT(esp->op_type[best_end_esp_index] == eGapAlignSub);
618 
619       qp = best_q_start - q;
620       sp = best_s_start - s;
621       ext = 0;
622       while(qp > 0 && sp > 0 && (q[--qp] == s[--sp]) && q[qp]<4) ext++;
623       best_q_start -= ext;
624       best_s_start -= ext;
625       esp->num[best_start_esp_index] += ext;
626       if (best_end_esp_index == best_start_esp_index) best_end_esp_num += ext;
627       score += ext * score_params->reward;
628 
629       qp = best_q_end - q;
630       sp = best_s_end - s;
631       ext = 0;
632       while(qp < qlen && sp < slen && q[qp]<4 && (q[qp++] == s[sp++])) ext++;
633       best_q_end += ext;
634       best_s_end += ext;
635       esp->num[best_end_esp_index] += ext;
636       best_end_esp_num += ext;
637       score += ext * score_params->reward;
638    }
639 
640    /* Update HSP data. */
641    return
642        s_UpdateReevaluatedHSP(hsp, TRUE, cutoff_score,
643                               score, q, s, best_q_start,
644                               best_q_end, best_s_start, best_s_end,
645                               best_start_esp_index, best_end_esp_index,
646                               best_end_esp_num);
647 }
648 
649 /** Update HSP data after reevaluation with ambiguities for an ungapped search.
650  * In particular this function calculates number of identities and checks if the
651  * percent identity criterion is satisfied.
652  * @param hsp HSP to update [in] [out]
653  * @param cutoff_score Cutoff score for saving the HSP [in]
654  * @param score New score [in]
655  * @param query_start Start of query sequence [in]
656  * @param subject_start Start of subject sequence [in]
657  * @param best_q_start Pointer to start of the new alignment in query [in]
658  * @param best_q_end Pointer to end of the new alignment in query [in]
659  * @param best_s_start Pointer to start of the new alignment in subject [in]
660  * @param best_s_end Pointer to end of the new alignment in subject [in]
661  * @return TRUE if HSP is scheduled to be deleted.
662  */
663 static Boolean
s_UpdateReevaluatedHSPUngapped(BlastHSP * hsp,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end)664 s_UpdateReevaluatedHSPUngapped(BlastHSP* hsp, Int4 cutoff_score, Int4 score,
665                                const Uint1* query_start, const Uint1* subject_start,
666                                const Uint1* best_q_start, const Uint1* best_q_end,
667                                const Uint1* best_s_start, const Uint1* best_s_end)
668 {
669     return
670         s_UpdateReevaluatedHSP(hsp, FALSE, cutoff_score, score, query_start,
671                                subject_start, best_q_start, best_q_end,
672                                best_s_start, best_s_end, 0, 0, 0);
673 }
674 
675 Boolean
Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP * hsp,const Uint1 * query_start,const Uint1 * subject_start,const BlastInitialWordParameters * word_params,BlastScoreBlk * sbp,Boolean translated)676 Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP* hsp, const Uint1* query_start,
677    const Uint1* subject_start, const BlastInitialWordParameters* word_params,
678    BlastScoreBlk* sbp, Boolean translated)
679 {
680    Int4 sum, score;
681    Int4** matrix;
682    const Uint1* query,* subject;
683    const Uint1* best_q_start,* best_s_start,* best_q_end,* best_s_end;
684    const Uint1* current_q_start, * current_s_start;
685    Int4 index;
686    const Uint1 kResidueMask = (translated ? 0xff : 0x0f);
687    Int4 hsp_length = hsp->query.end - hsp->query.offset;
688    Int4 cutoff_score = word_params->cutoffs[hsp->context].cutoff_score;
689 
690    matrix = sbp->matrix->data;
691 
692    query = query_start + hsp->query.offset;
693    subject = subject_start + hsp->subject.offset;
694    score = 0;
695    sum = 0;
696    best_q_start = best_q_end = current_q_start = query;
697    best_s_start = best_s_end = current_s_start = subject;
698 
699    for (index = 0; index < hsp_length; ++index) {
700       sum += matrix[*query & kResidueMask][*subject];
701       query++;
702       subject++;
703       if (sum < 0) {
704           /* Start from new offset */
705           sum = 0;
706           current_q_start = query;
707           current_s_start = subject;
708           /* If previous top score never reached the cutoff, discard the front
709              part of the alignment completely. Otherwise keep pointer to the
710              top-scoring front part. */
711          if (score < cutoff_score) {
712             best_q_start = best_q_end = query;
713             best_s_start = best_s_end = subject;
714             score = 0;
715          }
716       } else if (sum > score) {
717          /* Remember this point as the best scoring end point */
718          score = sum;
719          best_q_end = query;
720          best_s_end = subject;
721          /* Set start of alignment to the current start, dismissing the
722             previous top-scoring piece. */
723          best_q_start = current_q_start;
724          best_s_start = current_s_start;
725       }
726    }
727 
728    /* Update HSP data. */
729    return
730        s_UpdateReevaluatedHSPUngapped(hsp, cutoff_score, score,
731                                       query_start, subject_start, best_q_start,
732                                       best_q_end, best_s_start, best_s_end);
733 }
734 
735 /** Calculate number of identities in a regular HSP.
736  * @param query The query sequence [in]
737  * @param subject The uncompressed subject sequence [in]
738  * @param hsp All information about the HSP [in]
739  * @param num_ident_ptr Number of identities [out]
740  * @param align_length_ptr The alignment length, including gaps [out]
741  * @param sbp Blast score blk [in]
742  * @param num_pos_ptr Number of Positives [out]
743  * @return 0 on success, -1 on invalid parameters or error
744  */
745 static Int2
s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)746 s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query, const Uint1* subject,
747                             			const BlastHSP* hsp, Int4* num_ident_ptr,
748                             			Int4* align_length_ptr, const BlastScoreBlk* sbp,
749                             			Int4* num_pos_ptr)
750 {
751    Int4 i, num_ident, align_length, q_off, s_off;
752    Uint1* q,* s;
753    Int4 q_length = hsp->query.end - hsp->query.offset;
754    Int4 s_length = hsp->subject.end - hsp->subject.offset;
755    Int4** matrix = NULL;
756    Int4 num_pos = 0;
757 
758    q_off = hsp->query.offset;
759    s_off = hsp->subject.offset;
760 
761    if ( !subject || !query || !hsp )
762       return -1;
763 
764    q = (Uint1*) &query[q_off];
765    s = (Uint1*) &subject[s_off];
766 
767    num_ident = 0;
768    align_length = 0;
769 
770    if(NULL != sbp)
771    {
772 	   if(sbp->protein_alphabet)
773 		   matrix = sbp->matrix->data;
774    }
775 
776    if (!hsp->gap_info) {
777       /* Ungapped case. Check that lengths are the same in query and subject,
778          then count number of matches. */
779       if (q_length != s_length)
780          return -1;
781       align_length = q_length;
782       for (i=0; i<align_length; i++) {
783          if (*q == *s)
784             num_ident++;
785          else if (NULL != matrix) {
786         	 if (matrix[*q][*s] > 0)
787         		 num_pos ++;
788              }
789          q++;
790          s++;
791       }
792    	}
793     else {
794       Int4 index;
795       GapEditScript* esp = hsp->gap_info;
796       for (index=0; index<esp->size; index++)
797       {
798          align_length += esp->num[index];
799          switch (esp->op_type[index]) {
800          case eGapAlignSub:
801             for (i=0; i<esp->num[index]; i++) {
802                if (*q == *s) {
803                   num_ident++;
804                }
805                else if (NULL != matrix) {
806             	   if (matrix[*q][*s] > 0)
807             		   num_pos ++;
808                }
809                q++;
810                s++;
811             }
812             break;
813          case eGapAlignDel:
814             s += esp->num[index];
815             break;
816          case eGapAlignIns:
817             q += esp->num[index];
818             break;
819          default:
820             s += esp->num[index];
821             q += esp->num[index];
822             break;
823          }
824       }
825    }
826 
827    if (align_length_ptr) {
828        *align_length_ptr = align_length;
829    }
830    *num_ident_ptr = num_ident;
831 
832    if(NULL != matrix)
833 	   *num_pos_ptr = num_pos + num_ident;
834 
835    return 0;
836 }
837 
838 /** Calculate number of identities in an HSP for an out-of-frame alignment.
839  * @param query The query sequence [in]
840  * @param subject The uncompressed subject sequence [in]
841  * @param hsp All information about the HSP [in]
842  * @param program BLAST program (blastx or tblastn) [in]
843  * @param num_ident_ptr Number of identities [out]
844  * @param align_length_ptr The alignment length, including gaps [out]
845  * @param sbp Blast score blk [in]
846  * @param num_pos_ptr Number of Positives [out]
847  * @return 0 on success, -1 on invalid parameters or error
848  */
849 static Int2
s_Blast_HSPGetOOFNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,EBlastProgramType program,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)850 s_Blast_HSPGetOOFNumIdentitiesAndPositives(
851 		const Uint1* query, const Uint1* subject,
852 		const BlastHSP* hsp, EBlastProgramType program,
853 		Int4* num_ident_ptr, Int4* align_length_ptr,
854 		const BlastScoreBlk* sbp, Int4* num_pos_ptr)
855 {
856    Int4 num_ident, align_length;
857    Int4 index;
858    Uint1* q,* s;
859    GapEditScript* esp;
860    Int4 ** matrix = NULL;
861    Int4 num_pos = 0;
862 
863    if (!hsp->gap_info || !subject || !query)
864       return -1;
865 
866    if(NULL != sbp)
867    {
868 	   if(sbp->protein_alphabet)
869 		   matrix = sbp->matrix->data;
870    }
871 
872    if (program == eBlastTypeTblastn ||
873        program == eBlastTypeRpsTblastn) {
874        q = (Uint1*) &query[hsp->query.offset];
875        s = (Uint1*) &subject[hsp->subject.offset];
876    } else {
877        s = (Uint1*) &query[hsp->query.offset];
878        q = (Uint1*) &subject[hsp->subject.offset];
879    }
880    num_ident = 0;
881    align_length = 0;
882 
883    esp = hsp->gap_info;
884    for (index=0; index<esp->size; index++)
885    {
886       int i;
887       switch (esp->op_type[index]) {
888       case eGapAlignSub: /* Substitution */
889          align_length += esp->num[index];
890          for (i=0; i<esp->num[index]; i++) {
891             if (*q == *s)
892                num_ident++;
893             else if (NULL != matrix) {
894               	 if (matrix[*q][*s] > 0)
895                		 num_pos ++;
896             }
897             ++q;
898             s += CODON_LENGTH;
899          }
900          break;
901       case eGapAlignIns: /* Insertion */
902          align_length += esp->num[index];
903          s += esp->num[index] * CODON_LENGTH;
904          break;
905       case eGapAlignDel: /* Deletion */
906          align_length += esp->num[index];
907          q += esp->num[index];
908          break;
909       case eGapAlignDel2: /* Gap of two nucleotides. */
910          s -= 2;
911          break;
912       case eGapAlignDel1: /* Gap of one nucleotide. */
913          s -= 1;
914          break;
915       case eGapAlignIns1: /* Insertion of one nucleotide. */
916          s += 1;
917          break;
918       case eGapAlignIns2: /* Insertion of two nucleotides. */
919          s += 2;
920          break;
921       default:
922          s += esp->num[index] * CODON_LENGTH;
923          q += esp->num[index];
924          break;
925       }
926    }
927 
928    if (align_length_ptr) {
929        *align_length_ptr = align_length;
930    }
931    *num_ident_ptr = num_ident;
932 
933    if(NULL != matrix)
934 	   *num_pos_ptr = num_pos + num_ident;
935 
936    return 0;
937 }
938 
939 Int2
Blast_HSPGetNumIdentities(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr)940 Blast_HSPGetNumIdentities(const Uint1* query,
941                           const Uint1* subject,
942                           BlastHSP* hsp,
943                           const BlastScoringOptions* score_options,
944                           Int4* align_length_ptr)
945 {
946     Int2 retval = 0;
947 
948     /* Calculate alignment length and number of identical letters.
949        Do not get the number of identities if the query is not available */
950     if (score_options->is_ooframe) {
951         retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
952                                                	   	   	    score_options->program_number,
953                                                	   	   	    &hsp->num_ident,
954                                                	   	   	    align_length_ptr,
955                                                	   	   	    NULL, NULL);
956     } else {
957         retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
958                                             		  	 &hsp->num_ident,
959                                             		  	 align_length_ptr,
960                                             		  	 NULL, NULL);
961     }
962 
963     return retval;
964 }
965 Int2
Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr,const BlastScoreBlk * sbp)966 Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query,
967         							  const Uint1* subject,
968         							  BlastHSP* hsp,
969         							  const BlastScoringOptions* score_options,
970         							  Int4* align_length_ptr,
971         							  const BlastScoreBlk * sbp)
972 {
973     Int2 retval = 0;
974 
975     /* Calculate alignment length and number of identical letters.
976        Do not get the number of identities if the query is not available */
977     if (score_options->is_ooframe) {
978         retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
979                                                	   	   	    score_options->program_number,
980                                                	   	   	    &hsp->num_ident,
981                                                	   	   	    align_length_ptr,
982                                                	   	   	    sbp, &hsp->num_positives);
983     } else {
984         retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
985                                             		  	 &hsp->num_ident,
986                                             		  	 align_length_ptr,
987                                             		  	 sbp, &hsp->num_positives);
988     }
989 
990     return retval;
991 }
992 
s_HSPTest(const BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)993 static Boolean s_HSPTest(const BlastHSP* hsp,
994                          const BlastHitSavingOptions* hit_options,
995                          Int4 align_length)
996 {
997 	   return ((hsp->num_ident * 100.0 <
998 			   align_length * hit_options->percent_identity) ||
999 			   align_length < hit_options->min_hit_length) ;
1000 
1001 }
1002 
1003 Boolean
Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,BlastHSP * hsp,const Uint1 * query,const Uint1 * subject,const BlastScoringOptions * score_options,const BlastHitSavingOptions * hit_options)1004 Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,
1005                                BlastHSP* hsp, const Uint1* query, const Uint1* subject,
1006                                const BlastScoringOptions* score_options,
1007                                const BlastHitSavingOptions* hit_options)
1008 {
1009    Int4 align_length = 0;
1010    Boolean delete_hsp = FALSE;
1011    Int2 status = 0;
1012 
1013    ASSERT(hsp && query && subject && score_options && hit_options);
1014 
1015    status = Blast_HSPGetNumIdentities(query, subject, hsp, score_options,
1016                                       &align_length);
1017    ASSERT(status == 0);
1018    (void)status;    /* to pacify compiler warning */
1019 
1020    /* Check whether this HSP passes the percent identity and minimal hit
1021       length criteria, and delete it if it does not. */
1022    delete_hsp = s_HSPTest(hsp, hit_options, align_length);
1023 
1024    return delete_hsp;
1025 }
1026 
Blast_HSPTest(BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)1027 Boolean Blast_HSPTest(BlastHSP* hsp,
1028 		 	 	 	  const BlastHitSavingOptions* hit_options,
1029 		 	 	 	  Int4 align_length)
1030 {
1031 	return s_HSPTest(hsp, hit_options, align_length);
1032 }
1033 
Blast_HSPGetQueryCoverage(const BlastHSP * hsp,Int4 query_length)1034 double Blast_HSPGetQueryCoverage(const BlastHSP* hsp, Int4 query_length)
1035 {
1036 	double pct = 0;
1037     if(query_length > 0) {
1038             pct = 100.0 * (double) (hsp->query.end - hsp->query.offset)/ (double) query_length;
1039             if(pct < 99)
1040             	pct +=0.5;
1041     }
1042     return pct;
1043 }
1044 
Blast_HSPQueryCoverageTest(BlastHSP * hsp,double min_query_coverage_pct,Int4 query_length)1045 Boolean Blast_HSPQueryCoverageTest(BlastHSP* hsp,
1046         						   double min_query_coverage_pct,
1047         						   Int4 query_length)
1048 {
1049      double hsp_coverage = Blast_HSPGetQueryCoverage( hsp, query_length);
1050      return (hsp_coverage < min_query_coverage_pct);
1051 }
1052 
1053 
1054 void
Blast_HSPCalcLengthAndGaps(const BlastHSP * hsp,Int4 * length_out,Int4 * gaps_out,Int4 * gap_opens_out)1055 Blast_HSPCalcLengthAndGaps(const BlastHSP* hsp, Int4* length_out,
1056                            Int4* gaps_out, Int4* gap_opens_out)
1057 {
1058    Int4 length = hsp->query.end - hsp->query.offset;
1059    Int4 s_length = hsp->subject.end - hsp->subject.offset;
1060    Int4 gap_opens = 0, gaps = 0;
1061 
1062    if (hsp->gap_info) {
1063       GapEditScript* esp = hsp->gap_info;
1064       Int4 index;
1065       for (index=0; index<esp->size; index++) {
1066          if (esp->op_type[index] == eGapAlignDel) {
1067             length += esp->num[index];
1068             gaps += esp->num[index];
1069             ++gap_opens;
1070          } else if (esp->op_type[index] == eGapAlignIns) {
1071             ++gap_opens;
1072             gaps += esp->num[index];
1073          }
1074       }
1075    } else if (s_length > length) {
1076       length = s_length;
1077    }
1078 
1079    *length_out = length;
1080    *gap_opens_out = gap_opens;
1081    *gaps_out = gaps;
1082 }
1083 
1084 /** Adjust start and end of an HSP in a translated sequence segment.
1085  * @param segment BlastSeg structure (part of BlastHSP) [in]
1086  * @param seq_length Length of the full sequence [in]
1087  * @param start Start of the alignment in this segment in nucleotide
1088  *              coordinates, 1-offset [out]
1089  * @param end End of the alignment in this segment in nucleotide
1090  *            coordinates, 1-offset [out]
1091  */
1092 static void
s_BlastSegGetTranslatedOffsets(const BlastSeg * segment,Int4 seq_length,Int4 * start,Int4 * end)1093 s_BlastSegGetTranslatedOffsets(const BlastSeg* segment, Int4 seq_length,
1094                               Int4* start, Int4* end)
1095 {
1096    if (segment->frame < 0)	{
1097       *start = seq_length - CODON_LENGTH*segment->offset + segment->frame;
1098       *end = seq_length - CODON_LENGTH*segment->end + segment->frame + 1;
1099    } else if (segment->frame > 0)	{
1100       *start = CODON_LENGTH*(segment->offset) + segment->frame - 1;
1101       *end = CODON_LENGTH*segment->end + segment->frame - 2;
1102    } else {
1103       *start = segment->offset + 1;
1104       *end = segment->end;
1105    }
1106 }
1107 
1108 void
Blast_HSPGetAdjustedOffsets(EBlastProgramType program,BlastHSP * hsp,Int4 query_length,Int4 subject_length,Int4 * q_start,Int4 * q_end,Int4 * s_start,Int4 * s_end)1109 Blast_HSPGetAdjustedOffsets(EBlastProgramType program, BlastHSP* hsp,
1110                             Int4 query_length, Int4 subject_length,
1111                             Int4* q_start, Int4* q_end,
1112                             Int4* s_start, Int4* s_end)
1113 {
1114    if (!hsp->gap_info) {
1115       *q_start = hsp->query.offset + 1;
1116       *q_end = hsp->query.end;
1117       *s_start = hsp->subject.offset + 1;
1118       *s_end = hsp->subject.end;
1119       return;
1120    }
1121 
1122    if (!Blast_QueryIsTranslated(program) &&
1123        !Blast_SubjectIsTranslated(program)) {
1124       if (hsp->query.frame != hsp->subject.frame) {
1125          /* Blastn: if different strands, flip offsets in query; leave
1126             offsets in subject as they are, but change order for correct
1127             correspondence. */
1128          *q_end = query_length - hsp->query.offset;
1129          *q_start = *q_end - hsp->query.end + hsp->query.offset + 1;
1130          *s_end = hsp->subject.offset + 1;
1131          *s_start = hsp->subject.end;
1132       } else {
1133          *q_start = hsp->query.offset + 1;
1134          *q_end = hsp->query.end;
1135          *s_start = hsp->subject.offset + 1;
1136          *s_end = hsp->subject.end;
1137       }
1138    } else {
1139       s_BlastSegGetTranslatedOffsets(&hsp->query, query_length, q_start, q_end);
1140       s_BlastSegGetTranslatedOffsets(&hsp->subject, subject_length,
1141                                     q_start, s_end);
1142    }
1143 }
1144 
1145 
1146 const Uint1*
Blast_HSPGetTargetTranslation(SBlastTargetTranslation * target_t,const BlastHSP * hsp,Int4 * translated_length)1147 Blast_HSPGetTargetTranslation(SBlastTargetTranslation* target_t,
1148                               const BlastHSP* hsp, Int4* translated_length)
1149 {
1150     Int4 context = -1;
1151     Int4 start, stop;
1152 
1153     ASSERT(target_t != NULL);
1154 
1155     if (hsp == NULL)
1156        return NULL;
1157 
1158     context = BLAST_FrameToContext(hsp->subject.frame, target_t->program_number);
1159     start = target_t->range[2*context];
1160     stop = target_t->range[2*context+1];
1161 
1162     /* skip translation if full translation has already been done */
1163     if (target_t->partial && (start ||
1164         (stop < target_t->subject_blk->length / CODON_LENGTH -3)))
1165     {
1166     	 const int kMaxTranslation = 99; /* Needs to be divisible by three (?) */
1167          Int4 nucl_length = 0;
1168          Int4 translation_length = 0;
1169          Int4 nucl_start = 0;
1170          Int4 nucl_end = 0;
1171     	 Int4 nucl_shift = 0;
1172     	 Int4 start_shift = 0;
1173 
1174          /* HSP coordinates are in terms of protein sequences. */
1175          if (hsp->subject.offset < 0) {
1176              nucl_start = 0;
1177              nucl_end = target_t->subject_blk->length;
1178          } else {
1179              nucl_start = MAX(0, 3*hsp->subject.offset - kMaxTranslation);
1180              nucl_end = MIN(target_t->subject_blk->length, 3*hsp->subject.end + kMaxTranslation);
1181              /* extend to the end of the sequence if close */
1182              if (target_t->subject_blk->length - nucl_end <= 21) {
1183                  nucl_end = target_t->subject_blk->length;
1184              }
1185          }
1186 
1187          nucl_length = nucl_end - nucl_start;
1188 
1189          translation_length = 1+nucl_length/CODON_LENGTH;
1190          start_shift = nucl_start/CODON_LENGTH;
1191 
1192          if (hsp->subject.frame < 0)
1193              nucl_shift = target_t->subject_blk->length - nucl_start - nucl_length;
1194          else
1195              nucl_shift = nucl_start;
1196 
1197          if (start_shift < start || start_shift+translation_length > stop) {
1198                /* needs re-translation */
1199                Int4 length = 0; /* actual translation length */
1200                Uint1* nucl_seq = target_t->subject_blk->sequence + nucl_shift;
1201                Uint1* nucl_seq_rev = NULL;
1202 
1203                target_t->range[2*context] = start_shift;
1204 
1205                if (translation_length > stop-start) {
1206                    /* needs re-allocation */
1207                    sfree(target_t->translations[context]);
1208                    target_t->translations[context] = (Uint1*) malloc(translation_length+2);
1209                }
1210 
1211                if (hsp->subject.frame < 0) {
1212                    /* needs reverse sequence */
1213                    GetReverseNuclSequence(nucl_seq, nucl_length, &nucl_seq_rev);
1214                }
1215 
1216                length = BLAST_GetTranslation(nucl_seq, nucl_seq_rev,
1217                        nucl_length, hsp->subject.frame, target_t->translations[context],
1218                        target_t->gen_code_string);
1219 
1220                target_t->range[2*context+1] = start_shift + length;
1221 
1222                sfree(nucl_seq_rev);
1223 
1224                /* partial translation needs to be fenced */
1225                if(hsp->subject.offset >= 0) {
1226                    target_t->translations[context][0] = FENCE_SENTRY;
1227                    target_t->translations[context][length+1] = FENCE_SENTRY;
1228                }
1229          }
1230     }
1231     if (translated_length)
1232         *translated_length = target_t->range[2*context+1];
1233 
1234     /* +1 as the first byte is a sentinel. */
1235     return target_t->translations[context] - target_t->range[2*context] + 1;
1236 }
1237 
1238 Int2
Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk * subject_blk,BlastHSP * hsp,Boolean is_ooframe,const Uint1 * gen_code_string,Uint1 ** translation_buffer_ptr,Uint1 ** subject_ptr,Int4 * subject_length_ptr,Int4 * start_shift_ptr)1239 Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk* subject_blk,
1240                                       BlastHSP* hsp,
1241                                       Boolean is_ooframe,
1242                                       const Uint1* gen_code_string,
1243                                       Uint1** translation_buffer_ptr,
1244                                       Uint1** subject_ptr,
1245                                       Int4* subject_length_ptr,
1246                                       Int4* start_shift_ptr)
1247 {
1248    Int4 translation_length;
1249    Uint1* translation_buffer;
1250    Uint1* subject;
1251    Int4 start_shift;
1252    Int4 nucl_shift;
1253    Int2 status = 0;
1254 
1255    ASSERT(subject_blk && hsp && gen_code_string && translation_buffer_ptr &&
1256           subject_ptr && subject_length_ptr && start_shift_ptr);
1257 
1258    translation_buffer = *translation_buffer_ptr;
1259    sfree(translation_buffer);
1260 
1261    if (!is_ooframe) {
1262       start_shift =
1263          MAX(0, 3*hsp->subject.offset - MAX_FULL_TRANSLATION);
1264       translation_length =
1265          MIN(3*hsp->subject.end + MAX_FULL_TRANSLATION,
1266              subject_blk->length) - start_shift;
1267       if (hsp->subject.frame > 0) {
1268          nucl_shift = start_shift;
1269       } else {
1270          nucl_shift = subject_blk->length - start_shift - translation_length;
1271       }
1272       status = (Int2)
1273          Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1274                                      translation_length, hsp->subject.frame,
1275                                      gen_code_string, &translation_buffer,
1276                                      subject_length_ptr, NULL);
1277       /* Below, the start_shift will be used for the protein
1278          coordinates, so need to divide it by 3 */
1279       start_shift /= CODON_LENGTH;
1280    } else {
1281       Int4 oof_end;
1282       oof_end = subject_blk->length;
1283 
1284       start_shift =
1285          MAX(0, hsp->subject.offset - MAX_FULL_TRANSLATION);
1286       translation_length =
1287          MIN(hsp->subject.end + MAX_FULL_TRANSLATION, oof_end) - start_shift;
1288       if (hsp->subject.frame > 0) {
1289          nucl_shift = start_shift;
1290       } else {
1291          nucl_shift = oof_end - start_shift - translation_length;
1292       }
1293       status = (Int2)
1294          Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1295                                      translation_length, hsp->subject.frame,
1296                                      gen_code_string, NULL,
1297                                      subject_length_ptr, &translation_buffer);
1298    }
1299    hsp->subject.offset -= start_shift;
1300    hsp->subject.end -= start_shift;
1301    hsp->subject.gapped_start -= start_shift;
1302    *translation_buffer_ptr = translation_buffer;
1303    *start_shift_ptr = start_shift;
1304 
1305    if (!is_ooframe) {
1306       subject = translation_buffer + 1;
1307    } else {
1308       subject = translation_buffer + CODON_LENGTH;
1309    }
1310    *subject_ptr = subject;
1311 
1312    return status;
1313 }
1314 
1315 void
Blast_HSPAdjustSubjectOffset(BlastHSP * hsp,Int4 start_shift)1316 Blast_HSPAdjustSubjectOffset(BlastHSP* hsp, Int4 start_shift)
1317 {
1318     /* Adjust subject offsets if shifted (partial) sequence was used
1319        for extension */
1320     if (start_shift > 0) {
1321         hsp->subject.offset += start_shift;
1322         hsp->subject.end += start_shift;
1323         hsp->subject.gapped_start += start_shift;
1324     }
1325 
1326     return;
1327 }
1328 
1329 int
ScoreCompareHSPs(const void * h1,const void * h2)1330 ScoreCompareHSPs(const void* h1, const void* h2)
1331 {
1332    BlastHSP* hsp1,* hsp2;   /* the HSPs to be compared */
1333    int result = 0;      /* the result of the comparison */
1334 
1335    hsp1 = *((BlastHSP**) h1);
1336    hsp2 = *((BlastHSP**) h2);
1337 
1338    /* Null HSPs are "greater" than any non-null ones, so they go to the end
1339       of a sorted list. */
1340    if (!hsp1 && !hsp2)
1341        return 0;
1342    else if (!hsp1)
1343        return 1;
1344    else if (!hsp2)
1345        return -1;
1346 
1347    if (0 == (result = BLAST_CMP(hsp2->score,          hsp1->score)) &&
1348        0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
1349        0 == (result = BLAST_CMP(hsp2->subject.end,    hsp1->subject.end)) &&
1350        0 == (result = BLAST_CMP(hsp1->query  .offset, hsp2->query  .offset))) {
1351        /* if all other test can't distinguish the HSPs, then the final
1352           test is the result */
1353        result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
1354    }
1355    return result;
1356 }
1357 
Blast_HSPListIsSortedByScore(const BlastHSPList * hsp_list)1358 Boolean Blast_HSPListIsSortedByScore(const BlastHSPList* hsp_list)
1359 {
1360     Int4 index;
1361 
1362     if (!hsp_list || hsp_list->hspcnt <= 1)
1363         return TRUE;
1364 
1365     for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1366         if (ScoreCompareHSPs(&hsp_list->hsp_array[index],
1367                              &hsp_list->hsp_array[index+1]) > 0) {
1368             return FALSE;
1369         }
1370     }
1371     return TRUE;
1372 }
1373 
Blast_HSPListSortByScore(BlastHSPList * hsp_list)1374 void Blast_HSPListSortByScore(BlastHSPList* hsp_list)
1375 {
1376     if (!hsp_list || hsp_list->hspcnt <= 1)
1377         return;
1378 
1379     if (!Blast_HSPListIsSortedByScore(hsp_list)) {
1380         qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1381               ScoreCompareHSPs);
1382     }
1383 }
1384 
1385 /** Compares 2 evalues, consider them equal if both are close enough to zero.
1386  * @param evalue1 First evalue [in]
1387  * @param evalue2 Second evalue [in]
1388  */
1389 static int
s_EvalueComp(double evalue1,double evalue2)1390 s_EvalueComp(double evalue1, double evalue2)
1391 {
1392     const double epsilon = 1.0e-180;
1393     if (evalue1 < epsilon && evalue2 < epsilon) {
1394         return 0;
1395     }
1396 
1397     if (evalue1 < evalue2) {
1398         return -1;
1399     } else if (evalue1 > evalue2) {
1400         return 1;
1401     } else {
1402         return 0;
1403     }
1404 }
1405 
1406 /** Comparison callback function for sorting HSPs by e-value and score, before
1407  * saving BlastHSPList in a BlastHitList. E-value has priority over score,
1408  * because lower scoring HSPs might have lower e-values, if they are linked
1409  * with sum statistics.
1410  * E-values are compared only up to a certain precision.
1411  * @param v1 Pointer to first HSP [in]
1412  * @param v2 Pointer to second HSP [in]
1413  */
1414 static int
s_EvalueCompareHSPs(const void * v1,const void * v2)1415 s_EvalueCompareHSPs(const void* v1, const void* v2)
1416 {
1417    BlastHSP* h1,* h2;
1418    int retval = 0;
1419 
1420    h1 = *((BlastHSP**) v1);
1421    h2 = *((BlastHSP**) v2);
1422 
1423    /* Check if one or both of these are null. Those HSPs should go to the end */
1424    if (!h1 && !h2)
1425       return 0;
1426    else if (!h1)
1427       return 1;
1428    else if (!h2)
1429       return -1;
1430 
1431    if ((retval = s_EvalueComp(h1->evalue, h2->evalue)) != 0)
1432       return retval;
1433 
1434    return ScoreCompareHSPs(v1, v2);
1435 }
1436 
Blast_HSPListSortByEvalue(BlastHSPList * hsp_list)1437 void Blast_HSPListSortByEvalue(BlastHSPList* hsp_list)
1438 {
1439     if (!hsp_list)
1440         return;
1441 
1442     if (hsp_list->hspcnt > 1) {
1443         Int4 index;
1444         BlastHSP** hsp_array = hsp_list->hsp_array;
1445         /* First check if HSP array is already sorted. */
1446         for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1447             if (s_EvalueCompareHSPs(&hsp_array[index], &hsp_array[index+1]) > 0) {
1448                 break;
1449             }
1450         }
1451         /* Sort the HSP array if it is not sorted yet. */
1452         if (index < hsp_list->hspcnt - 1) {
1453             qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1454                   s_EvalueCompareHSPs);
1455         }
1456     }
1457 }
1458 
1459 
1460 /** Retrieve the starting diagonal of an HSP
1461  * @param hsp The target HSP
1462  * @return The starting diagonal
1463  */
1464 static Int4
s_HSPStartDiag(const BlastHSP * hsp)1465 s_HSPStartDiag(const BlastHSP *hsp)
1466 {
1467     return hsp->query.offset - hsp->subject.offset;
1468 }
1469 
1470 /** Retrieve the ending diagonal of an HSP
1471  * @param hsp The target HSP
1472  * @return The ending diagonal
1473  */
1474 static Int4
s_HSPEndDiag(const BlastHSP * hsp)1475 s_HSPEndDiag(const BlastHSP *hsp)
1476 {
1477     return hsp->query.end - hsp->subject.end;
1478 }
1479 
1480 /** Given two hits, check if the hits can be merged and do
1481  * the merge if so. Hits must not contain traceback
1482  * @param hsp1 The first hit. If merging happens, this hit is
1483  *             overwritten with the merged version [in][out]
1484  * @param hsp2 The second hit [in]
1485  * @return TRUE if a merge was performed, FALSE if not
1486  */
1487 static Boolean
s_BlastMergeTwoHSPs(BlastHSP * hsp1,BlastHSP * hsp2,Boolean allow_gap)1488 s_BlastMergeTwoHSPs(BlastHSP* hsp1, BlastHSP* hsp2, Boolean allow_gap)
1489 {
1490    ASSERT(!hsp1->gap_info || !hsp2->gap_info);
1491 
1492    /* do not merge off-diagonal hsps for ungapped search */
1493    if (!allow_gap &&
1494        hsp1->subject.offset - hsp2->subject.offset -hsp1->query.offset + hsp2->query.offset)
1495    {
1496        return FALSE;
1497    }
1498 
1499    if(hsp1->subject.frame != hsp2->subject.frame)
1500 	   return FALSE;
1501 
1502    /* combine the boundaries of the two HSPs,
1503       assuming they intersect at all */
1504    if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1505                         hsp2->query.offset,
1506                         hsp1->subject.offset, hsp1->subject.end,
1507                         hsp2->subject.offset) ||
1508        CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1509                         hsp2->query.end,
1510                         hsp1->subject.offset, hsp1->subject.end,
1511                         hsp2->subject.end)) {
1512 
1513 	  double score_density =  (hsp1->score + hsp2->score) *(1.0) /
1514 			                  ((hsp1->query.end - hsp1->query.offset) +
1515 			                   (hsp2->query.end - hsp2->query.offset));
1516       hsp1->query.offset = MIN(hsp1->query.offset, hsp2->query.offset);
1517       hsp1->subject.offset = MIN(hsp1->subject.offset, hsp2->subject.offset);
1518       hsp1->query.end = MAX(hsp1->query.end, hsp2->query.end);
1519       hsp1->subject.end = MAX(hsp1->subject.end, hsp2->subject.end);
1520       if (hsp2->score > hsp1->score) {
1521           hsp1->query.gapped_start = hsp2->query.gapped_start;
1522           hsp1->subject.gapped_start = hsp2->subject.gapped_start;
1523 	  hsp1->score = hsp2->score;
1524       }
1525 
1526       hsp1->score = MAX((int) (score_density *(hsp1->query.end - hsp1->query.offset)), hsp1->score);
1527       return TRUE;
1528    }
1529 
1530    return FALSE;
1531 }
1532 
1533 /** Maximal diagonal distance between HSP starting offsets, within which HSPs
1534  * from search of different chunks of subject sequence are considered for
1535  * merging.
1536  */
1537 #define OVERLAP_DIAG_CLOSE 10
1538 /********************************************************************************
1539           Functions manipulating BlastHSPList's
1540 ********************************************************************************/
1541 
Blast_HSPListFree(BlastHSPList * hsp_list)1542 BlastHSPList* Blast_HSPListFree(BlastHSPList* hsp_list)
1543 {
1544    Int4 index;
1545 
1546    if (!hsp_list)
1547       return hsp_list;
1548 
1549    for (index = 0; index < hsp_list->hspcnt; ++index) {
1550       Blast_HSPFree(hsp_list->hsp_array[index]);
1551    }
1552    sfree(hsp_list->hsp_array);
1553 
1554    sfree(hsp_list);
1555    return NULL;
1556 }
1557 
Blast_HSPListNew(Int4 hsp_max)1558 BlastHSPList* Blast_HSPListNew(Int4 hsp_max)
1559 {
1560    BlastHSPList* hsp_list = (BlastHSPList*) calloc(1, sizeof(BlastHSPList));
1561    const Int4 kDefaultAllocated=100;
1562 
1563    /* hsp_max is max number of HSP's allowed in an HSP list;
1564       INT4_MAX taken as infinity. */
1565    hsp_list->hsp_max = INT4_MAX;
1566    if (hsp_max > 0)
1567       hsp_list->hsp_max = hsp_max;
1568 
1569    hsp_list->allocated = MIN(kDefaultAllocated, hsp_list->hsp_max);
1570 
1571    hsp_list->hsp_array = (BlastHSP**)
1572       calloc(hsp_list->allocated, sizeof(BlastHSP*));
1573 
1574    return hsp_list;
1575 }
1576 
1577 Boolean
Blast_HSPList_IsEmpty(const BlastHSPList * hsp_list)1578 Blast_HSPList_IsEmpty(const BlastHSPList* hsp_list)
1579 {
1580     return (hsp_list && hsp_list->hspcnt == 0) ? TRUE : FALSE;
1581 }
1582 
BlastHSPListDup(const BlastHSPList * hsp_list)1583 BlastHSPList* BlastHSPListDup(const BlastHSPList* hsp_list)
1584 {
1585     BlastHSPList * rv = 0;
1586 
1587     if (hsp_list) {
1588         int index = 0;
1589         int num = hsp_list->hspcnt;
1590 
1591         rv = malloc(sizeof(BlastHSPList));
1592         *rv = *hsp_list;
1593 
1594         if (num) {
1595             rv->hsp_array = malloc(sizeof(BlastHSP*) * num);
1596 
1597             for(index = 0; index < hsp_list->hspcnt; ++index) {
1598                 BlastHSP * h = hsp_list->hsp_array[index];
1599                 BlastHSP ** h2 = & rv->hsp_array[index];
1600 
1601                 if (h) {
1602                     *h2 = malloc(sizeof(BlastHSP));
1603                     **h2 = *h;
1604                 } else {
1605                     *h2 = 0;
1606                 }
1607             }
1608         }
1609     }
1610 
1611     return rv;
1612 }
1613 
Blast_HSPListSwap(BlastHSPList * list1,BlastHSPList * list2)1614 void Blast_HSPListSwap(BlastHSPList* list1, BlastHSPList* list2)
1615 {
1616     BlastHSPList tmp;
1617 
1618     tmp = *list1;
1619     *list1 = *list2;
1620     *list2 = tmp;
1621 }
1622 
1623 /** This is a copy of a static function from ncbimisc.c.
1624  * Turns array into a heap with respect to a given comparison function.
1625  */
1626 static void
s_Heapify(char * base0,char * base,char * lim,char * last,size_t width,int (* compar)(const void *,const void *))1627 s_Heapify (char* base0, char* base, char* lim, char* last, size_t width, int (*compar )(const void*, const void* ))
1628 {
1629    size_t i;
1630    char   ch;
1631    char* left_son,* large_son;
1632 
1633    left_son = base0 + 2*(base-base0) + width;
1634    while (base <= lim) {
1635       if (left_son == last)
1636          large_son = left_son;
1637       else
1638          large_son = (*compar)(left_son, left_son+width) >= 0 ?
1639             left_son : left_son+width;
1640       if ((*compar)(base, large_son) < 0) {
1641          for (i=0; i<width; ++i) {
1642             ch = base[i];
1643             base[i] = large_son[i];
1644             large_son[i] = ch;
1645          }
1646          base = large_son;
1647          left_son = base0 + 2*(base-base0) + width;
1648       } else
1649          break;
1650    }
1651 }
1652 
1653 /** Creates a heap of elements based on a comparison function.
1654  * @param b An array [in] [out]
1655  * @param nel Number of elements in b [in]
1656  * @param width The size of each element [in]
1657  * @param compar Callback to compare two heap elements [in]
1658  */
1659 static void
s_CreateHeap(void * b,size_t nel,size_t width,int (* compar)(const void *,const void *))1660 s_CreateHeap (void* b, size_t nel, size_t width,
1661    int (*compar )(const void*, const void* ))
1662 {
1663    char*    base = (char*)b;
1664    size_t i;
1665    char*    base0 = (char*)base,* lim,* basef;
1666 
1667    if (nel < 2)
1668       return;
1669 
1670    lim = &base[((nel-2)/2)*width];
1671    basef = &base[(nel-1)*width];
1672    i = nel/2;
1673    for (base = &base0[(i - 1)*width]; i > 0; base = base - width) {
1674       s_Heapify(base0, base, lim, basef, width, compar);
1675       i--;
1676    }
1677 }
1678 
1679 /** Given a BlastHSPList* with a heapified HSP array, check whether the
1680  * new HSP is better than the worst scoring.  If it is, then remove the
1681  * worst scoring and insert, otherwise free the new one.
1682  * HSP and insert the new HSP in the heap.
1683  * @param hsp_list Contains all HSPs for a given subject. [in] [out]
1684  * @param hsp A pointer to new HSP to be inserted into the HSP list [in] [out]
1685  */
1686 static void
s_BlastHSPListInsertHSPInHeap(BlastHSPList * hsp_list,BlastHSP ** hsp)1687 s_BlastHSPListInsertHSPInHeap(BlastHSPList* hsp_list,
1688                              BlastHSP** hsp)
1689 {
1690     BlastHSP** hsp_array = hsp_list->hsp_array;
1691     if (ScoreCompareHSPs(hsp, &hsp_array[0]) > 0)
1692     {
1693          Blast_HSPFree(*hsp);
1694          return;
1695     }
1696     else
1697          Blast_HSPFree(hsp_array[0]);
1698 
1699     hsp_array[0] = *hsp;
1700     if (hsp_list->hspcnt >= 2) {
1701         s_Heapify((char*)hsp_array, (char*)hsp_array,
1702                 (char*)&hsp_array[hsp_list->hspcnt/2 - 1],
1703                  (char*)&hsp_array[hsp_list->hspcnt-1],
1704                  sizeof(BlastHSP*), ScoreCompareHSPs);
1705     }
1706 }
1707 
1708 #ifndef NDEBUG
1709 /** Verifies that the best_evalue field on the BlastHSPList is correct.
1710  * @param hsp_list object to check [in]
1711  * @return TRUE if OK, FALSE otherwise.
1712  */
1713 static Boolean
s_BlastCheckBestEvalue(const BlastHSPList * hsp_list)1714 s_BlastCheckBestEvalue(const BlastHSPList* hsp_list)
1715 {
1716     int index = 0;
1717     double best_evalue = (double) INT4_MAX;
1718     const double kDelta = 1.0e-200;
1719 
1720     /* There are no HSP's here. */
1721     if (hsp_list->hspcnt == 0)
1722        return TRUE;
1723 
1724     for (index=0; index<hsp_list->hspcnt; index++)
1725        best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1726 
1727     /* check that it's within 1%. */
1728     if (ABS(best_evalue-hsp_list->best_evalue)/(best_evalue+kDelta) > 0.01)
1729        return FALSE;
1730 
1731     return TRUE;
1732 }
1733 #endif /* _DEBUG */
1734 
1735 /** Gets the best (lowest) evalue from the BlastHSPList.
1736  * @param hsp_list object containing the evalues [in]
1737  * @return TRUE if OK, FALSE otherwise.
1738  */
1739 static double
s_BlastGetBestEvalue(const BlastHSPList * hsp_list)1740 s_BlastGetBestEvalue(const BlastHSPList* hsp_list)
1741 {
1742     int index = 0;
1743     double best_evalue = (double) INT4_MAX;
1744 
1745     for (index=0; index<hsp_list->hspcnt; index++)
1746        best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1747 
1748     return best_evalue;
1749 }
1750 
1751 /* Comments in blast_hits.h
1752  */
1753 Int2
Blast_HSPListSaveHSP(BlastHSPList * hsp_list,BlastHSP * new_hsp)1754 Blast_HSPListSaveHSP(BlastHSPList* hsp_list, BlastHSP* new_hsp)
1755 {
1756    BlastHSP** hsp_array;
1757    Int4 hspcnt;
1758    Int4 hsp_allocated; /* how many hsps are in the array. */
1759    Int2 status = 0;
1760 
1761    hspcnt = hsp_list->hspcnt;
1762    hsp_allocated = hsp_list->allocated;
1763    hsp_array = hsp_list->hsp_array;
1764 
1765 
1766    /* Check if list is already full, then reallocate. */
1767    if (hspcnt >= hsp_allocated && hsp_list->do_not_reallocate == FALSE)
1768    {
1769       Int4 new_allocated = MIN(2*hsp_list->allocated, hsp_list->hsp_max);
1770       if (new_allocated > hsp_list->allocated) {
1771          hsp_array = (BlastHSP**)
1772             realloc(hsp_array, new_allocated*sizeof(BlastHSP*));
1773          if (hsp_array == NULL)
1774          {
1775             hsp_list->do_not_reallocate = TRUE;
1776             hsp_array = hsp_list->hsp_array;
1777             /** Return a non-zero status, because restriction on number
1778                 of HSPs here is a result of memory allocation failure. */
1779             status = -1;
1780          } else {
1781             hsp_list->hsp_array = hsp_array;
1782             hsp_list->allocated = new_allocated;
1783             hsp_allocated = new_allocated;
1784          }
1785       } else {
1786          hsp_list->do_not_reallocate = TRUE;
1787       }
1788       /* If it is the first time when the HSP array is filled to capacity,
1789          create a heap now. */
1790       if (hsp_list->do_not_reallocate) {
1791           s_CreateHeap(hsp_array, hspcnt, sizeof(BlastHSP*), ScoreCompareHSPs);
1792       }
1793    }
1794 
1795    /* If there is space in the allocated HSP array, simply save the new HSP.
1796       Othewise, if the new HSP has lower score than the worst HSP in the heap,
1797       then delete it, else insert it in the heap. */
1798    if (hspcnt < hsp_allocated)
1799    {
1800       hsp_array[hsp_list->hspcnt] = new_hsp;
1801       (hsp_list->hspcnt)++;
1802       return status;
1803    } else {
1804        /* Insert the new HSP in heap. */
1805        s_BlastHSPListInsertHSPInHeap(hsp_list, &new_hsp);
1806    }
1807 
1808    return status;
1809 }
1810 
Blast_HSPListGetEvalues(EBlastProgramType program_number,const BlastQueryInfo * query_info,Int4 subject_length,BlastHSPList * hsp_list,Boolean gapped_calculation,Boolean RPS_prelim,const BlastScoreBlk * sbp,double gap_decay_rate,double scaling_factor)1811 Int2 Blast_HSPListGetEvalues(EBlastProgramType program_number,
1812                              const BlastQueryInfo* query_info,
1813                              Int4 subject_length,
1814                              BlastHSPList* hsp_list,
1815                              Boolean gapped_calculation,
1816                              Boolean RPS_prelim,
1817                              const BlastScoreBlk* sbp, double gap_decay_rate,
1818                              double scaling_factor)
1819 {
1820    BlastHSP* hsp;
1821    BlastHSP** hsp_array;
1822    Blast_KarlinBlk** kbp;
1823    Int4 hsp_cnt;
1824    Int4 index;
1825    Int4 kbp_context;
1826    Int4 score;
1827    double gap_decay_divisor = 1.;
1828    Boolean isRPS = Blast_ProgramIsRpsBlast(program_number);
1829 
1830    if (hsp_list == NULL || hsp_list->hspcnt == 0)
1831       return 0;
1832 
1833    kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1834    hsp_cnt = hsp_list->hspcnt;
1835    hsp_array = hsp_list->hsp_array;
1836 
1837    if (gap_decay_rate != 0.)
1838       gap_decay_divisor = BLAST_GapDecayDivisor(gap_decay_rate, 1);
1839 
1840    for (index=0; index<hsp_cnt; index++) {
1841       hsp = hsp_array[index];
1842 
1843       ASSERT(hsp != NULL);
1844       ASSERT(scaling_factor != 0.0);
1845 #if 0
1846       ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1847 #endif
1848       /* Divide Lambda by the scaling factor, so e-value is
1849          calculated correctly from a scaled score. This is needed only
1850          for RPS BLAST, where scores are scaled, but Lambda is not. */
1851       kbp_context = hsp->context;
1852       if (RPS_prelim) {
1853           /* All kbp in preliminary stage are equivalent.  However, some
1854              may be invalid.  Search for the first populated kbp */
1855           int i;
1856           for (i=0; i < sbp->number_of_contexts; ++i) {
1857               if (kbp[i]) break;
1858           }
1859           kbp_context = i;
1860       }
1861       ASSERT(kbp[kbp_context]);
1862       kbp[kbp_context]->Lambda /= scaling_factor;
1863 
1864       /* Round score down to even number for E-value calculations only. */
1865       /* Added 2018/5/16 by rackerst, SB-2303 */
1866       score = hsp->score;
1867       if (hsp_list  &&  hsp_list->hspcnt != 0
1868               &&  gapped_calculation  &&  sbp->round_down) {
1869           score &= ~1;
1870       }
1871 
1872       if (sbp->gbp) {
1873           /* Only try Spouge's method if gumbel parameters are available */
1874           if (!isRPS) {
1875               hsp->evalue =
1876                   BLAST_SpougeStoE(score, kbp[kbp_context], sbp->gbp,
1877                                query_info->contexts[hsp->context].query_length,
1878                                subject_length);
1879           } else {
1880               /* for RPS blast, query and subject is swapped */
1881               hsp->evalue =
1882                   BLAST_SpougeStoE(score, kbp[kbp_context], sbp->gbp,
1883                                subject_length,
1884                                query_info->contexts[hsp->context].query_length);
1885           }
1886       } else {
1887           /* Get effective search space from the query information block */
1888           hsp->evalue =
1889               BLAST_KarlinStoE_simple(score, kbp[kbp_context],
1890                                query_info->contexts[hsp->context].eff_searchsp);
1891       }
1892 
1893       hsp->evalue /= gap_decay_divisor;
1894       /* Put back the unscaled value of Lambda. */
1895       kbp[kbp_context]->Lambda *= scaling_factor;
1896    }
1897 
1898    /* Assign the best e-value field. Here the best e-value will always be
1899       attained for the first HSP in the list. Check that the incoming
1900       HSP list is properly sorted by score. */
1901    ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1902    hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1903 
1904    return 0;
1905 }
1906 
Blast_HSPListGetBitScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)1907 Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list,
1908                                Boolean gapped_calculation,
1909                                const BlastScoreBlk* sbp)
1910 {
1911    BlastHSP* hsp;
1912    Blast_KarlinBlk** kbp;
1913    Int4 index;
1914 
1915    if (hsp_list == NULL)
1916       return 1;
1917 
1918    kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1919 
1920    for (index=0; index<hsp_list->hspcnt; index++) {
1921       hsp = hsp_list->hsp_array[index];
1922       ASSERT(hsp != NULL);
1923 #if 0
1924       ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1925 #endif
1926       hsp->bit_score =
1927          (hsp->score*kbp[hsp->context]->Lambda - kbp[hsp->context]->logK) /
1928          NCBIMATH_LN2;
1929    }
1930 
1931    return 0;
1932 }
1933 
Blast_HSPListPHIGetBitScores(BlastHSPList * hsp_list,BlastScoreBlk * sbp)1934 void Blast_HSPListPHIGetBitScores(BlastHSPList* hsp_list, BlastScoreBlk* sbp)
1935 {
1936     Int4 index;
1937 
1938     double lambda, logC;
1939 
1940     ASSERT(sbp && sbp->kbp_gap && sbp->kbp_gap[0]);
1941 
1942     lambda = sbp->kbp_gap[0]->Lambda;
1943     logC = log(sbp->kbp_gap[0]->paramC);
1944 
1945     for (index=0; index<hsp_list->hspcnt; index++) {
1946         BlastHSP* hsp = hsp_list->hsp_array[index];
1947         ASSERT(hsp != NULL);
1948         hsp->bit_score =
1949             (hsp->score*lambda - logC - log(1.0 + hsp->score*lambda))
1950             / NCBIMATH_LN2;
1951     }
1952 }
1953 
1954 void
Blast_HSPListPHIGetEvalues(BlastHSPList * hsp_list,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)1955 Blast_HSPListPHIGetEvalues(BlastHSPList* hsp_list, BlastScoreBlk* sbp,
1956                            const BlastQueryInfo* query_info,
1957                            const SPHIPatternSearchBlk* pattern_blk)
1958 {
1959    Int4 index;
1960    BlastHSP* hsp;
1961 
1962    if (!hsp_list || hsp_list->hspcnt == 0)
1963        return;
1964 
1965    for (index = 0; index < hsp_list->hspcnt; ++index) {
1966       hsp = hsp_list->hsp_array[index];
1967       s_HSPPHIGetEvalue(hsp, sbp, query_info, pattern_blk);
1968    }
1969    /* The best e-value is the one for the highest scoring HSP, which
1970       must be the first in the list. Check that HSPs are sorted by score
1971       to make sure this assumption is correct. */
1972    ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1973    hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1974 }
1975 
Blast_HSPListReapByEvalue(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)1976 Int2 Blast_HSPListReapByEvalue(BlastHSPList* hsp_list,
1977         const BlastHitSavingOptions* hit_options)
1978 {
1979    BlastHSP* hsp;
1980    BlastHSP** hsp_array;
1981    Int4 hsp_cnt = 0;
1982    Int4 index;
1983    double cutoff;
1984 
1985    if (hsp_list == NULL)
1986       return 0;
1987 
1988    cutoff = hit_options->expect_value;
1989 
1990    hsp_array = hsp_list->hsp_array;
1991    for (index = 0; index < hsp_list->hspcnt; index++) {
1992       hsp = hsp_array[index];
1993 
1994       ASSERT(hsp != NULL);
1995 
1996       if (hsp->evalue > cutoff) {
1997          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1998       } else {
1999          if (index > hsp_cnt)
2000             hsp_array[hsp_cnt] = hsp_array[index];
2001          hsp_cnt++;
2002       }
2003    }
2004 
2005    hsp_list->hspcnt = hsp_cnt;
2006 
2007    return 0;
2008 }
2009 
Blast_HSPListReapByQueryCoverage(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options,const BlastQueryInfo * query_info,EBlastProgramType program_number)2010 Int2 Blast_HSPListReapByQueryCoverage(BlastHSPList* hsp_list,
2011                                       const BlastHitSavingOptions* hit_options,
2012                                       const BlastQueryInfo* query_info,
2013                                       EBlastProgramType program_number)
2014 
2015 {
2016    BlastHSP* hsp;
2017    BlastHSP** hsp_array;
2018    Int4 hsp_cnt = 0;
2019    Int4 index;
2020 
2021    if ((hsp_list == NULL) || (hsp_list->hspcnt == 0) || (hit_options->query_cov_hsp_perc == 0))
2022       return 0;
2023 
2024    hsp_array = hsp_list->hsp_array;
2025    for (index = 0; index < hsp_list->hspcnt; index++) {
2026       hsp = hsp_array[index];
2027       ASSERT(hsp != NULL);
2028       if ( Blast_HSPQueryCoverageTest(hsp, hit_options->query_cov_hsp_perc,
2029 			                          query_info->contexts[hsp->context].query_length)) {
2030          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2031       } else {
2032          if (index > hsp_cnt)
2033             hsp_array[hsp_cnt] = hsp_array[index];
2034          hsp_cnt++;
2035       }
2036    }
2037 
2038    hsp_list->hspcnt = hsp_cnt;
2039 
2040    return 0;
2041 }
2042 
Blast_TrimHSPListByMaxHsps(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2043 Int2 Blast_TrimHSPListByMaxHsps(BlastHSPList* hsp_list,
2044                                 const BlastHitSavingOptions* hit_options)
2045 {
2046    BlastHSP** hsp_array;
2047    Int4 index;
2048    Int4 hsp_max;
2049 
2050    if ((hsp_list == NULL) ||
2051 	   (hit_options->max_hsps_per_subject == 0) ||
2052 	   (hsp_list->hspcnt <= hit_options->max_hsps_per_subject))
2053       return 0;
2054 
2055    hsp_max = hit_options->max_hsps_per_subject;
2056    hsp_array = hsp_list->hsp_array;
2057    for (index = hsp_max; index < hsp_list->hspcnt; index++) {
2058       hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2059    }
2060 
2061    hsp_list->hspcnt = hsp_max;
2062    return 0;
2063 }
2064 
2065 
2066 /** Same as Blast_HSPListReapByEvalue() except that it uses
2067  *  the raw score of the hit and the HitSavingOptions->cutoff_score
2068  *  to filter out hits. -RMH-
2069  */
Blast_HSPListReapByRawScore(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2070 Int2 Blast_HSPListReapByRawScore(BlastHSPList* hsp_list,
2071         const BlastHitSavingOptions* hit_options)
2072 {
2073    BlastHSP* hsp;
2074    BlastHSP** hsp_array;
2075    Int4 hsp_cnt = 0;
2076    Int4 index;
2077 
2078    if (hsp_list == NULL)
2079       return 0;
2080 
2081    hsp_array = hsp_list->hsp_array;
2082    for (index = 0; index < hsp_list->hspcnt; index++) {
2083       hsp = hsp_array[index];
2084 
2085       ASSERT(hsp != NULL);
2086 
2087       if ( hsp->score < hit_options->cutoff_score ) {
2088          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2089       } else {
2090          if (index > hsp_cnt)
2091             hsp_array[hsp_cnt] = hsp_array[index];
2092          hsp_cnt++;
2093       }
2094    }
2095 
2096    hsp_list->hspcnt = hsp_cnt;
2097 
2098    return 0;
2099 }
2100 
2101 /** callback used to sort HSP lists in order of increasing OID
2102  * @param x First HSP list [in]
2103  * @param y Second HSP list [in]
2104  * @return compare result
2105  */
s_SortHSPListByOid(const void * x,const void * y)2106 static int s_SortHSPListByOid(const void *x, const void *y)
2107 {
2108     BlastHSPList **xx = (BlastHSPList **)x;
2109     BlastHSPList **yy = (BlastHSPList **)y;
2110     return (*xx)->oid - (*yy)->oid;
2111 }
2112 
Blast_HitListMerge(BlastHitList ** old_hit_list_ptr,BlastHitList ** combined_hit_list_ptr,Int4 contexts_per_query,Int4 * split_offsets,Int4 chunk_overlap_size,Boolean allow_gap)2113 Int2 Blast_HitListMerge(BlastHitList** old_hit_list_ptr,
2114                         BlastHitList** combined_hit_list_ptr,
2115                         Int4 contexts_per_query, Int4 *split_offsets,
2116                         Int4 chunk_overlap_size, Boolean allow_gap)
2117 {
2118     Int4 i, j;
2119     Boolean query_is_split;
2120     BlastHitList* hitlist1 = *old_hit_list_ptr;
2121     BlastHitList* hitlist2 = *combined_hit_list_ptr;
2122     BlastHitList* new_hitlist;
2123     Int4 num_hsplists1;
2124     Int4 num_hsplists2;
2125 
2126     if (hitlist1 == NULL)
2127         return 0;
2128     if (hitlist2 == NULL) {
2129         *combined_hit_list_ptr = hitlist1;
2130         *old_hit_list_ptr = NULL;
2131         return 0;
2132     }
2133     num_hsplists1 = hitlist1->hsplist_count;
2134     num_hsplists2 = hitlist2->hsplist_count;
2135     new_hitlist = Blast_HitListNew(hitlist1->hsplist_max);
2136 
2137     /* sort the lists of HSPs by oid */
2138 
2139     if (num_hsplists1 > 1) {
2140         qsort(hitlist1->hsplist_array, num_hsplists1,
2141               sizeof(BlastHSPList*), s_SortHSPListByOid);
2142     }
2143     if (num_hsplists2 > 1) {
2144         qsort(hitlist2->hsplist_array, num_hsplists2,
2145               sizeof(BlastHSPList*), s_SortHSPListByOid);
2146     }
2147 
2148     /* find out if the two hitlists contain hits for a single
2149        (split) query sequence */
2150 
2151     query_is_split = FALSE;
2152     for (i = 0; i < contexts_per_query; i++) {
2153         if (split_offsets[i] > 0) {
2154             query_is_split = TRUE;
2155             break;
2156         }
2157     }
2158     ASSERT(chunk_overlap_size != 0);
2159 
2160     /* merge the HSPlists of the two HitLists */
2161 
2162     i = j = 0;
2163     while (i < num_hsplists1 && j < num_hsplists2) {
2164         BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2165         BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2166 
2167         if (hsplist1->oid < hsplist2->oid) {
2168             Blast_HitListUpdate(new_hitlist, hsplist1);
2169             i++;
2170         }
2171         else if (hsplist1->oid > hsplist2->oid) {
2172             Blast_HitListUpdate(new_hitlist, hsplist2);
2173             j++;
2174         }
2175         else {
2176             /* the old and new Hitlists contain hits to the same
2177                DB sequence, and these must be merged. */
2178 
2179             if (query_is_split) {
2180                 Blast_HSPListsMerge(hitlist1->hsplist_array + i,
2181                                     hitlist2->hsplist_array + j,
2182                                     hsplist2->hsp_max, split_offsets,
2183                                     contexts_per_query,
2184                                     chunk_overlap_size,
2185                                     allow_gap, FALSE);
2186             }
2187             else {
2188                 Blast_HSPListAppend(hitlist1->hsplist_array + i,
2189                                     hitlist2->hsplist_array + j,
2190                                     hsplist2->hsp_max);
2191             }
2192             Blast_HitListUpdate(new_hitlist, hitlist2->hsplist_array[j]);
2193             i++;
2194             j++;
2195         }
2196     }
2197     for (; i < num_hsplists1; i++) {
2198         BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2199         Blast_HitListUpdate(new_hitlist, hsplist1);
2200     }
2201     for (; j < num_hsplists2; j++) {
2202         BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2203         Blast_HitListUpdate(new_hitlist, hsplist2);
2204     }
2205     hitlist1->hsplist_count = 0;
2206     Blast_HitListFree(hitlist1);
2207     hitlist2->hsplist_count = 0;
2208     Blast_HitListFree(hitlist2);
2209 
2210     *old_hit_list_ptr = NULL;
2211     *combined_hit_list_ptr = new_hitlist;
2212     return 0;
2213 }
2214 
2215 
2216 /* See description in blast_hits.h */
2217 
2218 Int2
Blast_HSPListPurgeNullHSPs(BlastHSPList * hsp_list)2219 Blast_HSPListPurgeNullHSPs(BlastHSPList* hsp_list)
2220 
2221 {
2222         Int4 index, index1; /* loop indices. */
2223         Int4 hspcnt; /* total number of HSP's to iterate over. */
2224         BlastHSP** hsp_array;  /* hsp_array to purge. */
2225 
2226 	if (hsp_list == NULL || hsp_list->hspcnt == 0)
2227 		return 0;
2228 
2229         hsp_array = hsp_list->hsp_array;
2230         hspcnt = hsp_list->hspcnt;
2231 
2232 	index1 = 0;
2233 	for (index=0; index<hspcnt; index++)
2234 	{
2235 		if (hsp_array[index] != NULL)
2236 		{
2237 			hsp_array[index1] = hsp_array[index];
2238 			index1++;
2239 		}
2240 	}
2241 
2242 	for (index=index1; index<hspcnt; index++)
2243 	{
2244 		hsp_array[index] = NULL;
2245 	}
2246 
2247 	hsp_list->hspcnt = index1;
2248 
2249         return 0;
2250 }
2251 
2252 /** Callback for sorting HSPs by starting offset in query. Sorting is by
2253  * increasing context, then increasing query start offset, then increasing
2254  * subject start offset, then decreasing score, then increasing query end
2255  * offset, then increasing subject end offset. Null HSPs are moved to the
2256  * end of the array.
2257  * @param v1 pointer to first HSP [in]
2258  * @param v2 pointer to second HSP [in]
2259  * @return Result of comparison.
2260  */
2261 static int
s_QueryOffsetCompareHSPs(const void * v1,const void * v2)2262 s_QueryOffsetCompareHSPs(const void* v1, const void* v2)
2263 {
2264    BlastHSP* h1,* h2;
2265    BlastHSP** hp1,** hp2;
2266 
2267    hp1 = (BlastHSP**) v1;
2268    hp2 = (BlastHSP**) v2;
2269    h1 = *hp1;
2270    h2 = *hp2;
2271 
2272    if (!h1 && !h2)
2273       return 0;
2274    else if (!h1)
2275       return 1;
2276    else if (!h2)
2277       return -1;
2278 
2279    /* If these are from different contexts, don't compare offsets */
2280    if (h1->context < h2->context)
2281       return -1;
2282    if (h1->context > h2->context)
2283       return 1;
2284 
2285    if (h1->query.offset < h2->query.offset)
2286       return -1;
2287    if (h1->query.offset > h2->query.offset)
2288       return 1;
2289 
2290    if (h1->subject.offset < h2->subject.offset)
2291       return -1;
2292    if (h1->subject.offset > h2->subject.offset)
2293       return 1;
2294 
2295    /* tie breakers: sort by decreasing score, then
2296       by increasing size of query range, then by
2297       increasing subject range. */
2298 
2299    if (h1->score < h2->score)
2300       return 1;
2301    if (h1->score > h2->score)
2302       return -1;
2303 
2304    if (h1->query.end < h2->query.end)
2305       return 1;
2306    if (h1->query.end > h2->query.end)
2307       return -1;
2308 
2309    if (h1->subject.end < h2->subject.end)
2310       return 1;
2311    if (h1->subject.end > h2->subject.end)
2312       return -1;
2313 
2314    return 0;
2315 }
2316 
2317 /** Callback for sorting HSPs by ending offset in query. Sorting is by
2318  * increasing context, then increasing query end offset, then increasing
2319  * subject end offset, then decreasing score, then decreasing query start
2320  * offset, then decreasing subject start offset. Null HSPs are moved to the
2321  * end of the array.
2322  * @param v1 pointer to first HSP [in]
2323  * @param v2 pointer to second HSP [in]
2324  * @return Result of comparison.
2325  */
2326 static int
s_QueryEndCompareHSPs(const void * v1,const void * v2)2327 s_QueryEndCompareHSPs(const void* v1, const void* v2)
2328 {
2329    BlastHSP* h1,* h2;
2330    BlastHSP** hp1,** hp2;
2331 
2332    hp1 = (BlastHSP**) v1;
2333    hp2 = (BlastHSP**) v2;
2334    h1 = *hp1;
2335    h2 = *hp2;
2336 
2337    if (!h1 && !h2)
2338       return 0;
2339    else if (!h1)
2340       return 1;
2341    else if (!h2)
2342       return -1;
2343 
2344    /* If these are from different contexts, don't compare offsets */
2345    if (h1->context < h2->context)
2346       return -1;
2347    if (h1->context > h2->context)
2348       return 1;
2349 
2350    if (h1->query.end < h2->query.end)
2351       return -1;
2352    if (h1->query.end > h2->query.end)
2353       return 1;
2354 
2355    if (h1->subject.end < h2->subject.end)
2356       return -1;
2357    if (h1->subject.end > h2->subject.end)
2358       return 1;
2359 
2360    /* tie breakers: sort by decreasing score, then
2361       by increasing size of query range, then by
2362       increasing size of subject range. The shortest range
2363       means the *largest* sequence offset must come
2364       first */
2365    if (h1->score < h2->score)
2366       return 1;
2367    if (h1->score > h2->score)
2368       return -1;
2369 
2370    if (h1->query.offset < h2->query.offset)
2371       return 1;
2372    if (h1->query.offset > h2->query.offset)
2373       return -1;
2374 
2375    if (h1->subject.offset < h2->subject.offset)
2376       return 1;
2377    if (h1->subject.offset > h2->subject.offset)
2378       return -1;
2379 
2380    return 0;
2381 }
2382 
2383 
2384 /* cut off the GapEditScript according to hsp offset and end */
2385 static void
s_CutOffGapEditScript(BlastHSP * hsp,Int4 q_cut,Int4 s_cut,Boolean cut_begin)2386 s_CutOffGapEditScript(BlastHSP* hsp, Int4 q_cut, Int4 s_cut, Boolean cut_begin)
2387 {
2388    int index, opid, qid, sid;
2389    Boolean found = FALSE;
2390    GapEditScript *esp = hsp->gap_info;
2391    qid = 0;
2392    sid = 0;
2393    opid = 0;
2394    q_cut -= hsp->query.offset;
2395    s_cut -= hsp->subject.offset;
2396    for (index=0; index < esp->size; index++) {
2397        for(opid=0; opid < esp->num[index];){
2398           if (esp->op_type[index] == eGapAlignSub) {
2399              qid++;
2400              sid++;
2401              opid++;
2402           } else if (esp->op_type[index] == eGapAlignDel) {
2403              sid+=esp->num[index];
2404              opid+=esp->num[index];
2405           } else if (esp->op_type[index] == eGapAlignIns) {
2406              qid+=esp->num[index];
2407              opid+=esp->num[index];
2408           }
2409           if (qid >= q_cut && sid >= s_cut) found = TRUE;
2410           if (found) break;
2411        }
2412        if (found) break;
2413    }
2414 
2415    // RMH: Unless both cut sites where found the following
2416    //      block would access memory outside the GapEditScript
2417    //      array.
2418    if ( found )
2419    {
2420      if (cut_begin) {
2421          int new_index = 0;
2422          if (opid < esp->num[index]) {
2423             ASSERT(esp->op_type[index] == eGapAlignSub);
2424             esp->op_type[0] = esp->op_type[index];
2425             esp->num[0] = esp->num[index] - opid;
2426             new_index++;
2427          }
2428          ++index;
2429          for (; index < esp->size; index++, new_index++) {
2430             esp->op_type[new_index] = esp->op_type[index];
2431             esp->num[new_index] = esp->num[index];
2432          }
2433          esp->size = new_index;
2434          hsp->query.offset += qid;
2435          hsp->subject.offset += sid;
2436      } else {
2437          if (opid < esp->num[index]) {
2438             ASSERT(esp->op_type[index] == eGapAlignSub);
2439             esp->num[index] = opid;
2440          }
2441          esp->size = index+1;
2442          hsp->query.end = hsp->query.offset + qid;
2443          hsp->subject.end = hsp->subject.offset + sid;
2444      }
2445    }
2446 }
2447 
2448 Int4
Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,BlastHSPList * hsp_list,Boolean purge)2449 Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,
2450                                           BlastHSPList* hsp_list,
2451                                           Boolean purge)
2452 
2453 {
2454    BlastHSP** hsp_array;  /* hsp_array to purge. */
2455    BlastHSP* hsp;
2456    Int4 i, j, k;
2457    Int4 hsp_count;
2458    purge |= (program != eBlastTypeBlastn);
2459 
2460    /* If HSP list is empty, return immediately. */
2461    if (hsp_list == NULL || hsp_list->hspcnt == 0)
2462        return 0;
2463 
2464    /* Do nothing for PHI BLAST, because HSPs corresponding to different pattern
2465       occurrences may have common end points, but should all be kept. */
2466    if (Blast_ProgramIsPhiBlast(program))
2467        return hsp_list->hspcnt;
2468 
2469    hsp_array = hsp_list->hsp_array;
2470    hsp_count = hsp_list->hspcnt;
2471 
2472    qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryOffsetCompareHSPs);
2473    i = 0;
2474    while (i < hsp_count) {
2475       j = 1;
2476       while (i+j < hsp_count &&
2477              hsp_array[i] && hsp_array[i+j] &&
2478              hsp_array[i]->context == hsp_array[i+j]->context &&
2479              hsp_array[i]->query.offset == hsp_array[i+j]->query.offset &&
2480              hsp_array[i]->subject.offset == hsp_array[i+j]->subject.offset) {
2481          hsp_count--;
2482          hsp = hsp_array[i+j];
2483          if (!purge && (hsp->query.end > hsp_array[i]->query.end)) {
2484              s_CutOffGapEditScript(hsp, hsp_array[i]->query.end,
2485                                         hsp_array[i]->subject.end, TRUE);
2486          } else {
2487              hsp = Blast_HSPFree(hsp);
2488          }
2489          for (k=i+j; k<hsp_count; k++) {
2490              hsp_array[k] = hsp_array[k+1];
2491          }
2492          hsp_array[hsp_count] = hsp;
2493       }
2494       i += j;
2495    }
2496 
2497    qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryEndCompareHSPs);
2498    i = 0;
2499    while (i < hsp_count) {
2500       j = 1;
2501       while (i+j < hsp_count &&
2502              hsp_array[i] && hsp_array[i+j] &&
2503              hsp_array[i]->context == hsp_array[i+j]->context &&
2504              hsp_array[i]->query.end == hsp_array[i+j]->query.end &&
2505              hsp_array[i]->subject.end == hsp_array[i+j]->subject.end) {
2506          hsp_count--;
2507          hsp = hsp_array[i+j];
2508          if (!purge && (hsp->query.offset < hsp_array[i]->query.offset)) {
2509              s_CutOffGapEditScript(hsp, hsp_array[i]->query.offset,
2510                                         hsp_array[i]->subject.offset, FALSE);
2511          } else {
2512              hsp = Blast_HSPFree(hsp);
2513          }
2514          for (k=i+j; k<hsp_count; k++) {
2515              hsp_array[k] = hsp_array[k+1];
2516          }
2517          hsp_array[hsp_count] = hsp;
2518       }
2519       i += j;
2520    }
2521 
2522    if (purge) {
2523       Blast_HSPListPurgeNullHSPs(hsp_list);
2524    }
2525 
2526    return hsp_count;
2527 }
2528 
2529 Int4
Blast_HSPListSubjectBestHit(EBlastProgramType program,const BlastHSPSubjectBestHitOptions * subject_besthit_opts,const BlastQueryInfo * query_info,BlastHSPList * hsp_list)2530 Blast_HSPListSubjectBestHit(EBlastProgramType program,
2531 		                    const BlastHSPSubjectBestHitOptions* subject_besthit_opts,
2532 		                    const BlastQueryInfo *query_info,
2533                             BlastHSPList* hsp_list)
2534 {
2535    BlastHSP** hsp_array;  /* hsp_array to purge. */
2536    const int range_diff = subject_besthit_opts->max_range_diff;
2537    Boolean isBlastn = (program == eBlastTypeBlastn);
2538    unsigned int i, j;
2539    int o, e;
2540    int curr_context, target_context;
2541    int qlen;
2542 
2543    /* If HSP list is empty, return immediately. */
2544    if (hsp_list == NULL || hsp_list->hspcnt == 0)
2545        return 0;
2546 
2547    if (Blast_ProgramIsPhiBlast(program))
2548        return hsp_list->hspcnt;
2549 
2550    hsp_array = hsp_list->hsp_array;
2551 
2552    // The hsp list is sorted by score
2553    for(i=0; i < hsp_list->hspcnt -1; i++) {
2554 	  if(hsp_array[i] == NULL){
2555 		  continue;
2556 	  }
2557       j = 1;
2558       o = hsp_array[i]->query.offset - range_diff;
2559       e = hsp_array[i]->query.end + range_diff;
2560       if (o < 0) o = 0;
2561       if (e < 0) e = hsp_array[i]->query.end;
2562       while (i+j < hsp_list->hspcnt) {
2563           if (hsp_array[i+j] && hsp_array[i]->context == hsp_array[i+j]->context &&
2564               ((hsp_array[i+j]->query.offset >= o) &&
2565                (hsp_array[i+j]->query.end <= e))){
2566        	      hsp_array[i+j] = Blast_HSPFree(hsp_array[i+j]);
2567           }
2568           j++;
2569       }
2570    }
2571 
2572    Blast_HSPListPurgeNullHSPs(hsp_list);
2573 
2574    if(isBlastn) {
2575 	   for(i=0; i < hsp_list->hspcnt -1; i++) {
2576 	   	  if(hsp_array[i] == NULL){
2577 	   		  continue;
2578 	   	  }
2579 	   	  // Flip query offsets of current hsp to target context frame
2580 	   	  j = 1;
2581 	   	  curr_context = hsp_array[i]->context;
2582           qlen = query_info->contexts[curr_context].query_length;
2583 	   	  target_context = (hsp_array[i]->query.frame > 0) ? curr_context +1 : curr_context -1;
2584 	   	  e = qlen - (hsp_array[i]->query.offset - range_diff);
2585 	   	  o = qlen - (hsp_array[i]->query.end + range_diff);
2586 	   	  while (i+j < hsp_list->hspcnt) {
2587 	   		  if(hsp_array[i+j] && (hsp_array[i+j]->context == target_context) &&
2588 	   			 ((hsp_array[i+j]->query.offset >= o) &&
2589 	   			  (hsp_array[i+j]->query.end <= e))){
2590 	   			     hsp_array[i+j] = Blast_HSPFree(hsp_array[i+j]);
2591 	   		  }
2592 	   		  j++;
2593 	   	  }
2594        }
2595        Blast_HSPListPurgeNullHSPs(hsp_list);
2596    }
2597    return hsp_list->hspcnt;
2598 }
2599 
2600 Int2
Blast_HSPListReevaluateUngapped(EBlastProgramType program,BlastHSPList * hsp_list,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,const BlastInitialWordParameters * word_params,const BlastHitSavingParameters * hit_params,const BlastQueryInfo * query_info,BlastScoreBlk * sbp,const BlastScoringParameters * score_params,const BlastSeqSrc * seq_src,const Uint1 * gen_code_string)2601 Blast_HSPListReevaluateUngapped(EBlastProgramType program,
2602    BlastHSPList* hsp_list, BLAST_SequenceBlk* query_blk,
2603    BLAST_SequenceBlk* subject_blk,
2604    const BlastInitialWordParameters* word_params,
2605    const BlastHitSavingParameters* hit_params, const BlastQueryInfo* query_info,
2606    BlastScoreBlk* sbp, const BlastScoringParameters* score_params,
2607    const BlastSeqSrc* seq_src, const Uint1* gen_code_string)
2608 {
2609    BlastHSP** hsp_array,* hsp;
2610    const Uint1* subject_start = NULL;
2611    Uint1* query_start;
2612    Int4 index, context, hspcnt;
2613    Boolean purge;
2614    Int2 status = 0;
2615    const Boolean kTranslateSubject = Blast_SubjectIsTranslated(program);
2616    const Boolean kNucleotideSubject = Blast_SubjectIsNucleotide(program);
2617    SBlastTargetTranslation* target_t = NULL;
2618 
2619    ASSERT(!score_params->options->gapped_calculation);
2620 
2621    if (!hsp_list)
2622       return status;
2623 
2624    hspcnt = hsp_list->hspcnt;
2625    hsp_array = hsp_list->hsp_array;
2626 
2627    if (hsp_list->hspcnt == 0)
2628       /* All HSPs have been deleted */
2629       return status;
2630 
2631    /* Retrieve the unpacked subject sequence and save it in the
2632       sequence_start element of the subject structure.
2633       NB: for the BLAST 2 Sequences case, the uncompressed sequence must
2634       already be there */
2635    if (seq_src) {
2636       /* Wrap subject sequence block into a BlastSeqSrcGetSeqArg structure, which is
2637          needed by the BlastSeqSrc API. */
2638       /* If this was a protein subject, leave as is. */
2639       if (kNucleotideSubject)
2640       {
2641       	BlastSeqSrcGetSeqArg seq_arg;
2642       	memset((void*) &seq_arg, 0, sizeof(seq_arg));
2643       	seq_arg.oid = subject_blk->oid;
2644       	seq_arg.encoding =
2645        	  (kTranslateSubject ? eBlastEncodingNcbi4na : eBlastEncodingNucleotide);
2646         seq_arg.check_oid_exclusion = TRUE;
2647       	seq_arg.seq = subject_blk;
2648       	/* Return the packed sequence to the database */
2649       	BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
2650       	/* Get the unpacked sequence */
2651       	if (( BLAST_SEQSRC_EXCLUDED == BlastSeqSrcGetSequence(seq_src, &seq_arg))) {
2652       		for (index = 0; index < hspcnt; ++index) {
2653       			 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2654       		}
2655       		Blast_HSPListPurgeNullHSPs(hsp_list);
2656       		return 0;
2657       	}
2658       	else if (status < 0){
2659           return status;
2660       	}
2661        }
2662    }
2663 
2664    if (kTranslateSubject) {
2665       if (!gen_code_string)
2666          return -1;
2667       /* Get the translation buffer */
2668       BlastTargetTranslationNew(subject_blk, gen_code_string, program,
2669                    score_params->options->is_ooframe, &target_t);
2670    } else {
2671       /* Store sequence in blastna encoding in sequence_start */
2672       if (subject_blk->sequence_start)
2673           subject_start = subject_blk->sequence_start + 1;
2674       else
2675           subject_start = subject_blk->sequence;
2676    }
2677 
2678    purge = FALSE;
2679    for (index = 0; index < hspcnt; ++index) {
2680       Boolean delete_hsp = FALSE;
2681       if (hsp_array[index] == NULL)
2682          continue;
2683       else
2684          hsp = hsp_array[index];
2685 
2686       context = hsp->context;
2687 
2688       query_start = query_blk->sequence +
2689           query_info->contexts[context].query_offset;
2690 
2691       if (kTranslateSubject)
2692          subject_start = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
2693 
2694       if (kNucleotideSubject) {
2695          delete_hsp =
2696              Blast_HSPReevaluateWithAmbiguitiesUngapped(hsp, query_start,
2697                 subject_start, word_params, sbp, kTranslateSubject);
2698       }
2699 
2700       if (!delete_hsp) {
2701           const Uint1* query_nomask = query_blk->sequence_nomask +
2702               query_info->contexts[context].query_offset;
2703           Int4 align_length = 0;
2704           Blast_HSPGetNumIdentitiesAndPositives(query_nomask,
2705                    							    subject_start,
2706                                					hsp,
2707                                					score_params->options,
2708                                					&align_length,
2709                                					sbp);
2710 
2711            delete_hsp = Blast_HSPTest(hsp, hit_params->options, align_length);
2712       }
2713       if (delete_hsp) { /* This HSP is now below the cutoff */
2714          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2715          purge = TRUE;
2716       }
2717    }
2718 
2719    if (target_t)
2720       target_t = BlastTargetTranslationFree(target_t);
2721 
2722    if (purge)
2723       Blast_HSPListPurgeNullHSPs(hsp_list);
2724 
2725    /* Sort the HSP array by score (scores may have changed!) */
2726    Blast_HSPListSortByScore(hsp_list);
2727    Blast_HSPListAdjustOddBlastnScores(hsp_list, FALSE, sbp);
2728    return 0;
2729 }
2730 
2731 /** Combine two HSP lists, without altering the individual HSPs, and without
2732  * reallocating the HSP array.
2733  * @param hsp_list New HSP list [in]
2734  * @param combined_hsp_list Old HSP list, to which new HSPs are added [in] [out]
2735  * @param new_hspcnt How many HSPs to save in the combined list? The extra ones
2736  *                   are freed. The best scoring HSPs are saved. This argument
2737  *                   cannot be greater than the allocated size of the
2738  *                   combined list's HSP array. [in]
2739  */
2740 static void
s_BlastHSPListsCombineByScore(BlastHSPList * hsp_list,BlastHSPList * combined_hsp_list,Int4 new_hspcnt)2741 s_BlastHSPListsCombineByScore(BlastHSPList* hsp_list,
2742                              BlastHSPList* combined_hsp_list,
2743                              Int4 new_hspcnt)
2744 {
2745    Int4 index, index1, index2;
2746    BlastHSP** new_hsp_array;
2747 
2748    ASSERT(new_hspcnt <= combined_hsp_list->allocated);
2749 
2750    if (new_hspcnt >= hsp_list->hspcnt + combined_hsp_list->hspcnt) {
2751       /* All HSPs from both arrays are saved */
2752       for (index=combined_hsp_list->hspcnt, index1=0;
2753            index1<hsp_list->hspcnt; index1++) {
2754          if (hsp_list->hsp_array[index1] != NULL)
2755             combined_hsp_list->hsp_array[index++] = hsp_list->hsp_array[index1];
2756       }
2757       combined_hsp_list->hspcnt = new_hspcnt;
2758       Blast_HSPListSortByScore(combined_hsp_list);
2759    } else {
2760       /* Not all HSPs are be saved; sort both arrays by score and save only
2761          the new_hspcnt best ones.
2762          For the merged set of HSPs, allocate array the same size as in the
2763          old HSP list. */
2764       new_hsp_array = (BlastHSP**)
2765          malloc(combined_hsp_list->allocated*sizeof(BlastHSP*));
2766 
2767       Blast_HSPListSortByScore(combined_hsp_list);
2768       Blast_HSPListSortByScore(hsp_list);
2769       index1 = index2 = 0;
2770       for (index = 0; index < new_hspcnt; ++index) {
2771          if (index1 < combined_hsp_list->hspcnt &&
2772              (index2 >= hsp_list->hspcnt ||
2773               ScoreCompareHSPs(&combined_hsp_list->hsp_array[index1],
2774                                &hsp_list->hsp_array[index2]) <= 0)) {
2775             new_hsp_array[index] = combined_hsp_list->hsp_array[index1];
2776             ++index1;
2777          } else {
2778             new_hsp_array[index] = hsp_list->hsp_array[index2];
2779             ++index2;
2780          }
2781       }
2782       /* Free the extra HSPs that could not be saved */
2783       for ( ; index1 < combined_hsp_list->hspcnt; ++index1) {
2784          combined_hsp_list->hsp_array[index1] =
2785             Blast_HSPFree(combined_hsp_list->hsp_array[index1]);
2786       }
2787       for ( ; index2 < hsp_list->hspcnt; ++index2) {
2788          hsp_list->hsp_array[index2] =
2789             Blast_HSPFree(hsp_list->hsp_array[index2]);
2790       }
2791       /* Point combined_hsp_list's HSP array to the new one */
2792       sfree(combined_hsp_list->hsp_array);
2793       combined_hsp_list->hsp_array = new_hsp_array;
2794       combined_hsp_list->hspcnt = new_hspcnt;
2795    }
2796 
2797    /* Second HSP list now does not own any HSPs */
2798    hsp_list->hspcnt = 0;
2799 }
2800 
Blast_HSPListAppend(BlastHSPList ** old_hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max)2801 Int2 Blast_HSPListAppend(BlastHSPList** old_hsp_list_ptr,
2802         BlastHSPList** combined_hsp_list_ptr, Int4 hsp_num_max)
2803 {
2804    BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2805    BlastHSPList* hsp_list = *old_hsp_list_ptr;
2806    Int4 new_hspcnt;
2807 
2808    if (!hsp_list || hsp_list->hspcnt == 0)
2809       return 0;
2810 
2811    /* If no previous HSP list, return a pointer to the old one */
2812    if (!combined_hsp_list) {
2813       *combined_hsp_list_ptr = hsp_list;
2814       *old_hsp_list_ptr = NULL;
2815       return 0;
2816    }
2817 
2818    /* Just append new list to the end of the old list, in case of
2819       multiple frames of the subject sequence */
2820    new_hspcnt = MIN(combined_hsp_list->hspcnt + hsp_list->hspcnt,
2821                     hsp_num_max);
2822    if (new_hspcnt > combined_hsp_list->allocated &&
2823        !combined_hsp_list->do_not_reallocate) {
2824       Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
2825       BlastHSP** new_hsp_array;
2826       new_hsp_array = (BlastHSP**)
2827          realloc(combined_hsp_list->hsp_array,
2828                  new_allocated*sizeof(BlastHSP*));
2829 
2830       if (new_hsp_array) {
2831          combined_hsp_list->allocated = new_allocated;
2832          combined_hsp_list->hsp_array = new_hsp_array;
2833       } else {
2834          combined_hsp_list->do_not_reallocate = TRUE;
2835          new_hspcnt = combined_hsp_list->allocated;
2836       }
2837    }
2838    if (combined_hsp_list->allocated == hsp_num_max)
2839       combined_hsp_list->do_not_reallocate = TRUE;
2840 
2841    s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
2842 
2843    hsp_list = Blast_HSPListFree(hsp_list);
2844    *old_hsp_list_ptr = NULL;
2845 
2846    return 0;
2847 }
2848 
Blast_HSPListsMerge(BlastHSPList ** hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max,Int4 * split_offsets,Int4 contexts_per_query,Int4 chunk_overlap_size,Boolean allow_gap,Boolean short_reads)2849 Int2 Blast_HSPListsMerge(BlastHSPList** hsp_list_ptr,
2850                    BlastHSPList** combined_hsp_list_ptr,
2851                    Int4 hsp_num_max, Int4 *split_offsets,
2852                    Int4 contexts_per_query, Int4 chunk_overlap_size,
2853                    Boolean allow_gap, Boolean short_reads)
2854 {
2855    BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2856    BlastHSPList* hsp_list = *hsp_list_ptr;
2857    BlastHSP* hsp1, *hsp2, *hsp_var;
2858    BlastHSP** hspp1,** hspp2;
2859    Int4 index1, index2;
2860    Int4 hspcnt1, hspcnt2, new_hspcnt = 0;
2861    Int4 start_diag, end_diag;
2862    Int4 offset_idx;
2863    BlastHSP** new_hsp_array;
2864 
2865    if (!hsp_list || hsp_list->hspcnt == 0)
2866       return 0;
2867 
2868    /* If no previous HSP list, just return a copy of the new one. */
2869    if (!combined_hsp_list) {
2870       *combined_hsp_list_ptr = hsp_list;
2871       *hsp_list_ptr = NULL;
2872       return 0;
2873    }
2874 
2875    /* Merge the two HSP lists for successive chunks of the subject sequence.
2876       First put all HSPs that intersect the overlap region at the front of
2877       the respective HSP arrays. Note that if the query sequence is
2878       assumed split, each context of the query sequence can have a
2879       different split point */
2880    hspcnt1 = hspcnt2 = 0;
2881 
2882    if (contexts_per_query < 0) {      /* subject seq is split */
2883       for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2884          hsp1 = combined_hsp_list->hsp_array[index1];
2885          if (hsp1->subject.end > split_offsets[0]) {
2886             /* At least part of this HSP lies in the overlap strip. */
2887             hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2888             combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2889             combined_hsp_list->hsp_array[index1] = hsp_var;
2890             ++hspcnt1;
2891          }
2892       }
2893       for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2894          hsp2 = hsp_list->hsp_array[index2];
2895          if (hsp2->subject.offset < split_offsets[0] + chunk_overlap_size) {
2896             /* At least part of this HSP lies in the overlap strip. */
2897             hsp_var = hsp_list->hsp_array[hspcnt2];
2898             hsp_list->hsp_array[hspcnt2] = hsp2;
2899             hsp_list->hsp_array[index2] = hsp_var;
2900             ++hspcnt2;
2901          }
2902       }
2903    }
2904    else {            /* query seq is split */
2905 
2906       /* An HSP can be a candidate for merging if it lies in the
2907          overlap region. Whether this is true depends on whether the
2908          HSP starts to the left of the split point, or ends to the
2909          right of the overlap region. A complication is that 'left'
2910          and 'right' have opposite meaning when the HSP is on the
2911          minus strand of the query sequence */
2912 
2913       for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2914          hsp1 = combined_hsp_list->hsp_array[index1];
2915          offset_idx = hsp1->context % contexts_per_query;
2916          if (split_offsets[offset_idx] < 0) continue;
2917          if ((hsp1->query.frame >= 0 && hsp1->query.end >
2918                          split_offsets[offset_idx]) ||
2919              (hsp1->query.frame < 0 && hsp1->query.offset <
2920                          split_offsets[offset_idx] + chunk_overlap_size)) {
2921             /* At least part of this HSP lies in the overlap strip. */
2922             hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2923             combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2924             combined_hsp_list->hsp_array[index1] = hsp_var;
2925             ++hspcnt1;
2926          }
2927       }
2928       for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2929          hsp2 = hsp_list->hsp_array[index2];
2930          offset_idx = hsp2->context % contexts_per_query;
2931          if (split_offsets[offset_idx] < 0) continue;
2932          if ((hsp2->query.frame < 0 && hsp2->query.end >
2933                          split_offsets[offset_idx]) ||
2934              (hsp2->query.frame >= 0 && hsp2->query.offset <
2935                          split_offsets[offset_idx] + chunk_overlap_size)) {
2936             /* At least part of this HSP lies in the overlap strip. */
2937             hsp_var = hsp_list->hsp_array[hspcnt2];
2938             hsp_list->hsp_array[hspcnt2] = hsp2;
2939             hsp_list->hsp_array[index2] = hsp_var;
2940             ++hspcnt2;
2941          }
2942       }
2943    }
2944 
2945    /* the merge process is independent of whether merging happens
2946       between query chunks or subject chunks */
2947 
2948    if (hspcnt1 > 0 && hspcnt2 > 0) {
2949       hspp1 = combined_hsp_list->hsp_array;
2950       hspp2 = hsp_list->hsp_array;
2951 
2952       for (index1 = 0; index1 < hspcnt1; index1++) {
2953 
2954          hsp1 = hspp1[index1];
2955 
2956          for (index2 = 0; index2 < hspcnt2; index2++) {
2957 
2958             hsp2 = hspp2[index2];
2959 
2960             /* Skip already deleted HSPs, or HSPs from different contexts */
2961             if (!hsp2 || hsp1->context != hsp2->context)
2962                continue;
2963 
2964             /* Short read qureies are shorter than the overlap region and may
2965                already have a traceback */
2966             if (short_reads) {
2967                 hspp2[index2] = Blast_HSPFree(hsp2);
2968                 continue;
2969             }
2970 
2971             /* we have to determine the starting diagonal of one HSP
2972                and the ending diagonal of the other */
2973 
2974             if (contexts_per_query < 0 || hsp1->query.frame >= 0) {
2975                end_diag = s_HSPEndDiag(hsp1);
2976                start_diag = s_HSPStartDiag(hsp2);
2977             }
2978             else {
2979                end_diag = s_HSPEndDiag(hsp2);
2980                start_diag = s_HSPStartDiag(hsp1);
2981             }
2982 
2983             if (ABS(end_diag - start_diag) < OVERLAP_DIAG_CLOSE) {
2984                if (s_BlastMergeTwoHSPs(hsp1, hsp2, allow_gap)) {
2985                   /* Free the second HSP. */
2986                   hspp2[index2] = Blast_HSPFree(hsp2);
2987                }
2988             }
2989          }
2990       }
2991 
2992       /* Purge the nulled out HSPs from the new HSP list */
2993       Blast_HSPListPurgeNullHSPs(hsp_list);
2994    }
2995 
2996    /* The new number of HSPs is now the sum of the remaining counts in the
2997       two lists, but if there is a restriction on the number of HSPs to keep,
2998       it might have to be reduced. */
2999    new_hspcnt =
3000       MIN(hsp_list->hspcnt + combined_hsp_list->hspcnt, hsp_num_max);
3001 
3002    if (new_hspcnt >= combined_hsp_list->allocated-1 &&
3003        combined_hsp_list->do_not_reallocate == FALSE) {
3004       Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
3005       if (new_allocated > combined_hsp_list->allocated) {
3006          new_hsp_array = (BlastHSP**)
3007             realloc(combined_hsp_list->hsp_array,
3008                     new_allocated*sizeof(BlastHSP*));
3009          if (new_hsp_array == NULL) {
3010             combined_hsp_list->do_not_reallocate = TRUE;
3011          } else {
3012             combined_hsp_list->hsp_array = new_hsp_array;
3013             combined_hsp_list->allocated = new_allocated;
3014          }
3015       } else {
3016          combined_hsp_list->do_not_reallocate = TRUE;
3017       }
3018       new_hspcnt = MIN(new_hspcnt, combined_hsp_list->allocated);
3019    }
3020 
3021    s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
3022 
3023    hsp_list = Blast_HSPListFree(hsp_list);
3024    *hsp_list_ptr = NULL;
3025 
3026    return 0;
3027 }
3028 
Blast_HSPListAdjustOffsets(BlastHSPList * hsp_list,Int4 offset)3029 void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset)
3030 {
3031    Int4 index;
3032 
3033    if (offset == 0) {
3034        return;
3035    }
3036 
3037    for (index=0; index<hsp_list->hspcnt; index++) {
3038       BlastHSP* hsp = hsp_list->hsp_array[index];
3039       hsp->subject.offset += offset;
3040       hsp->subject.end += offset;
3041       hsp->subject.gapped_start += offset;
3042    }
3043 }
3044 
Blast_HSPListAdjustOddBlastnScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)3045 void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list,
3046                                         Boolean gapped_calculation,
3047                                         const BlastScoreBlk* sbp)
3048 {
3049     int index;
3050 
3051     if (!hsp_list || hsp_list->hspcnt == 0 ||
3052         gapped_calculation == FALSE ||
3053         sbp->round_down == FALSE)
3054        return;
3055 
3056     for (index = 0; index < hsp_list->hspcnt; ++index)
3057         hsp_list->hsp_array[index]->score &= ~1;
3058 
3059     /* Sort the HSPs again, since the order may have to be different now. */
3060     Blast_HSPListSortByScore(hsp_list);
3061 }
3062 
3063 /** Callback for sorting hsp lists by their best evalue/score;
3064  * Evalues are compared with the condition that if both are close enough to
3065  * zero (currently < 1.0e-180), they are considered equal.
3066  * It is assumed that the HSP arrays in each hit list are already sorted by
3067  * e-value/score.
3068 */
3069 static int
s_EvalueCompareHSPLists(const void * v1,const void * v2)3070 s_EvalueCompareHSPLists(const void* v1, const void* v2)
3071 {
3072    BlastHSPList* h1,* h2;
3073    int retval = 0;
3074 
3075    h1 = *(BlastHSPList**) v1;
3076    h2 = *(BlastHSPList**) v2;
3077 
3078    /* If any of the HSP lists is empty, it is considered "worse" than the
3079       other, unless the other is also empty. */
3080    if (h1->hspcnt == 0 && h2->hspcnt == 0)
3081       return 0;
3082    else if (h1->hspcnt == 0)
3083       return 1;
3084    else if (h2->hspcnt == 0)
3085       return -1;
3086 
3087    if ((retval = s_EvalueComp(h1->best_evalue,
3088                                    h2->best_evalue)) != 0)
3089       return retval;
3090 
3091    if (h1->hsp_array[0]->score > h2->hsp_array[0]->score)
3092       return -1;
3093    if (h1->hsp_array[0]->score < h2->hsp_array[0]->score)
3094       return 1;
3095 
3096    /* In case of equal best E-values and scores, order will be determined
3097       by ordinal ids of the subject sequences */
3098    return BLAST_CMP(h2->oid, h1->oid);
3099 }
3100 
3101 /** Callback for sorting hsp lists by their best e-value/score, in
3102  * reverse order - from higher e-value to lower (lower score to higher).
3103 */
3104 static int
s_EvalueCompareHSPListsRev(const void * v1,const void * v2)3105 s_EvalueCompareHSPListsRev(const void* v1, const void* v2)
3106 {
3107    return s_EvalueCompareHSPLists(v2, v1);
3108 }
3109 
3110 /********************************************************************************
3111           Functions manipulating BlastHitList's
3112 ********************************************************************************/
3113 
3114 /*
3115    description in blast_hits.h
3116  */
Blast_HitListNew(Int4 hitlist_size)3117 BlastHitList* Blast_HitListNew(Int4 hitlist_size)
3118 {
3119    BlastHitList* new_hitlist = (BlastHitList*)
3120       calloc(1, sizeof(BlastHitList));
3121    new_hitlist->hsplist_max = hitlist_size;
3122    new_hitlist->low_score = INT4_MAX;
3123    new_hitlist->hsplist_count = 0;
3124    new_hitlist->hsplist_current = 0;
3125    return new_hitlist;
3126 }
3127 
3128 /*
3129    description in blast_hits.h
3130 */
Blast_HitListFree(BlastHitList * hitlist)3131 BlastHitList* Blast_HitListFree(BlastHitList* hitlist)
3132 {
3133    if (!hitlist)
3134       return NULL;
3135 
3136    Blast_HitListHSPListsFree(hitlist);
3137    sfree(hitlist);
3138    return NULL;
3139 }
3140 
3141 /* description in blast_hits.h */
Blast_HitListHSPListsFree(BlastHitList * hitlist)3142 Int2 Blast_HitListHSPListsFree(BlastHitList* hitlist)
3143 {
3144    Int4 index;
3145 
3146    if (!hitlist)
3147 	return 0;
3148 
3149    for (index = 0; index < hitlist->hsplist_count; ++index)
3150       hitlist->hsplist_array[index] =
3151          Blast_HSPListFree(hitlist->hsplist_array[index]);
3152 
3153    sfree(hitlist->hsplist_array);
3154 
3155    hitlist->hsplist_count = 0;
3156 
3157    return 0;
3158 }
3159 
3160 /** Purge a BlastHitList of empty HSP lists.
3161  * @param hit_list BLAST hit list structure. [in] [out]
3162  */
3163 static void
s_BlastHitListPurge(BlastHitList * hit_list)3164 s_BlastHitListPurge(BlastHitList* hit_list)
3165 {
3166    Int4 index, hsplist_count;
3167 
3168    if (!hit_list)
3169       return;
3170 
3171    hsplist_count = hit_list->hsplist_count;
3172    for (index = 0; index < hsplist_count &&
3173            hit_list->hsplist_array[index]->hspcnt > 0; ++index);
3174 
3175    hit_list->hsplist_count = index;
3176    /* Free all empty HSP lists */
3177    for ( ; index < hsplist_count; ++index) {
3178       Blast_HSPListFree(hit_list->hsplist_array[index]);
3179    }
3180 }
3181 
3182 /** Given a BlastHitList* with a heapified HSP list array, remove
3183  *  the worst scoring HSP list and insert the new HSP list in the heap
3184  * @param hit_list Contains all HSP lists for a given query [in] [out]
3185  * @param hsp_list A new HSP list to be inserted into the hit list [in]
3186  */
3187 static void
s_BlastHitListInsertHSPListInHeap(BlastHitList * hit_list,BlastHSPList * hsp_list)3188 s_BlastHitListInsertHSPListInHeap(BlastHitList* hit_list,
3189                                  BlastHSPList* hsp_list)
3190 {
3191       Blast_HSPListFree(hit_list->hsplist_array[0]);
3192       hit_list->hsplist_array[0] = hsp_list;
3193       if (hit_list->hsplist_count >= 2) {
3194          s_Heapify((char*)hit_list->hsplist_array, (char*)hit_list->hsplist_array,
3195                  (char*)&hit_list->hsplist_array[hit_list->hsplist_count/2 - 1],
3196                  (char*)&hit_list->hsplist_array[hit_list->hsplist_count-1],
3197                  sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3198       }
3199       hit_list->worst_evalue =
3200          hit_list->hsplist_array[0]->best_evalue;
3201       hit_list->low_score = hit_list->hsplist_array[0]->hsp_array[0]->score;
3202 }
3203 
3204 /** Given a BlastHitList pointer this function makes the
3205  * hsplist_array larger, up to a maximum size.
3206  * These incremental increases are mostly an issue for users who
3207  * put in a very large number for number of hits to save, but only save a few.
3208  * @param hit_list object containing the hsplist_array to grow [in]
3209  * @return zero on success, 1 if full already.
3210  */
s_Blast_HitListGrowHSPListArray(BlastHitList * hit_list)3211 static Int2 s_Blast_HitListGrowHSPListArray(BlastHitList* hit_list)
3212 
3213 {
3214     const int kStartValue = 100; /* default number of hsplist_array to start with. */
3215 
3216     ASSERT(hit_list);
3217 
3218     if (hit_list->hsplist_current >= hit_list->hsplist_max)
3219        return 1;
3220 
3221     if (hit_list->hsplist_current <= 0)
3222        hit_list->hsplist_current = kStartValue;
3223     else
3224        hit_list->hsplist_current = MIN(2*hit_list->hsplist_current, hit_list->hsplist_max);
3225 
3226     hit_list->hsplist_array =
3227        (BlastHSPList**) realloc(hit_list->hsplist_array, hit_list->hsplist_current*sizeof(BlastHSPList*));
3228 
3229     if (hit_list->hsplist_array == NULL)
3230        return BLASTERR_MEMORY;
3231 
3232     return 0;
3233 }
3234 
Blast_HitListUpdate(BlastHitList * hit_list,BlastHSPList * hsp_list)3235 Int2 Blast_HitListUpdate(BlastHitList* hit_list,
3236                          BlastHSPList* hsp_list)
3237 {
3238    hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
3239 
3240 #ifndef NDEBUG
3241    ASSERT(s_BlastCheckBestEvalue(hsp_list) == TRUE); /* NCBI_FAKE_WARNING */
3242 #endif /* _DEBUG */
3243 
3244    if (hit_list->hsplist_count < hit_list->hsplist_max) {
3245       /* If the array of HSP lists for this query is not yet allocated,
3246          do it here */
3247       if (hit_list->hsplist_current == hit_list->hsplist_count)
3248       {
3249          Int2 status = s_Blast_HitListGrowHSPListArray(hit_list);
3250          if (status)
3251            return status;
3252       }
3253       /* Just add to the end; sort later */
3254       hit_list->hsplist_array[hit_list->hsplist_count++] = hsp_list;
3255       hit_list->worst_evalue =
3256          MAX(hsp_list->best_evalue, hit_list->worst_evalue);
3257       hit_list->low_score =
3258          MIN(hsp_list->hsp_array[0]->score, hit_list->low_score);
3259    } else {
3260       int evalue_order = 0;
3261       if (!hit_list->heapified) {
3262     	  /* make sure all hsp_list is sorted */
3263           int index;
3264           for (index =0; index < hit_list->hsplist_count; index++) {
3265               Blast_HSPListSortByEvalue(hit_list->hsplist_array[index]);
3266               hit_list->hsplist_array[index]->best_evalue = s_BlastGetBestEvalue(hit_list->hsplist_array[index]);
3267           }
3268           s_CreateHeap(hit_list->hsplist_array, hit_list->hsplist_count,
3269                        sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3270           hit_list->heapified = TRUE;
3271       }
3272 
3273       /* make sure the hsp_list is sorted.  We actually do not need to sort
3274          the full list: all that we need is the best score.   However, the
3275          following code assumes hsp_list->hsp_array[0] has the best score. */
3276       Blast_HSPListSortByEvalue(hsp_list);
3277       hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
3278       evalue_order = s_EvalueCompareHSPLists(&(hit_list->hsplist_array[0]), &hsp_list);
3279       if (evalue_order < 0) {
3280          /* This hit list is less significant than any of those already saved;
3281             discard it. Note that newer hits with score and e-value both equal
3282             to the current worst will be saved, at the expense of some older
3283             hit.
3284          */
3285          Blast_HSPListFree(hsp_list);
3286       } else {
3287          s_BlastHitListInsertHSPListInHeap(hit_list, hsp_list);
3288       }
3289    }
3290    return 0;
3291 }
3292 
3293 Int2
Blast_HitListPurgeNullHSPLists(BlastHitList * hit_list)3294 Blast_HitListPurgeNullHSPLists(BlastHitList* hit_list)
3295 {
3296    Int4 index, index1; /* loop indices. */
3297    Int4 hsplist_count; /* total number of HSPList's to iterate over. */
3298    BlastHSPList** hsplist_array;  /* hsplist_array to purge. */
3299 
3300    if (hit_list == NULL || hit_list->hsplist_count == 0)
3301       return 0;
3302 
3303    hsplist_array = hit_list->hsplist_array;
3304    hsplist_count = hit_list->hsplist_count;
3305 
3306    index1 = 0;
3307    for (index=0; index<hsplist_count; index++) {
3308       if (hsplist_array[index]) {
3309          hsplist_array[index1] = hsplist_array[index];
3310          index1++;
3311       }
3312    }
3313 
3314    for (index=index1; index<hsplist_count; index++) {
3315       hsplist_array[index] = NULL;
3316    }
3317 
3318    hit_list->hsplist_count = index1;
3319 
3320    return 0;
3321 }
3322 
Blast_HitListSortByEvalue(BlastHitList * hit_list)3323 Int2 Blast_HitListSortByEvalue(BlastHitList* hit_list)
3324 {
3325       if (hit_list && hit_list->hsplist_count > 1) {
3326          qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3327                   sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3328       }
3329       s_BlastHitListPurge(hit_list);
3330    return 0;
3331 }
3332 
3333 
3334 /********************************************************************************
3335           Functions manipulating BlastHSPResults
3336 ********************************************************************************/
3337 
Blast_HSPResultsNew(Int4 num_queries)3338 BlastHSPResults* Blast_HSPResultsNew(Int4 num_queries)
3339 {
3340    BlastHSPResults* retval = NULL;
3341 
3342    retval = (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3343    if ( !retval ) {
3344        return NULL;
3345    }
3346 
3347    retval->num_queries = num_queries;
3348    retval->hitlist_array = (BlastHitList**)
3349       calloc(num_queries, sizeof(BlastHitList*));
3350 
3351    if ( !retval->hitlist_array ) {
3352        return Blast_HSPResultsFree(retval);
3353    }
3354 
3355    return retval;
3356 }
3357 
Blast_HSPResultsFree(BlastHSPResults * results)3358 BlastHSPResults* Blast_HSPResultsFree(BlastHSPResults* results)
3359 {
3360    Int4 index;
3361 
3362    if (!results)
3363        return NULL;
3364 
3365    if (results->hitlist_array)
3366    {
3367    	for (index = 0; index < results->num_queries; ++index)
3368       		Blast_HitListFree(results->hitlist_array[index]);
3369    	sfree(results->hitlist_array);
3370    }
3371    sfree(results);
3372    return NULL;
3373 }
3374 
Blast_HSPResultsSortByEvalue(BlastHSPResults * results)3375 Int2 Blast_HSPResultsSortByEvalue(BlastHSPResults* results)
3376 {
3377    Int4 index;
3378    BlastHitList* hit_list;
3379 
3380    if (!results)
3381        return 0;
3382 
3383    for (index = 0; index < results->num_queries; ++index) {
3384       hit_list = results->hitlist_array[index];
3385       if (hit_list != NULL
3386               && hit_list->hsplist_count > 1
3387               && hit_list->hsplist_array != NULL) {
3388          qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3389                   sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3390       }
3391       s_BlastHitListPurge(hit_list);
3392    }
3393    return 0;
3394 }
3395 
Blast_HSPResultsReverseSort(BlastHSPResults * results)3396 Int2 Blast_HSPResultsReverseSort(BlastHSPResults* results)
3397 {
3398    Int4 index;
3399    BlastHitList* hit_list;
3400 
3401    for (index = 0; index < results->num_queries; ++index) {
3402       hit_list = results->hitlist_array[index];
3403       if (hit_list && hit_list->hsplist_count > 1) {
3404          qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3405                sizeof(BlastHSPList*), s_EvalueCompareHSPListsRev);
3406       }
3407       s_BlastHitListPurge(hit_list);
3408    }
3409    return 0;
3410 }
3411 
Blast_HSPResultsReverseOrder(BlastHSPResults * results)3412 Int2 Blast_HSPResultsReverseOrder(BlastHSPResults* results)
3413 {
3414    Int4 index;
3415    BlastHitList* hit_list;
3416 
3417    for (index = 0; index < results->num_queries; ++index) {
3418       hit_list = results->hitlist_array[index];
3419       if (hit_list && hit_list->hsplist_count > 1) {
3420 	 BlastHSPList* hsplist;
3421 	 Int4 index1;
3422 	 /* Swap HSP lists: first with last; second with second from the end,
3423 	    etc. */
3424 	 for (index1 = 0; index1 < hit_list->hsplist_count/2; ++index1) {
3425 	    hsplist = hit_list->hsplist_array[index1];
3426 	    hit_list->hsplist_array[index1] =
3427 	       hit_list->hsplist_array[hit_list->hsplist_count-index1-1];
3428 	    hit_list->hsplist_array[hit_list->hsplist_count-index1-1] =
3429 	       hsplist;
3430 	 }
3431       }
3432    }
3433    return 0;
3434 }
3435 
3436 /** Auxiliary structure for sorting HSPs */
3437 typedef struct SHspWrap {
3438     BlastHSPList *hsplist;  /**< The HSPList to which this HSP belongs */
3439     BlastHSP *hsp;          /**< HSP described by this structure */
3440 } SHspWrap;
3441 
3442 /** callback used to sort a list of encapsulated HSP
3443  *  structures in order of decreasing raw score
3444  *  -RMH-
3445  */
s_SortHspWrapRawScore(const void * x,const void * y)3446 static int s_SortHspWrapRawScore(const void *x, const void *y)
3447 {
3448     SHspWrap *wrap1 = (SHspWrap *)x;
3449     SHspWrap *wrap2 = (SHspWrap *)y;
3450     if (wrap1->hsp->score > wrap2->hsp->score)
3451         return -1;
3452     if (wrap1->hsp->score < wrap2->hsp->score)
3453         return 1;
3454 
3455     return 0;
3456 }
3457 
3458 // Masklevel filtering for rmblastn. -RMH-
Blast_HSPResultsApplyMasklevel(BlastHSPResults * results,const BlastQueryInfo * query_info,Int4 masklevel,Int4 query_length)3459 Int2 Blast_HSPResultsApplyMasklevel(BlastHSPResults *results,
3460                                     const BlastQueryInfo *query_info,
3461                                     Int4 masklevel, Int4 query_length)
3462 {
3463    Int4 i, j, k, m;
3464    Int4 hsp_count;
3465    SHspWrap *hsp_array;
3466    BlastIntervalTree *tree;
3467 
3468    /* set up the interval tree; subject offsets are not needed */
3469 
3470    tree = Blast_IntervalTreeInit(0, query_length + 1, 0, 0);
3471 
3472    for (i = 0; i < results->num_queries; i++) {
3473       BlastHitList *hitlist = results->hitlist_array[i];
3474       if (hitlist == NULL)
3475          continue;
3476 
3477       for (j = hsp_count = 0; j < hitlist->hsplist_count; j++) {
3478          BlastHSPList *hsplist = hitlist->hsplist_array[j];
3479          hsp_count += hsplist->hspcnt;
3480       }
3481 
3482       /* empty each HSP into a combined HSP array, then
3483          sort the array by raw score */
3484 
3485       hsp_array = (SHspWrap *)malloc(hsp_count * sizeof(SHspWrap));
3486 
3487       for (j = k = 0; j < hitlist->hsplist_count; j++) {
3488          BlastHSPList *hsplist = hitlist->hsplist_array[j];
3489          for (m = 0; m < hsplist->hspcnt; k++, m++) {
3490             BlastHSP *hsp = hsplist->hsp_array[m];
3491             hsp_array[k].hsplist = hsplist;
3492             hsp_array[k].hsp = hsp;
3493          }
3494          hsplist->hspcnt = 0;
3495       }
3496 
3497       qsort(hsp_array, hsp_count, sizeof(SHspWrap), s_SortHspWrapRawScore);
3498 
3499       /* Starting with the best HSP, use the interval tree to
3500          check that the query range of each HSP in the list has
3501          not already been enveloped by too many higher-scoring
3502          HSPs. If this is not the case, add the HSP back into results */
3503 
3504       Blast_IntervalTreeReset(tree);
3505 
3506       for (j = 0; j < hsp_count; j++) {
3507          BlastHSPList *hsplist = hsp_array[j].hsplist;
3508          BlastHSP *hsp = hsp_array[j].hsp;
3509 
3510          if (BlastIntervalTreeMasksHSP(tree,
3511                          hsp, query_info, 0, masklevel)) {
3512              Blast_HSPFree(hsp);
3513          }
3514          else {
3515              BlastIntervalTreeAddHSP(hsp, tree, query_info, eQueryOnlyStrandIndifferent);
3516              Blast_HSPListSaveHSP(hsplist, hsp);
3517 
3518              /* the first HSP added back into an HSPList
3519                 automatically has the best e-value */
3520              // RMH: hmmmmmmm
3521              if (hsplist->hspcnt == 1)
3522                  hsplist->best_evalue = hsp->evalue;
3523          }
3524       }
3525       sfree(hsp_array);
3526 
3527       /* remove any HSPLists that are still empty after the
3528          culling process. Sort any remaining lists by score */
3529       for (j = 0; j < hitlist->hsplist_count; j++) {
3530          BlastHSPList *hsplist = hitlist->hsplist_array[j];
3531          if (hsplist->hspcnt == 0) {
3532              hitlist->hsplist_array[j] =
3533                    Blast_HSPListFree(hitlist->hsplist_array[j]);
3534          }
3535          else {
3536              Blast_HSPListSortByScore(hitlist->hsplist_array[j]);
3537          }
3538       }
3539       Blast_HitListPurgeNullHSPLists(hitlist);
3540    }
3541 
3542    tree = Blast_IntervalTreeFree(tree);
3543    return 0;
3544 }
3545 
Blast_HSPResultsInsertHSPList(BlastHSPResults * results,BlastHSPList * hsp_list,Int4 hitlist_size)3546 Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results,
3547         BlastHSPList* hsp_list, Int4 hitlist_size)
3548 {
3549    if (!hsp_list || hsp_list->hspcnt == 0)
3550       return 0;
3551 
3552    ASSERT(hsp_list->query_index < results->num_queries);
3553 
3554    if (!results->hitlist_array[hsp_list->query_index]) {
3555        results->hitlist_array[hsp_list->query_index] =
3556            Blast_HitListNew(hitlist_size);
3557    }
3558    Blast_HitListUpdate(results->hitlist_array[hsp_list->query_index],
3559                        hsp_list);
3560    return 0;
3561 }
3562 
3563 BlastHSPResults**
PHIBlast_HSPResultsSplit(const BlastHSPResults * results,const SPHIQueryInfo * pattern_info)3564 PHIBlast_HSPResultsSplit(const BlastHSPResults* results,
3565                          const SPHIQueryInfo* pattern_info)
3566 {
3567     BlastHSPResults* *phi_results = NULL;
3568     int pattern_index, hit_index;
3569     int num_patterns;
3570     BlastHitList* hit_list = NULL;
3571     BlastHSPList** hsplist_array; /* Temporary per-pattern HSP lists. */
3572 
3573     if (!pattern_info || pattern_info->num_patterns == 0)
3574         return NULL;
3575 
3576     num_patterns = pattern_info->num_patterns;
3577 
3578     phi_results =
3579         (BlastHSPResults**) calloc(num_patterns, sizeof(BlastHSPResults*));
3580 
3581     if (!results || !results->hitlist_array[0])
3582         return phi_results;   /* An empty results set is expected if no hits. */
3583 
3584     hsplist_array = (BlastHSPList**) calloc(num_patterns, sizeof(BlastHSPList*));
3585     hit_list = results->hitlist_array[0];
3586 
3587     for (hit_index = 0; hit_index < hit_list->hsplist_count; ++hit_index) {
3588         BlastHSPList* hsp_list = hit_list->hsplist_array[hit_index];
3589         int hsp_index;
3590         /* Copy HSPs corresponding to different pattern occurrences into
3591            separate HSP lists. */
3592         for (hsp_index = 0; hsp_index < hsp_list->hspcnt; ++hsp_index) {
3593             BlastHSP* hsp = s_BlastHSPCopy(hsp_list->hsp_array[hsp_index]);
3594             pattern_index = hsp->pat_info->index;
3595             if (!hsplist_array[pattern_index])
3596                 hsplist_array[pattern_index] = Blast_HSPListNew(0);
3597             hsplist_array[pattern_index]->oid = hsp_list->oid;
3598             Blast_HSPListSaveHSP(hsplist_array[pattern_index], hsp);
3599         }
3600 
3601         /* Save HSP lists corresponding to different pattern occurrences
3602            in separate results structures. */
3603         for (pattern_index = 0; pattern_index < num_patterns;
3604              ++pattern_index) {
3605             if (hsplist_array[pattern_index]) {
3606                 if (!phi_results[pattern_index])
3607                     phi_results[pattern_index] = Blast_HSPResultsNew(1);
3608                 Blast_HSPResultsInsertHSPList(phi_results[pattern_index],
3609                                               hsplist_array[pattern_index],
3610                                               hit_list->hsplist_max);
3611                 hsplist_array[pattern_index] = NULL;
3612             }
3613         }
3614     }
3615 
3616     sfree(hsplist_array);
3617 
3618     /* Sort HSPLists in each of the results structures by e-value. */
3619     for (pattern_index = 0; pattern_index < num_patterns; ++pattern_index) {
3620         Blast_HSPResultsSortByEvalue(phi_results[pattern_index]);
3621     }
3622 
3623     return phi_results;
3624 }
3625 
3626 BlastHSPResults*
Blast_HSPResultsFromHSPStream(BlastHSPStream * hsp_stream,size_t num_queries,SBlastHitsParameters * bhp)3627 Blast_HSPResultsFromHSPStream(BlastHSPStream* hsp_stream,
3628                               size_t num_queries,
3629                               SBlastHitsParameters* bhp)
3630 {
3631     BlastHSPResults* retval = NULL;
3632     BlastHSPList* hsp_list = NULL;
3633 
3634     retval = Blast_HSPResultsNew((Int4) num_queries);
3635 
3636     while (BlastHSPStreamRead(hsp_stream, &hsp_list) != kBlastHSPStream_Eof) {
3637         Blast_HSPResultsInsertHSPList(retval, hsp_list,
3638                                       bhp->prelim_hitlist_size);
3639     }
3640     SBlastHitsParametersFree(bhp);
3641     return retval;
3642 }
3643 
3644 /** Comparison function for sorting HSP lists in increasing order of the
3645  * number of HSPs in a hit. Needed for s_TrimResultsByTotalHSPLimit below.
3646  * @param v1 Pointer to the first HSP list [in]
3647  * @param v2 Pointer to the second HSP list [in]
3648  */
3649 static int
s_CompareHsplistHspcnt(const void * v1,const void * v2)3650 s_CompareHsplistHspcnt(const void* v1, const void* v2)
3651 {
3652    BlastHSPList* r1 = *((BlastHSPList**) v1);
3653    BlastHSPList* r2 = *((BlastHSPList**) v2);
3654 
3655    if (r1->hspcnt < r2->hspcnt)
3656       return -1;
3657    else if (r1->hspcnt > r2->hspcnt)
3658       return 1;
3659    else
3660       return 0;
3661 }
3662 
3663 /** Removes extra results if a limit is imposed on the total number of HSPs
3664  * returned. If the search involves multiple query sequences, the total HSP
3665  * limit is applied separately to each query.
3666  * The trimming algorithm makes sure that at least 1 HSP is returned for each
3667  * database sequence hit. Suppose results for a given query consist of HSP
3668  * lists for N database sequences, and the limit is T. HSP lists are sorted in
3669  * order of increasing number of HSPs in each list. Then the algorithm proceeds
3670  * by leaving at most i*T/N HSPs for the first i HSP lists, for every
3671  * i = 1, 2, ..., N.
3672  * @param results Results after preliminary stage of a BLAST search [in|out]
3673  * @param total_hsp_limit Limit on total number of HSPs [in]
3674  * @return TRUE if HSP limit has been exceeded, FALSE otherwise.
3675  */
3676 static Boolean
s_TrimResultsByTotalHSPLimit(BlastHSPResults * results,Uint4 total_hsp_limit)3677 s_TrimResultsByTotalHSPLimit(BlastHSPResults* results, Uint4 total_hsp_limit)
3678 {
3679     int query_index;
3680     Boolean hsp_limit_exceeded = FALSE;
3681 
3682     if (total_hsp_limit == 0) {
3683         return hsp_limit_exceeded;
3684     }
3685 
3686     for (query_index = 0; query_index < results->num_queries; ++query_index) {
3687         BlastHitList* hit_list = NULL;
3688         BlastHSPList** hsplist_array = NULL;
3689         Int4 hsplist_count = 0;
3690         int subj_index;
3691 
3692         if ( !(hit_list = results->hitlist_array[query_index]) )
3693             continue;
3694         /* The count of HSPs is separate for each query. */
3695         hsplist_count = hit_list->hsplist_count;
3696 
3697         hsplist_array = (BlastHSPList**)
3698             malloc(hsplist_count*sizeof(BlastHSPList*));
3699 
3700         for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3701             hsplist_array[subj_index] = hit_list->hsplist_array[subj_index];
3702         }
3703 
3704         qsort((void*)hsplist_array, hsplist_count,
3705               sizeof(BlastHSPList*), s_CompareHsplistHspcnt);
3706 
3707         {
3708             Int4 tot_hsps = 0;  /* total number of HSPs */
3709             Uint4 hsp_per_seq = MAX(1, total_hsp_limit/hsplist_count);
3710             for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3711                 Int4 allowed_hsp_num = ((subj_index+1)*hsp_per_seq) - tot_hsps;
3712                 BlastHSPList* hsp_list = hsplist_array[subj_index];
3713                 if (hsp_list->hspcnt > allowed_hsp_num) {
3714                     /* Free the extra HSPs */
3715                     int hsp_index;
3716                     for (hsp_index = allowed_hsp_num;
3717                          hsp_index < hsp_list->hspcnt; ++hsp_index) {
3718                         Blast_HSPFree(hsp_list->hsp_array[hsp_index]);
3719                     }
3720                     hsp_list->hspcnt = allowed_hsp_num;
3721                     hsp_limit_exceeded = TRUE;
3722                 }
3723                 tot_hsps += hsp_list->hspcnt;
3724             }
3725         }
3726         sfree(hsplist_array);
3727     }
3728 
3729     return hsp_limit_exceeded;
3730 }
3731 
3732 typedef struct BlastHSPwOid {
3733 	BlastHSP *  hsp;
3734 	Int4 		oid;
3735 } BlastHSPwOid;
3736 
3737 static int
s_CompareScoreHSPwOid(const void * v1,const void * v2)3738 s_CompareScoreHSPwOid(const void* v1, const void* v2)
3739 {
3740    BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3741    BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3742    return (s_EvalueCompareHSPs(&(r1->hsp), &(r2->hsp)));
3743 }
3744 
3745 static int
s_CompareOidHSPwOid(const void * v1,const void * v2)3746 s_CompareOidHSPwOid(const void* v1, const void* v2)
3747 {
3748    BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3749    BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3750    return (r1->oid > r2->oid);
3751 }
3752 
3753 
3754 /* extended version of the above function. Provides information about query number
3755  * which exceeded number of HSP.
3756  * The hsp_limit_exceeded is of results->num_queries size guarantied.
3757  */
3758 static Boolean
s_TrimResultsByTotalHSPLimitEx(BlastHSPResults * results,Uint4 total_hsp_limit,Boolean * hsp_limit_exceeded)3759 s_TrimResultsByTotalHSPLimitEx(BlastHSPResults* results,
3760 	                       Uint4 total_hsp_limit,
3761 			       Boolean *hsp_limit_exceeded)
3762 {
3763     int query_index;
3764     Boolean  any_hsp_limit_exceeded = FALSE;
3765 
3766     if (total_hsp_limit == 0) {
3767         return any_hsp_limit_exceeded;
3768     }
3769 
3770     for (query_index = 0; query_index < results->num_queries; ++query_index) {
3771         BlastHitList* hit_list = NULL;
3772         Int4 hsplist_count = 0;
3773         int subj_index;
3774         Int4 total_hsps = 0;  /* total number of HSPs */
3775 
3776         if( hsp_limit_exceeded) hsp_limit_exceeded[query_index]  = FALSE;
3777 
3778         if ( !(hit_list = results->hitlist_array[query_index]) )
3779             continue;
3780         /* The count of HSPs is separate for each query. */
3781         hsplist_count = hit_list->hsplist_count;
3782 
3783         for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3784         	total_hsps += hit_list->hsplist_array[subj_index]->hspcnt;
3785         }
3786 
3787         if(total_hsps > total_hsp_limit)
3788         {
3789         	 BlastHSPwOid *  everything_list = (BlastHSPwOid *) malloc(total_hsps * sizeof(BlastHSPwOid));
3790         	 BlastHSPList * subj_list;
3791        		 int hsp_counter = 0;
3792        		 int max_hit_list_size = hit_list->hsplist_max;
3793         	 if( hsp_limit_exceeded) {
3794         		 hsp_limit_exceeded[query_index]  = TRUE;
3795         		 any_hsp_limit_exceeded = TRUE;
3796         	 }
3797         	 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3798         		 int subj_hsp;
3799         		 BlastHSP ** hsps_per_subj;
3800         		 subj_list = hit_list->hsplist_array[subj_index];
3801         		 hsps_per_subj = subj_list->hsp_array;
3802         		 for(subj_hsp=0; subj_hsp < subj_list->hspcnt; ++subj_hsp) {
3803         			 everything_list[hsp_counter].hsp = hsps_per_subj[subj_hsp];
3804         			 everything_list[hsp_counter].oid = subj_list->oid;
3805         			 hsps_per_subj[subj_hsp] = NULL;
3806         			 hsp_counter ++;
3807         		 }
3808 
3809         	 }
3810         	 results->hitlist_array[query_index] = Blast_HitListFree(hit_list);
3811         	 qsort((void*)everything_list, total_hsps, sizeof(BlastHSPwOid), s_CompareScoreHSPwOid);
3812 
3813         	 for(hsp_counter = total_hsp_limit; hsp_counter < total_hsps ; ++hsp_counter) {
3814         		 everything_list[hsp_counter].hsp = Blast_HSPFree(everything_list[hsp_counter].hsp);
3815         		 everything_list[hsp_counter].oid = 0x7fffff;
3816         	 }
3817 
3818         	 qsort((void*)everything_list, total_hsp_limit, sizeof(BlastHSPwOid), s_CompareOidHSPwOid);
3819        		 subj_list = NULL;
3820         	 for(hsp_counter = 0; hsp_counter < total_hsp_limit; ++ hsp_counter)
3821         	 {
3822         		 int hsp_counter_start = hsp_counter;
3823         		 int hspcnt;
3824         		 int num_hsp;
3825 
3826         		 while ((everything_list[hsp_counter].oid == everything_list[hsp_counter+1].oid) &&
3827         				 (hsp_counter + 1 < total_hsp_limit)) {
3828         			 hsp_counter ++;
3829         		 }
3830         		 num_hsp = hsp_counter -hsp_counter_start + 1;
3831         		 subj_list = Blast_HSPListNew(num_hsp);
3832         		 subj_list->oid = everything_list[hsp_counter].oid;
3833         		 subj_list->query_index = query_index;
3834 
3835         		 for(hspcnt = 0; hspcnt < num_hsp; ++hspcnt) {
3836         			 Blast_HSPListSaveHSP(subj_list, everything_list[hsp_counter_start + hspcnt].hsp);
3837         		 }
3838 
3839         		 Blast_HSPResultsInsertHSPList(results,subj_list, max_hit_list_size );
3840         	 }
3841         	 free(everything_list);
3842         }
3843     }
3844 
3845     return any_hsp_limit_exceeded;
3846 }
3847 
3848 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3849 Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream* hsp_stream,
3850                                    Uint4 num_queries,
3851                                    SBlastHitsParameters* hit_param,
3852                                    Uint4 max_num_hsps,
3853                                    Boolean* removed_hsps)
3854 {
3855     Boolean rm_hsps = FALSE;
3856     BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3857                                                             num_queries,
3858                                                             hit_param);
3859 
3860     rm_hsps = s_TrimResultsByTotalHSPLimit(retval, max_num_hsps);
3861     if (removed_hsps) {
3862         *removed_hsps = rm_hsps;
3863     }
3864     return retval;
3865 }
3866 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3867 Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream* hsp_stream,
3868                                    Uint4 num_queries,
3869                                    SBlastHitsParameters* hit_param,
3870                                    Uint4 max_num_hsps,
3871 				   Boolean *removed_hsps)
3872 {
3873     Boolean rm_hsps = FALSE;
3874     BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3875                                                             num_queries,
3876                                                             hit_param);
3877 
3878     rm_hsps = s_TrimResultsByTotalHSPLimitEx(retval, max_num_hsps,removed_hsps);
3879     if (removed_hsps) {
3880         *removed_hsps = rm_hsps;
3881     }
3882     return retval;
3883 }
3884