1 /* $Id: blast_hits.c,v 1.283 2016/09/13 14:49:13 fukanchi Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  *
27  */
28 
29 /** @file blast_hits.c
30  * BLAST functions for saving hits after the (preliminary) gapped alignment
31  */
32 
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_hits.h>
35 #include <algo/blast/core/blast_util.h>
36 #include <algo/blast/core/blast_def.h>
37 #include <algo/blast/core/blast_hspstream.h>
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41 
42 NCBI_XBLAST_EXPORT
SBlastHitsParametersNew(const BlastHitSavingOptions * hit_options,const BlastExtensionOptions * ext_options,const BlastScoringOptions * scoring_options,SBlastHitsParameters ** retval)43 Int2 SBlastHitsParametersNew(const BlastHitSavingOptions* hit_options,
44                              const BlastExtensionOptions* ext_options,
45                              const BlastScoringOptions* scoring_options,
46                              SBlastHitsParameters* *retval)
47 {
48        Int4 prelim_hitlist_size;
49        ASSERT(retval);
50        *retval = NULL;
51 
52        if (hit_options == NULL ||
53            ext_options == NULL ||
54            scoring_options == NULL)
55            return 1;
56 
57        *retval = (SBlastHitsParameters*) malloc(sizeof(SBlastHitsParameters));
58        if (*retval == NULL)
59            return 2;
60 
61        prelim_hitlist_size = hit_options->hitlist_size;
62        if (ext_options->compositionBasedStats)
63             prelim_hitlist_size = prelim_hitlist_size * 2 + 50;
64        else if (scoring_options->gapped_calculation)
65             prelim_hitlist_size = MIN(2 * prelim_hitlist_size,
66                                       prelim_hitlist_size + 50);
67 
68        (*retval)->prelim_hitlist_size = MAX(prelim_hitlist_size, 10);
69 
70        (*retval)->hsp_num_max = BlastHspNumMax(scoring_options->gapped_calculation, hit_options);
71 
72        return 0;
73 }
74 
75 SBlastHitsParameters*
SBlastHitsParametersDup(const SBlastHitsParameters * hit_params)76 SBlastHitsParametersDup(const SBlastHitsParameters* hit_params)
77 {
78     SBlastHitsParameters* retval = (SBlastHitsParameters*)
79         malloc(sizeof(SBlastHitsParameters));
80 
81     if ( !retval ) {
82         return NULL;
83     }
84 
85     memcpy((void*)retval, (void*) hit_params, sizeof(SBlastHitsParameters));
86     return retval;
87 }
88 
89 NCBI_XBLAST_EXPORT
SBlastHitsParametersFree(SBlastHitsParameters * param)90 SBlastHitsParameters* SBlastHitsParametersFree(SBlastHitsParameters* param)
91 {
92        if (param)
93        {
94                sfree(param);
95        }
96        return NULL;
97 }
98 
99 
100 
101 /********************************************************************************
102           Functions manipulating BlastHSP's
103 ********************************************************************************/
104 
Blast_HSPFree(BlastHSP * hsp)105 BlastHSP* Blast_HSPFree(BlastHSP* hsp)
106 {
107    if (!hsp)
108       return NULL;
109    hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
110    hsp->map_info = BlastHSPMappingInfoFree(hsp->map_info);
111    sfree(hsp->pat_info);
112    sfree(hsp);
113    return NULL;
114 }
115 
Blast_HSPNew(void)116 BlastHSP* Blast_HSPNew(void)
117 {
118      BlastHSP* new_hsp = (BlastHSP*) calloc(1, sizeof(BlastHSP));
119      return new_hsp;
120 }
121 
122 /*
123    Comments in blast_hits.h
124 */
125 Int2
Blast_HSPInit(Int4 query_start,Int4 query_end,Int4 subject_start,Int4 subject_end,Int4 query_gapped_start,Int4 subject_gapped_start,Int4 query_context,Int2 query_frame,Int2 subject_frame,Int4 score,GapEditScript ** gap_edit,BlastHSP ** ret_hsp)126 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start,
127               Int4 subject_end, Int4 query_gapped_start,
128               Int4 subject_gapped_start, Int4 query_context,
129               Int2 query_frame, Int2 subject_frame, Int4 score,
130               GapEditScript* *gap_edit, BlastHSP* *ret_hsp)
131 {
132    BlastHSP* new_hsp = NULL;
133 
134    if (!ret_hsp)
135       return -1;
136 
137    new_hsp = Blast_HSPNew();
138 
139    *ret_hsp = NULL;
140 
141    if (new_hsp == NULL)
142 	return BLASTERR_MEMORY;
143 
144 
145    new_hsp->query.offset = query_start;
146    new_hsp->subject.offset = subject_start;
147    new_hsp->query.end = query_end;
148    new_hsp->subject.end = subject_end;
149    new_hsp->query.gapped_start = query_gapped_start;
150    new_hsp->subject.gapped_start = subject_gapped_start;
151    new_hsp->context = query_context;
152    new_hsp->query.frame = query_frame;
153    new_hsp->subject.frame = subject_frame;
154    new_hsp->score = score;
155    if (gap_edit && *gap_edit)
156    { /* If this is non-NULL transfer ownership. */
157         new_hsp->gap_info = *gap_edit;
158         *gap_edit = NULL;
159    }
160 
161    *ret_hsp = new_hsp;
162 
163    return 0;
164 }
165 
166 
BlastHSPMappingInfoFree(BlastHSPMappingInfo * info)167 BlastHSPMappingInfo* BlastHSPMappingInfoFree(BlastHSPMappingInfo* info)
168 {
169    if (!info) {
170        return NULL;
171    }
172 
173    info->edits = JumperEditsBlockFree(info->edits);
174    if (info->subject_overhangs) {
175        SequenceOverhangsFree(info->subject_overhangs);
176    }
177    sfree(info);
178 
179    return NULL;
180 }
181 
BlastHSPMappingInfoNew(void)182 BlastHSPMappingInfo* BlastHSPMappingInfoNew(void)
183 {
184     BlastHSPMappingInfo* retval = calloc(1, sizeof(BlastHSPMappingInfo));
185     return retval;
186 }
187 
BlastHspNumMax(Boolean gapped_calculation,const BlastHitSavingOptions * options)188 Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions* options)
189 {
190    Int4 retval=0;
191 
192    /* per-subject HSP limits do not apply to gapped searches; JIRA SB-616 */
193    if (options->hsp_num_max <= 0)
194    {
195       retval = INT4_MAX;
196    }
197    else
198    {
199       retval = options->hsp_num_max;
200    }
201 
202    return retval;
203 }
204 
205 /** Copies all contents of a BlastHSP structure. Used in PHI BLAST for splitting
206  * results corresponding to different pattern occurrences in query.
207  * @param hsp Original HSP [in]
208  * @return New HSP, copied from the original.
209  */
210 static BlastHSP*
s_BlastHSPCopy(const BlastHSP * hsp)211 s_BlastHSPCopy(const BlastHSP* hsp)
212 {
213     BlastHSP* new_hsp = NULL;
214     /* Do not pass the edit script, because we don't want to tranfer
215        ownership. */
216     Blast_HSPInit(hsp->query.offset, hsp->query.end, hsp->subject.offset,
217                   hsp->subject.end, hsp->query.gapped_start,
218                   hsp->subject.gapped_start, hsp->context,
219                   hsp->query.frame, hsp->subject.frame, hsp->score,
220                   NULL, &new_hsp);
221     new_hsp->evalue = hsp->evalue;
222     new_hsp->num = hsp->num;
223     new_hsp->num_ident = hsp->num_ident;
224     new_hsp->bit_score = hsp->bit_score;
225     new_hsp->comp_adjustment_method = hsp->comp_adjustment_method;
226     if (hsp->gap_info) {
227         new_hsp->gap_info = GapEditScriptDup(hsp->gap_info);
228     }
229 
230     if (hsp->pat_info) {
231         /* Copy this HSP's pattern data. */
232         new_hsp->pat_info =
233             (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
234     }
235     return new_hsp;
236 }
237 
238 /* Make a deep copy of an HSP */
Blast_HSPClone(const BlastHSP * hsp)239 BlastHSP* Blast_HSPClone(const BlastHSP* hsp)
240 {
241     BlastHSP* retval = NULL;
242 
243     if (!hsp) {
244         return NULL;
245     }
246 
247     retval = Blast_HSPNew();
248     if (retval) {
249         retval->score = hsp->score;
250         retval->num_ident = hsp->num_ident;
251         memcpy(&retval->query, &hsp->query, sizeof(BlastSeg));
252         memcpy(&retval->subject, &hsp->subject, sizeof(BlastSeg));
253         retval->context = hsp->context;
254         retval->evalue = hsp->evalue;
255         retval->bit_score = hsp->bit_score;
256         retval->num = hsp->num;
257         retval->comp_adjustment_method = hsp->comp_adjustment_method;
258         retval->num_positives = hsp->num_positives;
259 
260         /* copy gapped traceback */
261         if (hsp->gap_info) {
262             retval->gap_info = GapEditScriptDup(hsp->gap_info);
263             if (!retval->gap_info) {
264                 Blast_HSPFree(retval);
265                 return NULL;
266             }
267         }
268 
269         /* copy short read mapping data */
270         if (hsp->map_info) {
271             retval->map_info = BlastHSPMappingInfoNew();
272             if (!retval->map_info) {
273                 Blast_HSPFree(retval);
274                 return NULL;
275             }
276             retval->map_info->edits =
277                 JumperEditsBlockDup(hsp->map_info->edits);
278             if (!retval->map_info->edits) {
279                 Blast_HSPFree(retval);
280                 return NULL;
281             }
282             retval->map_info->left_edge = hsp->map_info->left_edge;
283             retval->map_info->right_edge = hsp->map_info->right_edge;
284 
285             if (hsp->map_info->subject_overhangs) {
286                 SequenceOverhangs* old = hsp->map_info->subject_overhangs;
287                 SequenceOverhangs* copy = calloc(1, sizeof(SequenceOverhangs));
288                 if (!copy) {
289                     Blast_HSPFree(retval);
290                     return NULL;
291                 }
292 
293                 if (old->left && old->left_len > 0) {
294                     copy->left_len = old->left_len;
295                     copy->left = malloc(copy->left_len * sizeof(Uint1));
296                     if (!copy->left) {
297                         SequenceOverhangsFree(copy);
298                         Blast_HSPFree(retval);
299                         return NULL;
300                     }
301                     memcpy(copy->left, old->left,
302                            copy->left_len * sizeof(Uint1));
303                 }
304 
305                 if (old->right && old->right_len > 0) {
306                     copy->right_len = old->right_len;
307                     copy->right = malloc(copy->right_len * sizeof(Uint1));
308                     if (!copy->right) {
309                         SequenceOverhangsFree(copy);
310                         Blast_HSPFree(retval);
311                     }
312                     memcpy(copy->right, old->right,
313                            copy->right_len * sizeof(Uint1));
314                 }
315 
316                 retval->map_info->subject_overhangs = copy;
317             }
318         }
319 
320         /* copy phi-blast pattern data */
321         if (hsp->pat_info) {
322             retval->pat_info =
323                 (SPHIHspInfo*) BlastMemDup(hsp->pat_info, sizeof(SPHIHspInfo));
324         }
325     }
326 
327     return retval;
328 }
329 
330 /** Count the number of occurrences of pattern in sequence, which
331  * do not overlap by more than half the pattern match length.
332  * @param query_info Query information structure, containing pattern info. [in]
333  */
334 Int4
PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo * query_info)335 PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo *query_info)
336 {
337     Int4 index; /*loop index*/
338     Int4 lastEffectiveOccurrence; /*last nonoverlapping occurrence*/
339     Int4 count; /* Count of effective (nonoverlapping) occurrences */
340     Int4 min_pattern_length;
341     SPHIQueryInfo* pat_info;
342 
343     ASSERT(query_info && query_info->pattern_info && query_info->contexts);
344 
345     pat_info = query_info->pattern_info;
346 
347     if (pat_info->num_patterns <= 1)
348         return pat_info->num_patterns;
349 
350     /* Minimal length of a pattern is saved in the length adjustment field. */
351     min_pattern_length = query_info->contexts[0].length_adjustment;
352 
353     count = 1;
354     lastEffectiveOccurrence = pat_info->occurrences[0].offset;
355     for(index = 1; index < pat_info->num_patterns; ++index) {
356         if (((pat_info->occurrences[index].offset - lastEffectiveOccurrence) * 2)
357             > min_pattern_length) {
358             lastEffectiveOccurrence = pat_info->occurrences[index].offset;
359             ++count;
360         }
361     }
362 
363     return count;
364 }
365 
366 
367 /** Calculate e-value for an HSP found by PHI BLAST.
368  * @param hsp An HSP found by PHI BLAST [in]
369  * @param sbp Scoring block with statistical parameters [in]
370  * @param query_info Structure containing information about pattern counts [in]
371  * @param pattern_blk Structure containing counts of PHI pattern hits [in]
372  */
373 static void
s_HSPPHIGetEvalue(BlastHSP * hsp,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)374 s_HSPPHIGetEvalue(BlastHSP* hsp, BlastScoreBlk* sbp,
375                   const BlastQueryInfo* query_info,
376                   const SPHIPatternSearchBlk* pattern_blk)
377 {
378    double paramC;
379    double Lambda;
380 
381    ASSERT(query_info && hsp && sbp && pattern_blk);
382 
383    paramC = sbp->kbp[0]->paramC;
384    Lambda = sbp->kbp[0]->Lambda;
385 
386    /* We have the actual number of occurrences of pattern in db. */
387    hsp->evalue = paramC*(1+Lambda*hsp->score)*
388                 PhiBlastGetEffectiveNumberOfPatterns(query_info)*
389                 pattern_blk->num_patterns_db*
390                 exp(-Lambda*hsp->score);
391 }
392 
393 /** Update HSP data after reevaluation with ambiguities. In particular this
394  * function calculates number of identities and checks if the percent identity
395  * criterion is satisfied.
396  * @param hsp HSP to update [in] [out]
397  * @param gapped Is this a gapped search? [in]
398  * @param cutoff_score Cutoff score for saving the HSP [in]
399  * @param score New score [in]
400  * @param query_start Start of query sequence [in]
401  * @param subject_start Start of subject sequence [in]
402  * @param best_q_start Pointer to start of the new alignment in query [in]
403  * @param best_q_end Pointer to end of the new alignment in query [in]
404  * @param best_s_start Pointer to start of the new alignment in subject [in]
405  * @param best_s_end Pointer to end of the new alignment in subject [in]
406  * @param best_start_esp_index index of the edit script array where the new alignment
407  *                       starts. [in]
408  * @param best_end_esp_index index in the edit script array where the new alignment
409  *                     ends. [in]
410  * @param best_end_esp_num Number of edit operations in the last edit script,
411  *                         that are included in the alignment. [in]
412  * @return TRUE if HSP is scheduled to be deleted.
413  */
414 static Boolean
s_UpdateReevaluatedHSP(BlastHSP * hsp,Boolean gapped,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end,int best_start_esp_index,int best_end_esp_index,int best_end_esp_num)415 s_UpdateReevaluatedHSP(BlastHSP* hsp, Boolean gapped,
416                        Int4 cutoff_score,
417                        Int4 score, const Uint1* query_start, const Uint1* subject_start,
418                        const Uint1* best_q_start, const Uint1* best_q_end,
419                        const Uint1* best_s_start, const Uint1* best_s_end,
420                        int best_start_esp_index,
421                        int best_end_esp_index,
422                        int best_end_esp_num)
423 {
424     Boolean delete_hsp = TRUE;
425 
426     hsp->score = score;
427 
428     if (hsp->score >= cutoff_score) {
429         /* Update all HSP offsets. */
430         hsp->query.offset = best_q_start - query_start;
431         hsp->query.end = hsp->query.offset + best_q_end - best_q_start;
432         hsp->subject.offset = best_s_start - subject_start;
433         hsp->subject.end = hsp->subject.offset + best_s_end - best_s_start;
434 
435         if (gapped) {
436             int last_num=hsp->gap_info->size - 1;
437             if (best_end_esp_index != last_num|| best_start_esp_index > 0)
438             {
439                 GapEditScript* esp_temp = GapEditScriptNew(best_end_esp_index-best_start_esp_index+1);
440                 GapEditScriptPartialCopy(esp_temp, 0, hsp->gap_info, best_start_esp_index, best_end_esp_index);
441                 hsp->gap_info = GapEditScriptDelete(hsp->gap_info);
442                 hsp->gap_info = esp_temp;
443             }
444             last_num = hsp->gap_info->size - 1;
445             hsp->gap_info->num[last_num] = best_end_esp_num;
446             ASSERT(best_end_esp_num >= 0);
447         }
448         delete_hsp = FALSE;
449     }
450 
451     return delete_hsp;
452 }
453 
Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP * hsp,const Uint1 * q,const Int4 qlen,const Uint1 * s,const Int4 slen,const BlastHitSavingParameters * hit_params,const BlastScoringParameters * score_params,const BlastScoreBlk * sbp)454 Boolean Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP* hsp,
455            const Uint1* q, const Int4 qlen,
456            const Uint1* s, const Int4 slen,
457            const BlastHitSavingParameters* hit_params,
458            const BlastScoringParameters* score_params,
459            const BlastScoreBlk* sbp)
460 {
461    Int4 sum, score, gap_open, gap_extend;
462    Int4 index; /* loop index */
463    Int4 qp, sp, ext;
464 
465    int best_start_esp_index = 0;
466    int best_end_esp_index = 0;
467    int current_start_esp_index = 0;
468    int best_end_esp_num = 0;
469    GapEditScript* esp;  /* Used to hold GapEditScript of hsp->gap_info */
470 
471    const Uint1* best_q_start; /* Start of the best scoring part in query. */
472    const Uint1* best_s_start; /* Start of the best scoring part in subject. */
473    const Uint1* best_q_end;   /* End of the best scoring part in query. */
474    const Uint1* best_s_end;   /* End of the best scoring part in subject. */
475 
476 
477    const Uint1* current_q_start; /* Start of the current part of the alignment in
478                            query. */
479    const Uint1* current_s_start; /* Start of the current part of the alignment in
480                            subject. */
481 
482    const Uint1* query,* subject;
483    Int4** matrix;
484    Int2 factor = 1;
485    const Uint1 kResidueMask = 0x0f;
486    Int4 cutoff_score = hit_params->cutoffs[hsp->context].cutoff_score;
487 
488    matrix = sbp->matrix->data;
489 
490    /* For a non-affine greedy case, calculate the real value of the gap
491       extension penalty. Multiply all scores by 2 if it is not integer. */
492    if (score_params->gap_open == 0 && score_params->gap_extend == 0) {
493       if (score_params->reward % 2 == 1)
494          factor = 2;
495       gap_open = 0;
496       gap_extend =
497          (score_params->reward - 2*score_params->penalty) * factor / 2;
498    } else {
499       gap_open = score_params->gap_open;
500       gap_extend = score_params->gap_extend;
501    }
502 
503    query = q + hsp->query.offset;
504    subject = s + hsp->subject.offset;
505    score = 0;
506    sum = 0;
507 
508    /* Point all pointers to the beginning of the alignment. */
509    best_q_start = best_q_end = current_q_start = query;
510    best_s_start = best_s_end = current_s_start = subject;
511    /* There are no previous edit scripts at the beginning. */
512 
513    best_end_esp_num = -1;
514    esp = hsp->gap_info;
515    if (!esp) return TRUE;
516    for (index=0; index<esp->size; index++)
517    {
518        int op_index = 0;  /* Index of an operation within a single edit script. */
519        for (op_index=0; op_index<esp->num[index]; )
520        {
521           /* Process substitutions one operation at a time, full gaps in one step. */
522           if (esp->op_type[index] == eGapAlignSub) {
523               sum += factor*matrix[*query & kResidueMask][*subject];
524               query++;
525               subject++;
526               op_index++;
527           } else if (esp->op_type[index] == eGapAlignDel) {
528               sum -= gap_open + gap_extend * esp->num[index];
529               subject += esp->num[index];
530               op_index += esp->num[index];
531           } else if (esp->op_type[index] == eGapAlignIns) {
532               sum -= gap_open + gap_extend * esp->num[index];
533               query += esp->num[index];
534               op_index += esp->num[index];
535           }
536 
537           if (sum < 0) {
538            /* Point current edit script chain start to the new place.
539               If we are in the middle of an edit script, reduce its length and
540               point operation index to the beginning of a modified edit script;
541               if we are at the end, move to the next edit script. */
542               if (op_index < esp->num[index]) {
543                   esp->num[index] -= op_index;
544                   current_start_esp_index = index;
545                   op_index = 0;
546               } else {
547                   current_start_esp_index = index + 1;
548               }
549               /* Set sum to 0, to start a fresh count. */
550               sum = 0;
551               /* Set current starting positions in sequences to the new start. */
552               current_q_start = query;
553               current_s_start = subject;
554 
555               /* If score has passed the cutoff at some point, leave the best score
556                  and edit scripts positions information untouched, otherwise reset
557                  the best score to 0 and point the best edit script positions to
558                  the new start. */
559               if (score < cutoff_score) {
560                   /* Start from new offset; discard all previous information. */
561                   best_q_start = query;
562                   best_s_start = subject;
563                   score = 0;
564 
565                   /* Set best start and end edit script pointers to new start. */
566                   best_start_esp_index = current_start_esp_index;
567                   best_end_esp_index = current_start_esp_index;
568               }
569              /* break; */ /* start on next GapEditScript. */
570           } else if (sum > score) {
571               /* Remember this point as the best scoring end point, and the current
572               start of the alignment as the start of the best alignment. */
573               score = sum;
574 
575               best_q_start = current_q_start;
576               best_s_start = current_s_start;
577               best_q_end = query;
578               best_s_end = subject;
579 
580               best_start_esp_index = current_start_esp_index;
581               best_end_esp_index = index;
582               best_end_esp_num = op_index;
583           }
584        }
585    } /* loop on edit scripts */
586 
587    score /= factor;
588 
589    if (best_start_esp_index < esp->size && best_end_esp_index < esp->size) {
590       /* post processing: try to extend further */
591       ASSERT(esp->op_type[best_start_esp_index] == eGapAlignSub);
592       ASSERT(esp->op_type[best_end_esp_index] == eGapAlignSub);
593 
594       qp = best_q_start - q;
595       sp = best_s_start - s;
596       ext = 0;
597       while(qp > 0 && sp > 0 && (q[--qp] == s[--sp]) && q[qp]<4) ext++;
598       best_q_start -= ext;
599       best_s_start -= ext;
600       esp->num[best_start_esp_index] += ext;
601       if (best_end_esp_index == best_start_esp_index) best_end_esp_num += ext;
602       score += ext * score_params->reward;
603 
604       qp = best_q_end - q;
605       sp = best_s_end - s;
606       ext = 0;
607       while(qp < qlen && sp < slen && q[qp]<4 && (q[qp++] == s[sp++])) ext++;
608       best_q_end += ext;
609       best_s_end += ext;
610       esp->num[best_end_esp_index] += ext;
611       best_end_esp_num += ext;
612       score += ext * score_params->reward;
613    }
614 
615    /* Update HSP data. */
616    return
617        s_UpdateReevaluatedHSP(hsp, TRUE, cutoff_score,
618                               score, q, s, best_q_start,
619                               best_q_end, best_s_start, best_s_end,
620                               best_start_esp_index, best_end_esp_index,
621                               best_end_esp_num);
622 }
623 
624 /** Update HSP data after reevaluation with ambiguities for an ungapped search.
625  * In particular this function calculates number of identities and checks if the
626  * percent identity criterion is satisfied.
627  * @param hsp HSP to update [in] [out]
628  * @param cutoff_score Cutoff score for saving the HSP [in]
629  * @param score New score [in]
630  * @param query_start Start of query sequence [in]
631  * @param subject_start Start of subject sequence [in]
632  * @param best_q_start Pointer to start of the new alignment in query [in]
633  * @param best_q_end Pointer to end of the new alignment in query [in]
634  * @param best_s_start Pointer to start of the new alignment in subject [in]
635  * @param best_s_end Pointer to end of the new alignment in subject [in]
636  * @return TRUE if HSP is scheduled to be deleted.
637  */
638 static Boolean
s_UpdateReevaluatedHSPUngapped(BlastHSP * hsp,Int4 cutoff_score,Int4 score,const Uint1 * query_start,const Uint1 * subject_start,const Uint1 * best_q_start,const Uint1 * best_q_end,const Uint1 * best_s_start,const Uint1 * best_s_end)639 s_UpdateReevaluatedHSPUngapped(BlastHSP* hsp, Int4 cutoff_score, Int4 score,
640                                const Uint1* query_start, const Uint1* subject_start,
641                                const Uint1* best_q_start, const Uint1* best_q_end,
642                                const Uint1* best_s_start, const Uint1* best_s_end)
643 {
644     return
645         s_UpdateReevaluatedHSP(hsp, FALSE, cutoff_score, score, query_start,
646                                subject_start, best_q_start, best_q_end,
647                                best_s_start, best_s_end, 0, 0, 0);
648 }
649 
650 Boolean
Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP * hsp,const Uint1 * query_start,const Uint1 * subject_start,const BlastInitialWordParameters * word_params,BlastScoreBlk * sbp,Boolean translated)651 Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP* hsp, const Uint1* query_start,
652    const Uint1* subject_start, const BlastInitialWordParameters* word_params,
653    BlastScoreBlk* sbp, Boolean translated)
654 {
655    Int4 sum, score;
656    Int4** matrix;
657    const Uint1* query,* subject;
658    const Uint1* best_q_start,* best_s_start,* best_q_end,* best_s_end;
659    const Uint1* current_q_start, * current_s_start;
660    Int4 index;
661    const Uint1 kResidueMask = (translated ? 0xff : 0x0f);
662    Int4 hsp_length = hsp->query.end - hsp->query.offset;
663    Int4 cutoff_score = word_params->cutoffs[hsp->context].cutoff_score;
664 
665    matrix = sbp->matrix->data;
666 
667    query = query_start + hsp->query.offset;
668    subject = subject_start + hsp->subject.offset;
669    score = 0;
670    sum = 0;
671    best_q_start = best_q_end = current_q_start = query;
672    best_s_start = best_s_end = current_s_start = subject;
673 
674    for (index = 0; index < hsp_length; ++index) {
675       sum += matrix[*query & kResidueMask][*subject];
676       query++;
677       subject++;
678       if (sum < 0) {
679           /* Start from new offset */
680           sum = 0;
681           current_q_start = query;
682           current_s_start = subject;
683           /* If previous top score never reached the cutoff, discard the front
684              part of the alignment completely. Otherwise keep pointer to the
685              top-scoring front part. */
686          if (score < cutoff_score) {
687             best_q_start = best_q_end = query;
688             best_s_start = best_s_end = subject;
689             score = 0;
690          }
691       } else if (sum > score) {
692          /* Remember this point as the best scoring end point */
693          score = sum;
694          best_q_end = query;
695          best_s_end = subject;
696          /* Set start of alignment to the current start, dismissing the
697             previous top-scoring piece. */
698          best_q_start = current_q_start;
699          best_s_start = current_s_start;
700       }
701    }
702 
703    /* Update HSP data. */
704    return
705        s_UpdateReevaluatedHSPUngapped(hsp, cutoff_score, score,
706                                       query_start, subject_start, best_q_start,
707                                       best_q_end, best_s_start, best_s_end);
708 }
709 
710 /** Calculate number of identities in a regular HSP.
711  * @param query The query sequence [in]
712  * @param subject The uncompressed subject sequence [in]
713  * @param hsp All information about the HSP [in]
714  * @param num_ident_ptr Number of identities [out]
715  * @param align_length_ptr The alignment length, including gaps [out]
716  * @param sbp Blast score blk [in]
717  * @param num_pos_ptr Number of Positives [out]
718  * @return 0 on success, -1 on invalid parameters or error
719  */
720 static Int2
s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)721 s_Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query, const Uint1* subject,
722                             			const BlastHSP* hsp, Int4* num_ident_ptr,
723                             			Int4* align_length_ptr, const BlastScoreBlk* sbp,
724                             			Int4* num_pos_ptr)
725 {
726    Int4 i, num_ident, align_length, q_off, s_off;
727    Uint1* q,* s;
728    Int4 q_length = hsp->query.end - hsp->query.offset;
729    Int4 s_length = hsp->subject.end - hsp->subject.offset;
730    Int4** matrix = NULL;
731    Int4 num_pos = 0;
732 
733    q_off = hsp->query.offset;
734    s_off = hsp->subject.offset;
735 
736    if ( !subject || !query || !hsp )
737       return -1;
738 
739    q = (Uint1*) &query[q_off];
740    s = (Uint1*) &subject[s_off];
741 
742    num_ident = 0;
743    align_length = 0;
744 
745    if(NULL != sbp)
746    {
747 	   if(sbp->protein_alphabet)
748 		   matrix = sbp->matrix->data;
749    }
750 
751    if (!hsp->gap_info) {
752       /* Ungapped case. Check that lengths are the same in query and subject,
753          then count number of matches. */
754       if (q_length != s_length)
755          return -1;
756       align_length = q_length;
757       for (i=0; i<align_length; i++) {
758          if (*q == *s)
759             num_ident++;
760          else if (NULL != matrix) {
761         	 if (matrix[*q][*s] > 0)
762         		 num_pos ++;
763              }
764          q++;
765          s++;
766       }
767    	}
768     else {
769       Int4 index;
770       GapEditScript* esp = hsp->gap_info;
771       for (index=0; index<esp->size; index++)
772       {
773          align_length += esp->num[index];
774          switch (esp->op_type[index]) {
775          case eGapAlignSub:
776             for (i=0; i<esp->num[index]; i++) {
777                if (*q == *s) {
778                   num_ident++;
779                }
780                else if (NULL != matrix) {
781             	   if (matrix[*q][*s] > 0)
782             		   num_pos ++;
783                }
784                q++;
785                s++;
786             }
787             break;
788          case eGapAlignDel:
789             s += esp->num[index];
790             break;
791          case eGapAlignIns:
792             q += esp->num[index];
793             break;
794          default:
795             s += esp->num[index];
796             q += esp->num[index];
797             break;
798          }
799       }
800    }
801 
802    if (align_length_ptr) {
803        *align_length_ptr = align_length;
804    }
805    *num_ident_ptr = num_ident;
806 
807    if(NULL != matrix)
808 	   *num_pos_ptr = num_pos + num_ident;
809 
810    return 0;
811 }
812 
813 /** Calculate number of identities in an HSP for an out-of-frame alignment.
814  * @param query The query sequence [in]
815  * @param subject The uncompressed subject sequence [in]
816  * @param hsp All information about the HSP [in]
817  * @param program BLAST program (blastx or tblastn) [in]
818  * @param num_ident_ptr Number of identities [out]
819  * @param align_length_ptr The alignment length, including gaps [out]
820  * @param sbp Blast score blk [in]
821  * @param num_pos_ptr Number of Positives [out]
822  * @return 0 on success, -1 on invalid parameters or error
823  */
824 static Int2
s_Blast_HSPGetOOFNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,const BlastHSP * hsp,EBlastProgramType program,Int4 * num_ident_ptr,Int4 * align_length_ptr,const BlastScoreBlk * sbp,Int4 * num_pos_ptr)825 s_Blast_HSPGetOOFNumIdentitiesAndPositives(
826 		const Uint1* query, const Uint1* subject,
827 		const BlastHSP* hsp, EBlastProgramType program,
828 		Int4* num_ident_ptr, Int4* align_length_ptr,
829 		const BlastScoreBlk* sbp, Int4* num_pos_ptr)
830 {
831    Int4 num_ident, align_length;
832    Int4 index;
833    Uint1* q,* s;
834    GapEditScript* esp;
835    Int4 ** matrix = NULL;
836    Int4 num_pos = 0;
837 
838    if (!hsp->gap_info || !subject || !query)
839       return -1;
840 
841    if(NULL != sbp)
842    {
843 	   if(sbp->protein_alphabet)
844 		   matrix = sbp->matrix->data;
845    }
846 
847    if (program == eBlastTypeTblastn ||
848        program == eBlastTypeRpsTblastn) {
849        q = (Uint1*) &query[hsp->query.offset];
850        s = (Uint1*) &subject[hsp->subject.offset];
851    } else {
852        s = (Uint1*) &query[hsp->query.offset];
853        q = (Uint1*) &subject[hsp->subject.offset];
854    }
855    num_ident = 0;
856    align_length = 0;
857 
858    esp = hsp->gap_info;
859    for (index=0; index<esp->size; index++)
860    {
861       int i;
862       switch (esp->op_type[index]) {
863       case eGapAlignSub: /* Substitution */
864          align_length += esp->num[index];
865          for (i=0; i<esp->num[index]; i++) {
866             if (*q == *s)
867                num_ident++;
868             else if (NULL != matrix) {
869               	 if (matrix[*q][*s] > 0)
870                		 num_pos ++;
871             }
872             ++q;
873             s += CODON_LENGTH;
874          }
875          break;
876       case eGapAlignIns: /* Insertion */
877          align_length += esp->num[index];
878          s += esp->num[index] * CODON_LENGTH;
879          break;
880       case eGapAlignDel: /* Deletion */
881          align_length += esp->num[index];
882          q += esp->num[index];
883          break;
884       case eGapAlignDel2: /* Gap of two nucleotides. */
885          s -= 2;
886          break;
887       case eGapAlignDel1: /* Gap of one nucleotide. */
888          s -= 1;
889          break;
890       case eGapAlignIns1: /* Insertion of one nucleotide. */
891          s += 1;
892          break;
893       case eGapAlignIns2: /* Insertion of two nucleotides. */
894          s += 2;
895          break;
896       default:
897          s += esp->num[index] * CODON_LENGTH;
898          q += esp->num[index];
899          break;
900       }
901    }
902 
903    if (align_length_ptr) {
904        *align_length_ptr = align_length;
905    }
906    *num_ident_ptr = num_ident;
907 
908    if(NULL != matrix)
909 	   *num_pos_ptr = num_pos + num_ident;
910 
911    return 0;
912 }
913 
914 Int2
Blast_HSPGetNumIdentities(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr)915 Blast_HSPGetNumIdentities(const Uint1* query,
916                           const Uint1* subject,
917                           BlastHSP* hsp,
918                           const BlastScoringOptions* score_options,
919                           Int4* align_length_ptr)
920 {
921     Int2 retval = 0;
922 
923     /* Calculate alignment length and number of identical letters.
924        Do not get the number of identities if the query is not available */
925     if (score_options->is_ooframe) {
926         retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
927                                                	   	   	    score_options->program_number,
928                                                	   	   	    &hsp->num_ident,
929                                                	   	   	    align_length_ptr,
930                                                	   	   	    NULL, NULL);
931     } else {
932         retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
933                                             		  	 &hsp->num_ident,
934                                             		  	 align_length_ptr,
935                                             		  	 NULL, NULL);
936     }
937 
938     return retval;
939 }
940 Int2
Blast_HSPGetNumIdentitiesAndPositives(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp,const BlastScoringOptions * score_options,Int4 * align_length_ptr,const BlastScoreBlk * sbp)941 Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query,
942         							  const Uint1* subject,
943         							  BlastHSP* hsp,
944         							  const BlastScoringOptions* score_options,
945         							  Int4* align_length_ptr,
946         							  const BlastScoreBlk * sbp)
947 {
948     Int2 retval = 0;
949 
950     /* Calculate alignment length and number of identical letters.
951        Do not get the number of identities if the query is not available */
952     if (score_options->is_ooframe) {
953         retval = s_Blast_HSPGetOOFNumIdentitiesAndPositives(query, subject, hsp,
954                                                	   	   	    score_options->program_number,
955                                                	   	   	    &hsp->num_ident,
956                                                	   	   	    align_length_ptr,
957                                                	   	   	    sbp, &hsp->num_positives);
958     } else {
959         retval = s_Blast_HSPGetNumIdentitiesAndPositives(query, subject, hsp,
960                                             		  	 &hsp->num_ident,
961                                             		  	 align_length_ptr,
962                                             		  	 sbp, &hsp->num_positives);
963     }
964 
965     return retval;
966 }
967 
s_HSPTest(const BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)968 static Boolean s_HSPTest(const BlastHSP* hsp,
969                          const BlastHitSavingOptions* hit_options,
970                          Int4 align_length)
971 {
972 	   return ((hsp->num_ident * 100.0 <
973 			   align_length * hit_options->percent_identity) ||
974 			   align_length < hit_options->min_hit_length) ;
975 
976 }
977 
978 Boolean
Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,BlastHSP * hsp,const Uint1 * query,const Uint1 * subject,const BlastScoringOptions * score_options,const BlastHitSavingOptions * hit_options)979 Blast_HSPTestIdentityAndLength(EBlastProgramType program_number,
980                                BlastHSP* hsp, const Uint1* query, const Uint1* subject,
981                                const BlastScoringOptions* score_options,
982                                const BlastHitSavingOptions* hit_options)
983 {
984    Int4 align_length = 0;
985    Boolean delete_hsp = FALSE;
986    Int2 status = 0;
987 
988    ASSERT(hsp && query && subject && score_options && hit_options);
989 
990    status = Blast_HSPGetNumIdentities(query, subject, hsp, score_options,
991                                       &align_length);
992    ASSERT(status == 0);
993    (void)status;    /* to pacify compiler warning */
994 
995    /* Check whether this HSP passes the percent identity and minimal hit
996       length criteria, and delete it if it does not. */
997    delete_hsp = s_HSPTest(hsp, hit_options, align_length);
998 
999    return delete_hsp;
1000 }
1001 
Blast_HSPTest(BlastHSP * hsp,const BlastHitSavingOptions * hit_options,Int4 align_length)1002 Boolean Blast_HSPTest(BlastHSP* hsp,
1003 		 	 	 	  const BlastHitSavingOptions* hit_options,
1004 		 	 	 	  Int4 align_length)
1005 {
1006 	return s_HSPTest(hsp, hit_options, align_length);
1007 }
1008 
Blast_HSPGetQueryCoverage(const BlastHSP * hsp,Int4 query_length)1009 double Blast_HSPGetQueryCoverage(const BlastHSP* hsp, Int4 query_length)
1010 {
1011 	double pct = 0;
1012     if(query_length > 0) {
1013             pct = 100.0 * (double) (hsp->query.end - hsp->query.offset)/ (double) query_length;
1014             if(pct < 99)
1015             	pct +=0.5;
1016     }
1017     return pct;
1018 }
1019 
Blast_HSPQueryCoverageTest(BlastHSP * hsp,double min_query_coverage_pct,Int4 query_length)1020 Boolean Blast_HSPQueryCoverageTest(BlastHSP* hsp,
1021         						   double min_query_coverage_pct,
1022         						   Int4 query_length)
1023 {
1024      double hsp_coverage = Blast_HSPGetQueryCoverage( hsp, query_length);
1025      return (hsp_coverage < min_query_coverage_pct);
1026 }
1027 
1028 
1029 void
Blast_HSPCalcLengthAndGaps(const BlastHSP * hsp,Int4 * length_out,Int4 * gaps_out,Int4 * gap_opens_out)1030 Blast_HSPCalcLengthAndGaps(const BlastHSP* hsp, Int4* length_out,
1031                            Int4* gaps_out, Int4* gap_opens_out)
1032 {
1033    Int4 length = hsp->query.end - hsp->query.offset;
1034    Int4 s_length = hsp->subject.end - hsp->subject.offset;
1035    Int4 gap_opens = 0, gaps = 0;
1036 
1037    if (hsp->gap_info) {
1038       GapEditScript* esp = hsp->gap_info;
1039       Int4 index;
1040       for (index=0; index<esp->size; index++) {
1041          if (esp->op_type[index] == eGapAlignDel) {
1042             length += esp->num[index];
1043             gaps += esp->num[index];
1044             ++gap_opens;
1045          } else if (esp->op_type[index] == eGapAlignIns) {
1046             ++gap_opens;
1047             gaps += esp->num[index];
1048          }
1049       }
1050    } else if (s_length > length) {
1051       length = s_length;
1052    }
1053 
1054    *length_out = length;
1055    *gap_opens_out = gap_opens;
1056    *gaps_out = gaps;
1057 }
1058 
1059 /** Adjust start and end of an HSP in a translated sequence segment.
1060  * @param segment BlastSeg structure (part of BlastHSP) [in]
1061  * @param seq_length Length of the full sequence [in]
1062  * @param start Start of the alignment in this segment in nucleotide
1063  *              coordinates, 1-offset [out]
1064  * @param end End of the alignment in this segment in nucleotide
1065  *            coordinates, 1-offset [out]
1066  */
1067 static void
s_BlastSegGetTranslatedOffsets(const BlastSeg * segment,Int4 seq_length,Int4 * start,Int4 * end)1068 s_BlastSegGetTranslatedOffsets(const BlastSeg* segment, Int4 seq_length,
1069                               Int4* start, Int4* end)
1070 {
1071    if (segment->frame < 0)	{
1072       *start = seq_length - CODON_LENGTH*segment->offset + segment->frame;
1073       *end = seq_length - CODON_LENGTH*segment->end + segment->frame + 1;
1074    } else if (segment->frame > 0)	{
1075       *start = CODON_LENGTH*(segment->offset) + segment->frame - 1;
1076       *end = CODON_LENGTH*segment->end + segment->frame - 2;
1077    } else {
1078       *start = segment->offset + 1;
1079       *end = segment->end;
1080    }
1081 }
1082 
1083 void
Blast_HSPGetAdjustedOffsets(EBlastProgramType program,BlastHSP * hsp,Int4 query_length,Int4 subject_length,Int4 * q_start,Int4 * q_end,Int4 * s_start,Int4 * s_end)1084 Blast_HSPGetAdjustedOffsets(EBlastProgramType program, BlastHSP* hsp,
1085                             Int4 query_length, Int4 subject_length,
1086                             Int4* q_start, Int4* q_end,
1087                             Int4* s_start, Int4* s_end)
1088 {
1089    if (!hsp->gap_info) {
1090       *q_start = hsp->query.offset + 1;
1091       *q_end = hsp->query.end;
1092       *s_start = hsp->subject.offset + 1;
1093       *s_end = hsp->subject.end;
1094       return;
1095    }
1096 
1097    if (!Blast_QueryIsTranslated(program) &&
1098        !Blast_SubjectIsTranslated(program)) {
1099       if (hsp->query.frame != hsp->subject.frame) {
1100          /* Blastn: if different strands, flip offsets in query; leave
1101             offsets in subject as they are, but change order for correct
1102             correspondence. */
1103          *q_end = query_length - hsp->query.offset;
1104          *q_start = *q_end - hsp->query.end + hsp->query.offset + 1;
1105          *s_end = hsp->subject.offset + 1;
1106          *s_start = hsp->subject.end;
1107       } else {
1108          *q_start = hsp->query.offset + 1;
1109          *q_end = hsp->query.end;
1110          *s_start = hsp->subject.offset + 1;
1111          *s_end = hsp->subject.end;
1112       }
1113    } else {
1114       s_BlastSegGetTranslatedOffsets(&hsp->query, query_length, q_start, q_end);
1115       s_BlastSegGetTranslatedOffsets(&hsp->subject, subject_length,
1116                                     q_start, s_end);
1117    }
1118 }
1119 
1120 
1121 const Uint1*
Blast_HSPGetTargetTranslation(SBlastTargetTranslation * target_t,const BlastHSP * hsp,Int4 * translated_length)1122 Blast_HSPGetTargetTranslation(SBlastTargetTranslation* target_t,
1123                               const BlastHSP* hsp, Int4* translated_length)
1124 {
1125     Int4 context = -1;
1126     Int4 start, stop;
1127 
1128     ASSERT(target_t != NULL);
1129 
1130     if (hsp == NULL)
1131        return NULL;
1132 
1133     context = BLAST_FrameToContext(hsp->subject.frame, target_t->program_number);
1134     start = target_t->range[2*context];
1135     stop = target_t->range[2*context+1];
1136 
1137     /* skip translation if full translation has already been done */
1138     if (target_t->partial && (start ||
1139         (stop < target_t->subject_blk->length / CODON_LENGTH -3)))
1140     {
1141     	 const int kMaxTranslation = 99; /* Needs to be divisible by three (?) */
1142          Int4 nucl_length = 0;
1143          Int4 translation_length = 0;
1144          Int4 nucl_start = 0;
1145          Int4 nucl_end = 0;
1146     	 Int4 nucl_shift = 0;
1147     	 Int4 start_shift = 0;
1148 
1149          /* HSP coordinates are in terms of protein sequences. */
1150          if (hsp->subject.offset < 0) {
1151              nucl_start = 0;
1152              nucl_end = target_t->subject_blk->length;
1153          } else {
1154              nucl_start = MAX(0, 3*hsp->subject.offset - kMaxTranslation);
1155              nucl_end = MIN(target_t->subject_blk->length, 3*hsp->subject.end + kMaxTranslation);
1156              /* extend to the end of the sequence if close */
1157              if (target_t->subject_blk->length - nucl_end <= 21) {
1158                  nucl_end = target_t->subject_blk->length;
1159              }
1160          }
1161 
1162          nucl_length = nucl_end - nucl_start;
1163 
1164          translation_length = 1+nucl_length/CODON_LENGTH;
1165          start_shift = nucl_start/CODON_LENGTH;
1166 
1167          if (hsp->subject.frame < 0)
1168              nucl_shift = target_t->subject_blk->length - nucl_start - nucl_length;
1169          else
1170              nucl_shift = nucl_start;
1171 
1172          if (start_shift < start || start_shift+translation_length > stop) {
1173                /* needs re-translation */
1174                Int4 length = 0; /* actual translation length */
1175                Uint1* nucl_seq = target_t->subject_blk->sequence + nucl_shift;
1176                Uint1* nucl_seq_rev = NULL;
1177 
1178                target_t->range[2*context] = start_shift;
1179 
1180                if (translation_length > stop-start) {
1181                    /* needs re-allocation */
1182                    sfree(target_t->translations[context]);
1183                    target_t->translations[context] = (Uint1*) malloc(translation_length+2);
1184                }
1185 
1186                if (hsp->subject.frame < 0) {
1187                    /* needs reverse sequence */
1188                    GetReverseNuclSequence(nucl_seq, nucl_length, &nucl_seq_rev);
1189                }
1190 
1191                length = BLAST_GetTranslation(nucl_seq, nucl_seq_rev,
1192                        nucl_length, hsp->subject.frame, target_t->translations[context],
1193                        target_t->gen_code_string);
1194 
1195                target_t->range[2*context+1] = start_shift + length;
1196 
1197                sfree(nucl_seq_rev);
1198 
1199                /* partial translation needs to be fenced */
1200                if(hsp->subject.offset >= 0) {
1201                    target_t->translations[context][0] = FENCE_SENTRY;
1202                    target_t->translations[context][length+1] = FENCE_SENTRY;
1203                }
1204          }
1205     }
1206     if (translated_length)
1207         *translated_length = target_t->range[2*context+1];
1208 
1209     /* +1 as the first byte is a sentinel. */
1210     return target_t->translations[context] - target_t->range[2*context] + 1;
1211 }
1212 
1213 Int2
Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk * subject_blk,BlastHSP * hsp,Boolean is_ooframe,const Uint1 * gen_code_string,Uint1 ** translation_buffer_ptr,Uint1 ** subject_ptr,Int4 * subject_length_ptr,Int4 * start_shift_ptr)1214 Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk* subject_blk,
1215                                       BlastHSP* hsp,
1216                                       Boolean is_ooframe,
1217                                       const Uint1* gen_code_string,
1218                                       Uint1** translation_buffer_ptr,
1219                                       Uint1** subject_ptr,
1220                                       Int4* subject_length_ptr,
1221                                       Int4* start_shift_ptr)
1222 {
1223    Int4 translation_length;
1224    Uint1* translation_buffer;
1225    Uint1* subject;
1226    Int4 start_shift;
1227    Int4 nucl_shift;
1228    Int2 status = 0;
1229 
1230    ASSERT(subject_blk && hsp && gen_code_string && translation_buffer_ptr &&
1231           subject_ptr && subject_length_ptr && start_shift_ptr);
1232 
1233    translation_buffer = *translation_buffer_ptr;
1234    sfree(translation_buffer);
1235 
1236    if (!is_ooframe) {
1237       start_shift =
1238          MAX(0, 3*hsp->subject.offset - MAX_FULL_TRANSLATION);
1239       translation_length =
1240          MIN(3*hsp->subject.end + MAX_FULL_TRANSLATION,
1241              subject_blk->length) - start_shift;
1242       if (hsp->subject.frame > 0) {
1243          nucl_shift = start_shift;
1244       } else {
1245          nucl_shift = subject_blk->length - start_shift - translation_length;
1246       }
1247       status = (Int2)
1248          Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1249                                      translation_length, hsp->subject.frame,
1250                                      gen_code_string, &translation_buffer,
1251                                      subject_length_ptr, NULL);
1252       /* Below, the start_shift will be used for the protein
1253          coordinates, so need to divide it by 3 */
1254       start_shift /= CODON_LENGTH;
1255    } else {
1256       Int4 oof_end;
1257       oof_end = subject_blk->length;
1258 
1259       start_shift =
1260          MAX(0, hsp->subject.offset - MAX_FULL_TRANSLATION);
1261       translation_length =
1262          MIN(hsp->subject.end + MAX_FULL_TRANSLATION, oof_end) - start_shift;
1263       if (hsp->subject.frame > 0) {
1264          nucl_shift = start_shift;
1265       } else {
1266          nucl_shift = oof_end - start_shift - translation_length;
1267       }
1268       status = (Int2)
1269          Blast_GetPartialTranslation(subject_blk->sequence_start+nucl_shift,
1270                                      translation_length, hsp->subject.frame,
1271                                      gen_code_string, NULL,
1272                                      subject_length_ptr, &translation_buffer);
1273    }
1274    hsp->subject.offset -= start_shift;
1275    hsp->subject.end -= start_shift;
1276    hsp->subject.gapped_start -= start_shift;
1277    *translation_buffer_ptr = translation_buffer;
1278    *start_shift_ptr = start_shift;
1279 
1280    if (!is_ooframe) {
1281       subject = translation_buffer + 1;
1282    } else {
1283       subject = translation_buffer + CODON_LENGTH;
1284    }
1285    *subject_ptr = subject;
1286 
1287    return status;
1288 }
1289 
1290 void
Blast_HSPAdjustSubjectOffset(BlastHSP * hsp,Int4 start_shift)1291 Blast_HSPAdjustSubjectOffset(BlastHSP* hsp, Int4 start_shift)
1292 {
1293     /* Adjust subject offsets if shifted (partial) sequence was used
1294        for extension */
1295     if (start_shift > 0) {
1296         hsp->subject.offset += start_shift;
1297         hsp->subject.end += start_shift;
1298         hsp->subject.gapped_start += start_shift;
1299     }
1300 
1301     return;
1302 }
1303 
1304 int
ScoreCompareHSPs(const void * h1,const void * h2)1305 ScoreCompareHSPs(const void* h1, const void* h2)
1306 {
1307    BlastHSP* hsp1,* hsp2;   /* the HSPs to be compared */
1308    int result = 0;      /* the result of the comparison */
1309 
1310    hsp1 = *((BlastHSP**) h1);
1311    hsp2 = *((BlastHSP**) h2);
1312 
1313    /* Null HSPs are "greater" than any non-null ones, so they go to the end
1314       of a sorted list. */
1315    if (!hsp1 && !hsp2)
1316        return 0;
1317    else if (!hsp1)
1318        return 1;
1319    else if (!hsp2)
1320        return -1;
1321 
1322    if (0 == (result = BLAST_CMP(hsp2->score,          hsp1->score)) &&
1323        0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
1324        0 == (result = BLAST_CMP(hsp2->subject.end,    hsp1->subject.end)) &&
1325        0 == (result = BLAST_CMP(hsp1->query  .offset, hsp2->query  .offset))) {
1326        /* if all other test can't distinguish the HSPs, then the final
1327           test is the result */
1328        result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
1329    }
1330    return result;
1331 }
1332 
Blast_HSPListIsSortedByScore(const BlastHSPList * hsp_list)1333 Boolean Blast_HSPListIsSortedByScore(const BlastHSPList* hsp_list)
1334 {
1335     Int4 index;
1336 
1337     if (!hsp_list || hsp_list->hspcnt <= 1)
1338         return TRUE;
1339 
1340     for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1341         if (ScoreCompareHSPs(&hsp_list->hsp_array[index],
1342                              &hsp_list->hsp_array[index+1]) > 0) {
1343             return FALSE;
1344         }
1345     }
1346     return TRUE;
1347 }
1348 
Blast_HSPListSortByScore(BlastHSPList * hsp_list)1349 void Blast_HSPListSortByScore(BlastHSPList* hsp_list)
1350 {
1351     if (!hsp_list || hsp_list->hspcnt <= 1)
1352         return;
1353 
1354     if (!Blast_HSPListIsSortedByScore(hsp_list)) {
1355         qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1356               ScoreCompareHSPs);
1357     }
1358 }
1359 
1360 /** Compares 2 evalues, consider them equal if both are close enough to zero.
1361  * @param evalue1 First evalue [in]
1362  * @param evalue2 Second evalue [in]
1363  */
1364 static int
s_EvalueComp(double evalue1,double evalue2)1365 s_EvalueComp(double evalue1, double evalue2)
1366 {
1367     const double epsilon = 1.0e-180;
1368     if (evalue1 < epsilon && evalue2 < epsilon) {
1369         return 0;
1370     }
1371 
1372     if (evalue1 < evalue2) {
1373         return -1;
1374     } else if (evalue1 > evalue2) {
1375         return 1;
1376     } else {
1377         return 0;
1378     }
1379 }
1380 
1381 /** Comparison callback function for sorting HSPs by e-value and score, before
1382  * saving BlastHSPList in a BlastHitList. E-value has priority over score,
1383  * because lower scoring HSPs might have lower e-values, if they are linked
1384  * with sum statistics.
1385  * E-values are compared only up to a certain precision.
1386  * @param v1 Pointer to first HSP [in]
1387  * @param v2 Pointer to second HSP [in]
1388  */
1389 static int
s_EvalueCompareHSPs(const void * v1,const void * v2)1390 s_EvalueCompareHSPs(const void* v1, const void* v2)
1391 {
1392    BlastHSP* h1,* h2;
1393    int retval = 0;
1394 
1395    h1 = *((BlastHSP**) v1);
1396    h2 = *((BlastHSP**) v2);
1397 
1398    /* Check if one or both of these are null. Those HSPs should go to the end */
1399    if (!h1 && !h2)
1400       return 0;
1401    else if (!h1)
1402       return 1;
1403    else if (!h2)
1404       return -1;
1405 
1406    if ((retval = s_EvalueComp(h1->evalue, h2->evalue)) != 0)
1407       return retval;
1408 
1409    return ScoreCompareHSPs(v1, v2);
1410 }
1411 
Blast_HSPListSortByEvalue(BlastHSPList * hsp_list)1412 void Blast_HSPListSortByEvalue(BlastHSPList* hsp_list)
1413 {
1414     if (!hsp_list)
1415         return;
1416 
1417     if (hsp_list->hspcnt > 1) {
1418         Int4 index;
1419         BlastHSP** hsp_array = hsp_list->hsp_array;
1420         /* First check if HSP array is already sorted. */
1421         for (index = 0; index < hsp_list->hspcnt - 1; ++index) {
1422             if (s_EvalueCompareHSPs(&hsp_array[index], &hsp_array[index+1]) > 0) {
1423                 break;
1424             }
1425         }
1426         /* Sort the HSP array if it is not sorted yet. */
1427         if (index < hsp_list->hspcnt - 1) {
1428             qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP*),
1429                   s_EvalueCompareHSPs);
1430         }
1431     }
1432 }
1433 
1434 
1435 /** Retrieve the starting diagonal of an HSP
1436  * @param hsp The target HSP
1437  * @return The starting diagonal
1438  */
1439 static Int4
s_HSPStartDiag(const BlastHSP * hsp)1440 s_HSPStartDiag(const BlastHSP *hsp)
1441 {
1442     return hsp->query.offset - hsp->subject.offset;
1443 }
1444 
1445 /** Retrieve the ending diagonal of an HSP
1446  * @param hsp The target HSP
1447  * @return The ending diagonal
1448  */
1449 static Int4
s_HSPEndDiag(const BlastHSP * hsp)1450 s_HSPEndDiag(const BlastHSP *hsp)
1451 {
1452     return hsp->query.end - hsp->subject.end;
1453 }
1454 
1455 /** Given two hits, check if the hits can be merged and do
1456  * the merge if so. Hits must not contain traceback
1457  * @param hsp1 The first hit. If merging happens, this hit is
1458  *             overwritten with the merged version [in][out]
1459  * @param hsp2 The second hit [in]
1460  * @return TRUE if a merge was performed, FALSE if not
1461  */
1462 static Boolean
s_BlastMergeTwoHSPs(BlastHSP * hsp1,BlastHSP * hsp2,Boolean allow_gap)1463 s_BlastMergeTwoHSPs(BlastHSP* hsp1, BlastHSP* hsp2, Boolean allow_gap)
1464 {
1465    ASSERT(!hsp1->gap_info || !hsp2->gap_info);
1466 
1467    /* do not merge off-diagonal hsps for ungapped search */
1468    if (!allow_gap &&
1469        hsp1->subject.offset - hsp2->subject.offset -hsp1->query.offset + hsp2->query.offset)
1470    {
1471        return FALSE;
1472    }
1473 
1474    if(hsp1->subject.frame != hsp2->subject.frame)
1475 	   return FALSE;
1476 
1477    /* combine the boundaries of the two HSPs,
1478       assuming they intersect at all */
1479    if (CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1480                         hsp2->query.offset,
1481                         hsp1->subject.offset, hsp1->subject.end,
1482                         hsp2->subject.offset) ||
1483        CONTAINED_IN_HSP(hsp1->query.offset, hsp1->query.end,
1484                         hsp2->query.end,
1485                         hsp1->subject.offset, hsp1->subject.end,
1486                         hsp2->subject.end)) {
1487 
1488 	  double score_density =  (hsp1->score + hsp2->score) *(1.0) /
1489 			                  ((hsp1->query.end - hsp1->query.offset) +
1490 			                   (hsp2->query.end - hsp2->query.offset));
1491       hsp1->query.offset = MIN(hsp1->query.offset, hsp2->query.offset);
1492       hsp1->subject.offset = MIN(hsp1->subject.offset, hsp2->subject.offset);
1493       hsp1->query.end = MAX(hsp1->query.end, hsp2->query.end);
1494       hsp1->subject.end = MAX(hsp1->subject.end, hsp2->subject.end);
1495       if (hsp2->score > hsp1->score) {
1496           hsp1->query.gapped_start = hsp2->query.gapped_start;
1497           hsp1->subject.gapped_start = hsp2->subject.gapped_start;
1498 	  hsp1->score = hsp2->score;
1499       }
1500 
1501       hsp1->score = MAX((int) (score_density *(hsp1->query.end - hsp1->query.offset)), hsp1->score);
1502       return TRUE;
1503    }
1504 
1505    return FALSE;
1506 }
1507 
1508 /** Maximal diagonal distance between HSP starting offsets, within which HSPs
1509  * from search of different chunks of subject sequence are considered for
1510  * merging.
1511  */
1512 #define OVERLAP_DIAG_CLOSE 10
1513 /********************************************************************************
1514           Functions manipulating BlastHSPList's
1515 ********************************************************************************/
1516 
Blast_HSPListFree(BlastHSPList * hsp_list)1517 BlastHSPList* Blast_HSPListFree(BlastHSPList* hsp_list)
1518 {
1519    Int4 index;
1520 
1521    if (!hsp_list)
1522       return hsp_list;
1523 
1524    for (index = 0; index < hsp_list->hspcnt; ++index) {
1525       Blast_HSPFree(hsp_list->hsp_array[index]);
1526    }
1527    sfree(hsp_list->hsp_array);
1528 
1529    sfree(hsp_list);
1530    return NULL;
1531 }
1532 
Blast_HSPListNew(Int4 hsp_max)1533 BlastHSPList* Blast_HSPListNew(Int4 hsp_max)
1534 {
1535    BlastHSPList* hsp_list = (BlastHSPList*) calloc(1, sizeof(BlastHSPList));
1536    const Int4 kDefaultAllocated=100;
1537 
1538    /* hsp_max is max number of HSP's allowed in an HSP list;
1539       INT4_MAX taken as infinity. */
1540    hsp_list->hsp_max = INT4_MAX;
1541    if (hsp_max > 0)
1542       hsp_list->hsp_max = hsp_max;
1543 
1544    hsp_list->allocated = MIN(kDefaultAllocated, hsp_list->hsp_max);
1545 
1546    hsp_list->hsp_array = (BlastHSP**)
1547       calloc(hsp_list->allocated, sizeof(BlastHSP*));
1548 
1549    return hsp_list;
1550 }
1551 
1552 Boolean
Blast_HSPList_IsEmpty(const BlastHSPList * hsp_list)1553 Blast_HSPList_IsEmpty(const BlastHSPList* hsp_list)
1554 {
1555     return (hsp_list && hsp_list->hspcnt == 0) ? TRUE : FALSE;
1556 }
1557 
BlastHSPListDup(const BlastHSPList * hsp_list)1558 BlastHSPList* BlastHSPListDup(const BlastHSPList* hsp_list)
1559 {
1560     BlastHSPList * rv = 0;
1561 
1562     if (hsp_list) {
1563         int index = 0;
1564         int num = hsp_list->hspcnt;
1565 
1566         rv = malloc(sizeof(BlastHSPList));
1567         *rv = *hsp_list;
1568 
1569         if (num) {
1570             rv->hsp_array = malloc(sizeof(BlastHSP*) * num);
1571 
1572             for(index = 0; index < hsp_list->hspcnt; ++index) {
1573                 BlastHSP * h = hsp_list->hsp_array[index];
1574                 BlastHSP ** h2 = & rv->hsp_array[index];
1575 
1576                 if (h) {
1577                     *h2 = malloc(sizeof(BlastHSP));
1578                     **h2 = *h;
1579                 } else {
1580                     *h2 = 0;
1581                 }
1582             }
1583         }
1584     }
1585 
1586     return rv;
1587 }
1588 
Blast_HSPListSwap(BlastHSPList * list1,BlastHSPList * list2)1589 void Blast_HSPListSwap(BlastHSPList* list1, BlastHSPList* list2)
1590 {
1591     BlastHSPList tmp;
1592 
1593     tmp = *list1;
1594     *list1 = *list2;
1595     *list2 = tmp;
1596 }
1597 
1598 /** This is a copy of a static function from ncbimisc.c.
1599  * Turns array into a heap with respect to a given comparison function.
1600  */
1601 static void
s_Heapify(char * base0,char * base,char * lim,char * last,size_t width,int (* compar)(const void *,const void *))1602 s_Heapify (char* base0, char* base, char* lim, char* last, size_t width, int (*compar )(const void*, const void* ))
1603 {
1604    size_t i;
1605    char   ch;
1606    char* left_son,* large_son;
1607 
1608    left_son = base0 + 2*(base-base0) + width;
1609    while (base <= lim) {
1610       if (left_son == last)
1611          large_son = left_son;
1612       else
1613          large_son = (*compar)(left_son, left_son+width) >= 0 ?
1614             left_son : left_son+width;
1615       if ((*compar)(base, large_son) < 0) {
1616          for (i=0; i<width; ++i) {
1617             ch = base[i];
1618             base[i] = large_son[i];
1619             large_son[i] = ch;
1620          }
1621          base = large_son;
1622          left_son = base0 + 2*(base-base0) + width;
1623       } else
1624          break;
1625    }
1626 }
1627 
1628 /** Creates a heap of elements based on a comparison function.
1629  * @param b An array [in] [out]
1630  * @param nel Number of elements in b [in]
1631  * @param width The size of each element [in]
1632  * @param compar Callback to compare two heap elements [in]
1633  */
1634 static void
s_CreateHeap(void * b,size_t nel,size_t width,int (* compar)(const void *,const void *))1635 s_CreateHeap (void* b, size_t nel, size_t width,
1636    int (*compar )(const void*, const void* ))
1637 {
1638    char*    base = (char*)b;
1639    size_t i;
1640    char*    base0 = (char*)base,* lim,* basef;
1641 
1642    if (nel < 2)
1643       return;
1644 
1645    lim = &base[((nel-2)/2)*width];
1646    basef = &base[(nel-1)*width];
1647    i = nel/2;
1648    for (base = &base0[(i - 1)*width]; i > 0; base = base - width) {
1649       s_Heapify(base0, base, lim, basef, width, compar);
1650       i--;
1651    }
1652 }
1653 
1654 /** Given a BlastHSPList* with a heapified HSP array, check whether the
1655  * new HSP is better than the worst scoring.  If it is, then remove the
1656  * worst scoring and insert, otherwise free the new one.
1657  * HSP and insert the new HSP in the heap.
1658  * @param hsp_list Contains all HSPs for a given subject. [in] [out]
1659  * @param hsp A pointer to new HSP to be inserted into the HSP list [in] [out]
1660  */
1661 static void
s_BlastHSPListInsertHSPInHeap(BlastHSPList * hsp_list,BlastHSP ** hsp)1662 s_BlastHSPListInsertHSPInHeap(BlastHSPList* hsp_list,
1663                              BlastHSP** hsp)
1664 {
1665     BlastHSP** hsp_array = hsp_list->hsp_array;
1666     if (ScoreCompareHSPs(hsp, &hsp_array[0]) > 0)
1667     {
1668          Blast_HSPFree(*hsp);
1669          return;
1670     }
1671     else
1672          Blast_HSPFree(hsp_array[0]);
1673 
1674     hsp_array[0] = *hsp;
1675     if (hsp_list->hspcnt >= 2) {
1676         s_Heapify((char*)hsp_array, (char*)hsp_array,
1677                 (char*)&hsp_array[hsp_list->hspcnt/2 - 1],
1678                  (char*)&hsp_array[hsp_list->hspcnt-1],
1679                  sizeof(BlastHSP*), ScoreCompareHSPs);
1680     }
1681 }
1682 
1683 #ifndef NDEBUG
1684 /** Verifies that the best_evalue field on the BlastHSPList is correct.
1685  * @param hsp_list object to check [in]
1686  * @return TRUE if OK, FALSE otherwise.
1687  */
1688 static Boolean
s_BlastCheckBestEvalue(const BlastHSPList * hsp_list)1689 s_BlastCheckBestEvalue(const BlastHSPList* hsp_list)
1690 {
1691     int index = 0;
1692     double best_evalue = (double) INT4_MAX;
1693     const double kDelta = 1.0e-200;
1694 
1695     /* There are no HSP's here. */
1696     if (hsp_list->hspcnt == 0)
1697        return TRUE;
1698 
1699     for (index=0; index<hsp_list->hspcnt; index++)
1700        best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1701 
1702     /* check that it's within 1%. */
1703     if (ABS(best_evalue-hsp_list->best_evalue)/(best_evalue+kDelta) > 0.01)
1704        return FALSE;
1705 
1706     return TRUE;
1707 }
1708 #endif /* _DEBUG */
1709 
1710 /** Gets the best (lowest) evalue from the BlastHSPList.
1711  * @param hsp_list object containing the evalues [in]
1712  * @return TRUE if OK, FALSE otherwise.
1713  */
1714 static double
s_BlastGetBestEvalue(const BlastHSPList * hsp_list)1715 s_BlastGetBestEvalue(const BlastHSPList* hsp_list)
1716 {
1717     int index = 0;
1718     double best_evalue = (double) INT4_MAX;
1719 
1720     for (index=0; index<hsp_list->hspcnt; index++)
1721        best_evalue = MIN(hsp_list->hsp_array[index]->evalue, best_evalue);
1722 
1723     return best_evalue;
1724 }
1725 
1726 /* Comments in blast_hits.h
1727  */
1728 Int2
Blast_HSPListSaveHSP(BlastHSPList * hsp_list,BlastHSP * new_hsp)1729 Blast_HSPListSaveHSP(BlastHSPList* hsp_list, BlastHSP* new_hsp)
1730 {
1731    BlastHSP** hsp_array;
1732    Int4 hspcnt;
1733    Int4 hsp_allocated; /* how many hsps are in the array. */
1734    Int2 status = 0;
1735 
1736    hspcnt = hsp_list->hspcnt;
1737    hsp_allocated = hsp_list->allocated;
1738    hsp_array = hsp_list->hsp_array;
1739 
1740 
1741    /* Check if list is already full, then reallocate. */
1742    if (hspcnt >= hsp_allocated && hsp_list->do_not_reallocate == FALSE)
1743    {
1744       Int4 new_allocated = MIN(2*hsp_list->allocated, hsp_list->hsp_max);
1745       if (new_allocated > hsp_list->allocated) {
1746          hsp_array = (BlastHSP**)
1747             realloc(hsp_array, new_allocated*sizeof(BlastHSP*));
1748          if (hsp_array == NULL)
1749          {
1750             hsp_list->do_not_reallocate = TRUE;
1751             hsp_array = hsp_list->hsp_array;
1752             /** Return a non-zero status, because restriction on number
1753                 of HSPs here is a result of memory allocation failure. */
1754             status = -1;
1755          } else {
1756             hsp_list->hsp_array = hsp_array;
1757             hsp_list->allocated = new_allocated;
1758             hsp_allocated = new_allocated;
1759          }
1760       } else {
1761          hsp_list->do_not_reallocate = TRUE;
1762       }
1763       /* If it is the first time when the HSP array is filled to capacity,
1764          create a heap now. */
1765       if (hsp_list->do_not_reallocate) {
1766           s_CreateHeap(hsp_array, hspcnt, sizeof(BlastHSP*), ScoreCompareHSPs);
1767       }
1768    }
1769 
1770    /* If there is space in the allocated HSP array, simply save the new HSP.
1771       Othewise, if the new HSP has lower score than the worst HSP in the heap,
1772       then delete it, else insert it in the heap. */
1773    if (hspcnt < hsp_allocated)
1774    {
1775       hsp_array[hsp_list->hspcnt] = new_hsp;
1776       (hsp_list->hspcnt)++;
1777       return status;
1778    } else {
1779        /* Insert the new HSP in heap. */
1780        s_BlastHSPListInsertHSPInHeap(hsp_list, &new_hsp);
1781    }
1782 
1783    return status;
1784 }
1785 
Blast_HSPListGetEvalues(EBlastProgramType program_number,const BlastQueryInfo * query_info,Int4 subject_length,BlastHSPList * hsp_list,Boolean gapped_calculation,Boolean RPS_prelim,const BlastScoreBlk * sbp,double gap_decay_rate,double scaling_factor)1786 Int2 Blast_HSPListGetEvalues(EBlastProgramType program_number,
1787                              const BlastQueryInfo* query_info,
1788                              Int4 subject_length,
1789                              BlastHSPList* hsp_list,
1790                              Boolean gapped_calculation,
1791                              Boolean RPS_prelim,
1792                              const BlastScoreBlk* sbp, double gap_decay_rate,
1793                              double scaling_factor)
1794 {
1795    BlastHSP* hsp;
1796    BlastHSP** hsp_array;
1797    Blast_KarlinBlk** kbp;
1798    Int4 hsp_cnt;
1799    Int4 index;
1800    Int4 kbp_context;
1801    double gap_decay_divisor = 1.;
1802    Boolean isRPS = Blast_ProgramIsRpsBlast(program_number);
1803 
1804    if (hsp_list == NULL || hsp_list->hspcnt == 0)
1805       return 0;
1806 
1807    kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1808    hsp_cnt = hsp_list->hspcnt;
1809    hsp_array = hsp_list->hsp_array;
1810 
1811    if (gap_decay_rate != 0.)
1812       gap_decay_divisor = BLAST_GapDecayDivisor(gap_decay_rate, 1);
1813 
1814    for (index=0; index<hsp_cnt; index++) {
1815       hsp = hsp_array[index];
1816 
1817       ASSERT(hsp != NULL);
1818       ASSERT(scaling_factor != 0.0);
1819       ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1820 
1821       /* Divide Lambda by the scaling factor, so e-value is
1822          calculated correctly from a scaled score. This is needed only
1823          for RPS BLAST, where scores are scaled, but Lambda is not. */
1824       kbp_context = hsp->context;
1825       if (RPS_prelim) {
1826           /* All kbp in preliminary stage are equivalent.  However, some
1827              may be invalid.  Search for the first populated kbp */
1828           int i;
1829           for (i=0; i < sbp->number_of_contexts; ++i) {
1830               if (kbp[i]) break;
1831           }
1832           kbp_context = i;
1833       }
1834       ASSERT(kbp[kbp_context]);
1835       kbp[kbp_context]->Lambda /= scaling_factor;
1836 
1837       if (sbp->gbp) {
1838           /* Only try Spouge's method if gumbel parameters are available */
1839           if (!isRPS) {
1840               hsp->evalue =
1841                   BLAST_SpougeStoE(hsp->score, kbp[kbp_context], sbp->gbp,
1842                                query_info->contexts[hsp->context].query_length,
1843                                subject_length);
1844           } else {
1845               /* for RPS blast, query and subject is swapped */
1846               hsp->evalue =
1847                   BLAST_SpougeStoE(hsp->score, kbp[kbp_context], sbp->gbp,
1848                                subject_length,
1849                                query_info->contexts[hsp->context].query_length);
1850           }
1851       } else {
1852           /* Get effective search space from the query information block */
1853           hsp->evalue =
1854               BLAST_KarlinStoE_simple(hsp->score, kbp[kbp_context],
1855                                query_info->contexts[hsp->context].eff_searchsp);
1856       }
1857 
1858       hsp->evalue /= gap_decay_divisor;
1859       /* Put back the unscaled value of Lambda. */
1860       kbp[kbp_context]->Lambda *= scaling_factor;
1861    }
1862 
1863    /* Assign the best e-value field. Here the best e-value will always be
1864       attained for the first HSP in the list. Check that the incoming
1865       HSP list is properly sorted by score. */
1866    ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1867    hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1868 
1869    return 0;
1870 }
1871 
Blast_HSPListGetBitScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)1872 Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list,
1873                                Boolean gapped_calculation,
1874                                const BlastScoreBlk* sbp)
1875 {
1876    BlastHSP* hsp;
1877    Blast_KarlinBlk** kbp;
1878    Int4 index;
1879 
1880    if (hsp_list == NULL)
1881       return 1;
1882 
1883    kbp = (gapped_calculation ? sbp->kbp_gap : sbp->kbp);
1884 
1885    for (index=0; index<hsp_list->hspcnt; index++) {
1886       hsp = hsp_list->hsp_array[index];
1887       ASSERT(hsp != NULL);
1888       ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
1889       hsp->bit_score =
1890          (hsp->score*kbp[hsp->context]->Lambda - kbp[hsp->context]->logK) /
1891          NCBIMATH_LN2;
1892    }
1893 
1894    return 0;
1895 }
1896 
Blast_HSPListPHIGetBitScores(BlastHSPList * hsp_list,BlastScoreBlk * sbp)1897 void Blast_HSPListPHIGetBitScores(BlastHSPList* hsp_list, BlastScoreBlk* sbp)
1898 {
1899     Int4 index;
1900 
1901     double lambda, logC;
1902 
1903     ASSERT(sbp && sbp->kbp_gap && sbp->kbp_gap[0]);
1904 
1905     lambda = sbp->kbp_gap[0]->Lambda;
1906     logC = log(sbp->kbp_gap[0]->paramC);
1907 
1908     for (index=0; index<hsp_list->hspcnt; index++) {
1909         BlastHSP* hsp = hsp_list->hsp_array[index];
1910         ASSERT(hsp != NULL);
1911         hsp->bit_score =
1912             (hsp->score*lambda - logC - log(1.0 + hsp->score*lambda))
1913             / NCBIMATH_LN2;
1914     }
1915 }
1916 
1917 void
Blast_HSPListPHIGetEvalues(BlastHSPList * hsp_list,BlastScoreBlk * sbp,const BlastQueryInfo * query_info,const SPHIPatternSearchBlk * pattern_blk)1918 Blast_HSPListPHIGetEvalues(BlastHSPList* hsp_list, BlastScoreBlk* sbp,
1919                            const BlastQueryInfo* query_info,
1920                            const SPHIPatternSearchBlk* pattern_blk)
1921 {
1922    Int4 index;
1923    BlastHSP* hsp;
1924 
1925    if (!hsp_list || hsp_list->hspcnt == 0)
1926        return;
1927 
1928    for (index = 0; index < hsp_list->hspcnt; ++index) {
1929       hsp = hsp_list->hsp_array[index];
1930       s_HSPPHIGetEvalue(hsp, sbp, query_info, pattern_blk);
1931    }
1932    /* The best e-value is the one for the highest scoring HSP, which
1933       must be the first in the list. Check that HSPs are sorted by score
1934       to make sure this assumption is correct. */
1935    ASSERT(Blast_HSPListIsSortedByScore(hsp_list));
1936    hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
1937 }
1938 
Blast_HSPListReapByEvalue(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)1939 Int2 Blast_HSPListReapByEvalue(BlastHSPList* hsp_list,
1940         const BlastHitSavingOptions* hit_options)
1941 {
1942    BlastHSP* hsp;
1943    BlastHSP** hsp_array;
1944    Int4 hsp_cnt = 0;
1945    Int4 index;
1946    double cutoff;
1947 
1948    if (hsp_list == NULL)
1949       return 0;
1950 
1951    cutoff = hit_options->expect_value;
1952 
1953    hsp_array = hsp_list->hsp_array;
1954    for (index = 0; index < hsp_list->hspcnt; index++) {
1955       hsp = hsp_array[index];
1956 
1957       ASSERT(hsp != NULL);
1958 
1959       if (hsp->evalue > cutoff) {
1960          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1961       } else {
1962          if (index > hsp_cnt)
1963             hsp_array[hsp_cnt] = hsp_array[index];
1964          hsp_cnt++;
1965       }
1966    }
1967 
1968    hsp_list->hspcnt = hsp_cnt;
1969 
1970    return 0;
1971 }
1972 
Blast_HSPListReapByQueryCoverage(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options,const BlastQueryInfo * query_info,EBlastProgramType program_number)1973 Int2 Blast_HSPListReapByQueryCoverage(BlastHSPList* hsp_list,
1974                                       const BlastHitSavingOptions* hit_options,
1975                                       const BlastQueryInfo* query_info,
1976                                       EBlastProgramType program_number)
1977 
1978 {
1979    BlastHSP* hsp;
1980    BlastHSP** hsp_array;
1981    Int4 hsp_cnt = 0;
1982    Int4 index;
1983 
1984    if ((hsp_list == NULL) || (hsp_list->hspcnt == 0) || (hit_options->query_cov_hsp_perc == 0))
1985       return 0;
1986 
1987    hsp_array = hsp_list->hsp_array;
1988    for (index = 0; index < hsp_list->hspcnt; index++) {
1989       hsp = hsp_array[index];
1990       ASSERT(hsp != NULL);
1991       if ( Blast_HSPQueryCoverageTest(hsp, hit_options->query_cov_hsp_perc,
1992 			                          query_info->contexts[hsp->context].query_length)) {
1993          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
1994       } else {
1995          if (index > hsp_cnt)
1996             hsp_array[hsp_cnt] = hsp_array[index];
1997          hsp_cnt++;
1998       }
1999    }
2000 
2001    hsp_list->hspcnt = hsp_cnt;
2002 
2003    return 0;
2004 }
2005 
Blast_TrimHSPListByMaxHsps(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2006 Int2 Blast_TrimHSPListByMaxHsps(BlastHSPList* hsp_list,
2007                                 const BlastHitSavingOptions* hit_options)
2008 {
2009    BlastHSP** hsp_array;
2010    Int4 index;
2011    Int4 hsp_max;
2012 
2013    if ((hsp_list == NULL) ||
2014 	   (hit_options->max_hsps_per_subject == 0) ||
2015 	   (hsp_list->hspcnt <= hit_options->max_hsps_per_subject))
2016       return 0;
2017 
2018    hsp_max = hit_options->max_hsps_per_subject;
2019    hsp_array = hsp_list->hsp_array;
2020    for (index = hsp_max; index < hsp_list->hspcnt; index++) {
2021       hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2022    }
2023 
2024    hsp_list->hspcnt = hsp_max;
2025    return 0;
2026 }
2027 
2028 
2029 /** Same as Blast_HSPListReapByEvalue() except that it uses
2030  *  the raw score of the hit and the HitSavingOptions->cutoff_score
2031  *  to filter out hits. -RMH-
2032  */
Blast_HSPListReapByRawScore(BlastHSPList * hsp_list,const BlastHitSavingOptions * hit_options)2033 Int2 Blast_HSPListReapByRawScore(BlastHSPList* hsp_list,
2034         const BlastHitSavingOptions* hit_options)
2035 {
2036    BlastHSP* hsp;
2037    BlastHSP** hsp_array;
2038    Int4 hsp_cnt = 0;
2039    Int4 index;
2040 
2041    if (hsp_list == NULL)
2042       return 0;
2043 
2044    hsp_array = hsp_list->hsp_array;
2045    for (index = 0; index < hsp_list->hspcnt; index++) {
2046       hsp = hsp_array[index];
2047 
2048       ASSERT(hsp != NULL);
2049 
2050       if ( hsp->score < hit_options->cutoff_score ) {
2051          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2052       } else {
2053          if (index > hsp_cnt)
2054             hsp_array[hsp_cnt] = hsp_array[index];
2055          hsp_cnt++;
2056       }
2057    }
2058 
2059    hsp_list->hspcnt = hsp_cnt;
2060 
2061    return 0;
2062 }
2063 
2064 /** callback used to sort HSP lists in order of increasing OID
2065  * @param x First HSP list [in]
2066  * @param y Second HSP list [in]
2067  * @return compare result
2068  */
s_SortHSPListByOid(const void * x,const void * y)2069 static int s_SortHSPListByOid(const void *x, const void *y)
2070 {
2071     BlastHSPList **xx = (BlastHSPList **)x;
2072     BlastHSPList **yy = (BlastHSPList **)y;
2073     return (*xx)->oid - (*yy)->oid;
2074 }
2075 
Blast_HitListMerge(BlastHitList ** old_hit_list_ptr,BlastHitList ** combined_hit_list_ptr,Int4 contexts_per_query,Int4 * split_offsets,Int4 chunk_overlap_size,Boolean allow_gap)2076 Int2 Blast_HitListMerge(BlastHitList** old_hit_list_ptr,
2077                         BlastHitList** combined_hit_list_ptr,
2078                         Int4 contexts_per_query, Int4 *split_offsets,
2079                         Int4 chunk_overlap_size, Boolean allow_gap)
2080 {
2081     Int4 i, j;
2082     Boolean query_is_split;
2083     BlastHitList* hitlist1 = *old_hit_list_ptr;
2084     BlastHitList* hitlist2 = *combined_hit_list_ptr;
2085     BlastHitList* new_hitlist;
2086     Int4 num_hsplists1;
2087     Int4 num_hsplists2;
2088 
2089     if (hitlist1 == NULL)
2090         return 0;
2091     if (hitlist2 == NULL) {
2092         *combined_hit_list_ptr = hitlist1;
2093         *old_hit_list_ptr = NULL;
2094         return 0;
2095     }
2096     num_hsplists1 = hitlist1->hsplist_count;
2097     num_hsplists2 = hitlist2->hsplist_count;
2098     new_hitlist = Blast_HitListNew(hitlist1->hsplist_max);
2099 
2100     /* sort the lists of HSPs by oid */
2101 
2102     if (num_hsplists1 > 1) {
2103         qsort(hitlist1->hsplist_array, num_hsplists1,
2104               sizeof(BlastHSPList*), s_SortHSPListByOid);
2105     }
2106     if (num_hsplists2 > 1) {
2107         qsort(hitlist2->hsplist_array, num_hsplists2,
2108               sizeof(BlastHSPList*), s_SortHSPListByOid);
2109     }
2110 
2111     /* find out if the two hitlists contain hits for a single
2112        (split) query sequence */
2113 
2114     query_is_split = FALSE;
2115     for (i = 0; i < contexts_per_query; i++) {
2116         if (split_offsets[i] > 0) {
2117             query_is_split = TRUE;
2118             break;
2119         }
2120     }
2121     ASSERT(chunk_overlap_size != 0);
2122 
2123     /* merge the HSPlists of the two HitLists */
2124 
2125     i = j = 0;
2126     while (i < num_hsplists1 && j < num_hsplists2) {
2127         BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2128         BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2129 
2130         if (hsplist1->oid < hsplist2->oid) {
2131             Blast_HitListUpdate(new_hitlist, hsplist1);
2132             i++;
2133         }
2134         else if (hsplist1->oid > hsplist2->oid) {
2135             Blast_HitListUpdate(new_hitlist, hsplist2);
2136             j++;
2137         }
2138         else {
2139             /* the old and new Hitlists contain hits to the same
2140                DB sequence, and these must be merged. */
2141 
2142             if (query_is_split) {
2143                 Blast_HSPListsMerge(hitlist1->hsplist_array + i,
2144                                     hitlist2->hsplist_array + j,
2145                                     hsplist2->hsp_max, split_offsets,
2146                                     contexts_per_query,
2147                                     chunk_overlap_size,
2148                                     allow_gap, FALSE);
2149             }
2150             else {
2151                 Blast_HSPListAppend(hitlist1->hsplist_array + i,
2152                                     hitlist2->hsplist_array + j,
2153                                     hsplist2->hsp_max);
2154             }
2155             Blast_HitListUpdate(new_hitlist, hitlist2->hsplist_array[j]);
2156             i++;
2157             j++;
2158         }
2159     }
2160     for (; i < num_hsplists1; i++) {
2161         BlastHSPList* hsplist1 = hitlist1->hsplist_array[i];
2162         Blast_HitListUpdate(new_hitlist, hsplist1);
2163     }
2164     for (; j < num_hsplists2; j++) {
2165         BlastHSPList* hsplist2 = hitlist2->hsplist_array[j];
2166         Blast_HitListUpdate(new_hitlist, hsplist2);
2167     }
2168     hitlist1->hsplist_count = 0;
2169     Blast_HitListFree(hitlist1);
2170     hitlist2->hsplist_count = 0;
2171     Blast_HitListFree(hitlist2);
2172 
2173     *old_hit_list_ptr = NULL;
2174     *combined_hit_list_ptr = new_hitlist;
2175     return 0;
2176 }
2177 
2178 
2179 /* See description in blast_hits.h */
2180 
2181 Int2
Blast_HSPListPurgeNullHSPs(BlastHSPList * hsp_list)2182 Blast_HSPListPurgeNullHSPs(BlastHSPList* hsp_list)
2183 
2184 {
2185         Int4 index, index1; /* loop indices. */
2186         Int4 hspcnt; /* total number of HSP's to iterate over. */
2187         BlastHSP** hsp_array;  /* hsp_array to purge. */
2188 
2189 	if (hsp_list == NULL || hsp_list->hspcnt == 0)
2190 		return 0;
2191 
2192         hsp_array = hsp_list->hsp_array;
2193         hspcnt = hsp_list->hspcnt;
2194 
2195 	index1 = 0;
2196 	for (index=0; index<hspcnt; index++)
2197 	{
2198 		if (hsp_array[index] != NULL)
2199 		{
2200 			hsp_array[index1] = hsp_array[index];
2201 			index1++;
2202 		}
2203 	}
2204 
2205 	for (index=index1; index<hspcnt; index++)
2206 	{
2207 		hsp_array[index] = NULL;
2208 	}
2209 
2210 	hsp_list->hspcnt = index1;
2211 
2212         return 0;
2213 }
2214 
2215 /** Callback for sorting HSPs by starting offset in query. Sorting is by
2216  * increasing context, then increasing query start offset, then increasing
2217  * subject start offset, then decreasing score, then increasing query end
2218  * offset, then increasing subject end offset. Null HSPs are moved to the
2219  * end of the array.
2220  * @param v1 pointer to first HSP [in]
2221  * @param v2 pointer to second HSP [in]
2222  * @return Result of comparison.
2223  */
2224 static int
s_QueryOffsetCompareHSPs(const void * v1,const void * v2)2225 s_QueryOffsetCompareHSPs(const void* v1, const void* v2)
2226 {
2227    BlastHSP* h1,* h2;
2228    BlastHSP** hp1,** hp2;
2229 
2230    hp1 = (BlastHSP**) v1;
2231    hp2 = (BlastHSP**) v2;
2232    h1 = *hp1;
2233    h2 = *hp2;
2234 
2235    if (!h1 && !h2)
2236       return 0;
2237    else if (!h1)
2238       return 1;
2239    else if (!h2)
2240       return -1;
2241 
2242    /* If these are from different contexts, don't compare offsets */
2243    if (h1->context < h2->context)
2244       return -1;
2245    if (h1->context > h2->context)
2246       return 1;
2247 
2248    if (h1->query.offset < h2->query.offset)
2249       return -1;
2250    if (h1->query.offset > h2->query.offset)
2251       return 1;
2252 
2253    if (h1->subject.offset < h2->subject.offset)
2254       return -1;
2255    if (h1->subject.offset > h2->subject.offset)
2256       return 1;
2257 
2258    /* tie breakers: sort by decreasing score, then
2259       by increasing size of query range, then by
2260       increasing subject range. */
2261 
2262    if (h1->score < h2->score)
2263       return 1;
2264    if (h1->score > h2->score)
2265       return -1;
2266 
2267    if (h1->query.end < h2->query.end)
2268       return 1;
2269    if (h1->query.end > h2->query.end)
2270       return -1;
2271 
2272    if (h1->subject.end < h2->subject.end)
2273       return 1;
2274    if (h1->subject.end > h2->subject.end)
2275       return -1;
2276 
2277    return 0;
2278 }
2279 
2280 /** Callback for sorting HSPs by ending offset in query. Sorting is by
2281  * increasing context, then increasing query end offset, then increasing
2282  * subject end offset, then decreasing score, then decreasing query start
2283  * offset, then decreasing subject start offset. Null HSPs are moved to the
2284  * end of the array.
2285  * @param v1 pointer to first HSP [in]
2286  * @param v2 pointer to second HSP [in]
2287  * @return Result of comparison.
2288  */
2289 static int
s_QueryEndCompareHSPs(const void * v1,const void * v2)2290 s_QueryEndCompareHSPs(const void* v1, const void* v2)
2291 {
2292    BlastHSP* h1,* h2;
2293    BlastHSP** hp1,** hp2;
2294 
2295    hp1 = (BlastHSP**) v1;
2296    hp2 = (BlastHSP**) v2;
2297    h1 = *hp1;
2298    h2 = *hp2;
2299 
2300    if (!h1 && !h2)
2301       return 0;
2302    else if (!h1)
2303       return 1;
2304    else if (!h2)
2305       return -1;
2306 
2307    /* If these are from different contexts, don't compare offsets */
2308    if (h1->context < h2->context)
2309       return -1;
2310    if (h1->context > h2->context)
2311       return 1;
2312 
2313    if (h1->query.end < h2->query.end)
2314       return -1;
2315    if (h1->query.end > h2->query.end)
2316       return 1;
2317 
2318    if (h1->subject.end < h2->subject.end)
2319       return -1;
2320    if (h1->subject.end > h2->subject.end)
2321       return 1;
2322 
2323    /* tie breakers: sort by decreasing score, then
2324       by increasing size of query range, then by
2325       increasing size of subject range. The shortest range
2326       means the *largest* sequence offset must come
2327       first */
2328    if (h1->score < h2->score)
2329       return 1;
2330    if (h1->score > h2->score)
2331       return -1;
2332 
2333    if (h1->query.offset < h2->query.offset)
2334       return 1;
2335    if (h1->query.offset > h2->query.offset)
2336       return -1;
2337 
2338    if (h1->subject.offset < h2->subject.offset)
2339       return 1;
2340    if (h1->subject.offset > h2->subject.offset)
2341       return -1;
2342 
2343    return 0;
2344 }
2345 
2346 
2347 /* cut off the GapEditScript according to hsp offset and end */
2348 static void
s_CutOffGapEditScript(BlastHSP * hsp,Int4 q_cut,Int4 s_cut,Boolean cut_begin)2349 s_CutOffGapEditScript(BlastHSP* hsp, Int4 q_cut, Int4 s_cut, Boolean cut_begin)
2350 {
2351    int index, opid, qid, sid;
2352    Boolean found = FALSE;
2353    GapEditScript *esp = hsp->gap_info;
2354    qid = 0;
2355    sid = 0;
2356    opid = 0;
2357    q_cut -= hsp->query.offset;
2358    s_cut -= hsp->subject.offset;
2359    for (index=0; index < esp->size; index++) {
2360        for(opid=0; opid < esp->num[index];){
2361           if (esp->op_type[index] == eGapAlignSub) {
2362              qid++;
2363              sid++;
2364              opid++;
2365           } else if (esp->op_type[index] == eGapAlignDel) {
2366              sid+=esp->num[index];
2367              opid+=esp->num[index];
2368           } else if (esp->op_type[index] == eGapAlignIns) {
2369              qid+=esp->num[index];
2370              opid+=esp->num[index];
2371           }
2372           if (qid >= q_cut && sid >= s_cut) found = TRUE;
2373           if (found) break;
2374        }
2375        if (found) break;
2376    }
2377 
2378    // RMH: Unless both cut sites where found the following
2379    //      block would access memory outside the GapEditScript
2380    //      array.
2381    if ( found )
2382    {
2383      if (cut_begin) {
2384          int new_index = 0;
2385          if (opid < esp->num[index]) {
2386             ASSERT(esp->op_type[index] == eGapAlignSub);
2387             esp->op_type[0] = esp->op_type[index];
2388             esp->num[0] = esp->num[index] - opid;
2389             new_index++;
2390          }
2391          ++index;
2392          for (; index < esp->size; index++, new_index++) {
2393             esp->op_type[new_index] = esp->op_type[index];
2394             esp->num[new_index] = esp->num[index];
2395          }
2396          esp->size = new_index;
2397          hsp->query.offset += qid;
2398          hsp->subject.offset += sid;
2399      } else {
2400          if (opid < esp->num[index]) {
2401             ASSERT(esp->op_type[index] == eGapAlignSub);
2402             esp->num[index] = opid;
2403          }
2404          esp->size = index+1;
2405          hsp->query.end = hsp->query.offset + qid;
2406          hsp->subject.end = hsp->subject.offset + sid;
2407      }
2408    }
2409 }
2410 
2411 Int4
Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,BlastHSPList * hsp_list,Boolean purge)2412 Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program,
2413                                           BlastHSPList* hsp_list,
2414                                           Boolean purge)
2415 
2416 {
2417    BlastHSP** hsp_array;  /* hsp_array to purge. */
2418    BlastHSP* hsp;
2419    Int4 i, j, k;
2420    Int4 hsp_count;
2421    purge |= (program != eBlastTypeBlastn);
2422 
2423    /* If HSP list is empty, return immediately. */
2424    if (hsp_list == NULL || hsp_list->hspcnt == 0)
2425        return 0;
2426 
2427    /* Do nothing for PHI BLAST, because HSPs corresponding to different pattern
2428       occurrences may have common end points, but should all be kept. */
2429    if (Blast_ProgramIsPhiBlast(program))
2430        return hsp_list->hspcnt;
2431 
2432    hsp_array = hsp_list->hsp_array;
2433    hsp_count = hsp_list->hspcnt;
2434 
2435    qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryOffsetCompareHSPs);
2436    i = 0;
2437    while (i < hsp_count) {
2438       j = 1;
2439       while (i+j < hsp_count &&
2440              hsp_array[i] && hsp_array[i+j] &&
2441              hsp_array[i]->context == hsp_array[i+j]->context &&
2442              hsp_array[i]->query.offset == hsp_array[i+j]->query.offset &&
2443              hsp_array[i]->subject.offset == hsp_array[i+j]->subject.offset) {
2444          hsp_count--;
2445          hsp = hsp_array[i+j];
2446          if (!purge && (hsp->query.end > hsp_array[i]->query.end)) {
2447              s_CutOffGapEditScript(hsp, hsp_array[i]->query.end,
2448                                         hsp_array[i]->subject.end, TRUE);
2449          } else {
2450              hsp = Blast_HSPFree(hsp);
2451          }
2452          for (k=i+j; k<hsp_count; k++) {
2453              hsp_array[k] = hsp_array[k+1];
2454          }
2455          hsp_array[hsp_count] = hsp;
2456       }
2457       i += j;
2458    }
2459 
2460    qsort(hsp_array, hsp_count, sizeof(BlastHSP*), s_QueryEndCompareHSPs);
2461    i = 0;
2462    while (i < hsp_count) {
2463       j = 1;
2464       while (i+j < hsp_count &&
2465              hsp_array[i] && hsp_array[i+j] &&
2466              hsp_array[i]->context == hsp_array[i+j]->context &&
2467              hsp_array[i]->query.end == hsp_array[i+j]->query.end &&
2468              hsp_array[i]->subject.end == hsp_array[i+j]->subject.end) {
2469          hsp_count--;
2470          hsp = hsp_array[i+j];
2471          if (!purge && (hsp->query.offset < hsp_array[i]->query.offset)) {
2472              s_CutOffGapEditScript(hsp, hsp_array[i]->query.offset,
2473                                         hsp_array[i]->subject.offset, FALSE);
2474          } else {
2475              hsp = Blast_HSPFree(hsp);
2476          }
2477          for (k=i+j; k<hsp_count; k++) {
2478              hsp_array[k] = hsp_array[k+1];
2479          }
2480          hsp_array[hsp_count] = hsp;
2481       }
2482       i += j;
2483    }
2484 
2485    if (purge) {
2486       Blast_HSPListPurgeNullHSPs(hsp_list);
2487    }
2488 
2489    return hsp_count;
2490 }
2491 
2492 Int2
Blast_HSPListReevaluateUngapped(EBlastProgramType program,BlastHSPList * hsp_list,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,const BlastInitialWordParameters * word_params,const BlastHitSavingParameters * hit_params,const BlastQueryInfo * query_info,BlastScoreBlk * sbp,const BlastScoringParameters * score_params,const BlastSeqSrc * seq_src,const Uint1 * gen_code_string)2493 Blast_HSPListReevaluateUngapped(EBlastProgramType program,
2494    BlastHSPList* hsp_list, BLAST_SequenceBlk* query_blk,
2495    BLAST_SequenceBlk* subject_blk,
2496    const BlastInitialWordParameters* word_params,
2497    const BlastHitSavingParameters* hit_params, const BlastQueryInfo* query_info,
2498    BlastScoreBlk* sbp, const BlastScoringParameters* score_params,
2499    const BlastSeqSrc* seq_src, const Uint1* gen_code_string)
2500 {
2501    BlastHSP** hsp_array,* hsp;
2502    const Uint1* subject_start = NULL;
2503    Uint1* query_start;
2504    Int4 index, context, hspcnt;
2505    Boolean purge;
2506    Int2 status = 0;
2507    const Boolean kTranslateSubject = Blast_SubjectIsTranslated(program);
2508    const Boolean kNucleotideSubject = Blast_SubjectIsNucleotide(program);
2509    SBlastTargetTranslation* target_t = NULL;
2510 
2511    ASSERT(!score_params->options->gapped_calculation);
2512 
2513    if (!hsp_list)
2514       return status;
2515 
2516    hspcnt = hsp_list->hspcnt;
2517    hsp_array = hsp_list->hsp_array;
2518 
2519    if (hsp_list->hspcnt == 0)
2520       /* All HSPs have been deleted */
2521       return status;
2522 
2523    /* Retrieve the unpacked subject sequence and save it in the
2524       sequence_start element of the subject structure.
2525       NB: for the BLAST 2 Sequences case, the uncompressed sequence must
2526       already be there */
2527    if (seq_src) {
2528       /* Wrap subject sequence block into a BlastSeqSrcGetSeqArg structure, which is
2529          needed by the BlastSeqSrc API. */
2530       /* If this was a protein subject, leave as is. */
2531       if (kNucleotideSubject)
2532       {
2533       	BlastSeqSrcGetSeqArg seq_arg;
2534       	memset((void*) &seq_arg, 0, sizeof(seq_arg));
2535       	seq_arg.oid = subject_blk->oid;
2536       	seq_arg.encoding =
2537        	  (kTranslateSubject ? eBlastEncodingNcbi4na : eBlastEncodingNucleotide);
2538         seq_arg.check_oid_exclusion = TRUE;
2539       	seq_arg.seq = subject_blk;
2540       	/* Return the packed sequence to the database */
2541       	BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
2542       	/* Get the unpacked sequence */
2543       	if (( BLAST_SEQSRC_EXCLUDED == BlastSeqSrcGetSequence(seq_src, &seq_arg))) {
2544       		for (index = 0; index < hspcnt; ++index) {
2545       			 hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2546       		}
2547       		Blast_HSPListPurgeNullHSPs(hsp_list);
2548       		return 0;
2549       	}
2550       	else if (status < 0){
2551           return status;
2552       	}
2553        }
2554    }
2555 
2556    if (kTranslateSubject) {
2557       if (!gen_code_string)
2558          return -1;
2559       /* Get the translation buffer */
2560       BlastTargetTranslationNew(subject_blk, gen_code_string, program,
2561                    score_params->options->is_ooframe, &target_t);
2562    } else {
2563       /* Store sequence in blastna encoding in sequence_start */
2564       if (subject_blk->sequence_start)
2565           subject_start = subject_blk->sequence_start + 1;
2566       else
2567           subject_start = subject_blk->sequence;
2568    }
2569 
2570    purge = FALSE;
2571    for (index = 0; index < hspcnt; ++index) {
2572       Boolean delete_hsp = FALSE;
2573       if (hsp_array[index] == NULL)
2574          continue;
2575       else
2576          hsp = hsp_array[index];
2577 
2578       context = hsp->context;
2579 
2580       query_start = query_blk->sequence +
2581           query_info->contexts[context].query_offset;
2582 
2583       if (kTranslateSubject)
2584          subject_start = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
2585 
2586       if (kNucleotideSubject) {
2587          delete_hsp =
2588              Blast_HSPReevaluateWithAmbiguitiesUngapped(hsp, query_start,
2589                 subject_start, word_params, sbp, kTranslateSubject);
2590       }
2591 
2592       if (!delete_hsp) {
2593           const Uint1* query_nomask = query_blk->sequence_nomask +
2594               query_info->contexts[context].query_offset;
2595           Int4 align_length = 0;
2596           Blast_HSPGetNumIdentitiesAndPositives(query_nomask,
2597                    							    subject_start,
2598                                					hsp,
2599                                					score_params->options,
2600                                					&align_length,
2601                                					sbp);
2602 
2603            delete_hsp = Blast_HSPTest(hsp, hit_params->options, align_length);
2604       }
2605       if (delete_hsp) { /* This HSP is now below the cutoff */
2606          hsp_array[index] = Blast_HSPFree(hsp_array[index]);
2607          purge = TRUE;
2608       }
2609    }
2610 
2611    if (target_t)
2612       target_t = BlastTargetTranslationFree(target_t);
2613 
2614    if (purge)
2615       Blast_HSPListPurgeNullHSPs(hsp_list);
2616 
2617    /* Sort the HSP array by score (scores may have changed!) */
2618    Blast_HSPListSortByScore(hsp_list);
2619    Blast_HSPListAdjustOddBlastnScores(hsp_list, FALSE, sbp);
2620    return 0;
2621 }
2622 
2623 /** Combine two HSP lists, without altering the individual HSPs, and without
2624  * reallocating the HSP array.
2625  * @param hsp_list New HSP list [in]
2626  * @param combined_hsp_list Old HSP list, to which new HSPs are added [in] [out]
2627  * @param new_hspcnt How many HSPs to save in the combined list? The extra ones
2628  *                   are freed. The best scoring HSPs are saved. This argument
2629  *                   cannot be greater than the allocated size of the
2630  *                   combined list's HSP array. [in]
2631  */
2632 static void
s_BlastHSPListsCombineByScore(BlastHSPList * hsp_list,BlastHSPList * combined_hsp_list,Int4 new_hspcnt)2633 s_BlastHSPListsCombineByScore(BlastHSPList* hsp_list,
2634                              BlastHSPList* combined_hsp_list,
2635                              Int4 new_hspcnt)
2636 {
2637    Int4 index, index1, index2;
2638    BlastHSP** new_hsp_array;
2639 
2640    ASSERT(new_hspcnt <= combined_hsp_list->allocated);
2641 
2642    if (new_hspcnt >= hsp_list->hspcnt + combined_hsp_list->hspcnt) {
2643       /* All HSPs from both arrays are saved */
2644       for (index=combined_hsp_list->hspcnt, index1=0;
2645            index1<hsp_list->hspcnt; index1++) {
2646          if (hsp_list->hsp_array[index1] != NULL)
2647             combined_hsp_list->hsp_array[index++] = hsp_list->hsp_array[index1];
2648       }
2649       combined_hsp_list->hspcnt = new_hspcnt;
2650       Blast_HSPListSortByScore(combined_hsp_list);
2651    } else {
2652       /* Not all HSPs are be saved; sort both arrays by score and save only
2653          the new_hspcnt best ones.
2654          For the merged set of HSPs, allocate array the same size as in the
2655          old HSP list. */
2656       new_hsp_array = (BlastHSP**)
2657          malloc(combined_hsp_list->allocated*sizeof(BlastHSP*));
2658 
2659       Blast_HSPListSortByScore(combined_hsp_list);
2660       Blast_HSPListSortByScore(hsp_list);
2661       index1 = index2 = 0;
2662       for (index = 0; index < new_hspcnt; ++index) {
2663          if (index1 < combined_hsp_list->hspcnt &&
2664              (index2 >= hsp_list->hspcnt ||
2665               ScoreCompareHSPs(&combined_hsp_list->hsp_array[index1],
2666                                &hsp_list->hsp_array[index2]) <= 0)) {
2667             new_hsp_array[index] = combined_hsp_list->hsp_array[index1];
2668             ++index1;
2669          } else {
2670             new_hsp_array[index] = hsp_list->hsp_array[index2];
2671             ++index2;
2672          }
2673       }
2674       /* Free the extra HSPs that could not be saved */
2675       for ( ; index1 < combined_hsp_list->hspcnt; ++index1) {
2676          combined_hsp_list->hsp_array[index1] =
2677             Blast_HSPFree(combined_hsp_list->hsp_array[index1]);
2678       }
2679       for ( ; index2 < hsp_list->hspcnt; ++index2) {
2680          hsp_list->hsp_array[index2] =
2681             Blast_HSPFree(hsp_list->hsp_array[index2]);
2682       }
2683       /* Point combined_hsp_list's HSP array to the new one */
2684       sfree(combined_hsp_list->hsp_array);
2685       combined_hsp_list->hsp_array = new_hsp_array;
2686       combined_hsp_list->hspcnt = new_hspcnt;
2687    }
2688 
2689    /* Second HSP list now does not own any HSPs */
2690    hsp_list->hspcnt = 0;
2691 }
2692 
Blast_HSPListAppend(BlastHSPList ** old_hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max)2693 Int2 Blast_HSPListAppend(BlastHSPList** old_hsp_list_ptr,
2694         BlastHSPList** combined_hsp_list_ptr, Int4 hsp_num_max)
2695 {
2696    BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2697    BlastHSPList* hsp_list = *old_hsp_list_ptr;
2698    Int4 new_hspcnt;
2699 
2700    if (!hsp_list || hsp_list->hspcnt == 0)
2701       return 0;
2702 
2703    /* If no previous HSP list, return a pointer to the old one */
2704    if (!combined_hsp_list) {
2705       *combined_hsp_list_ptr = hsp_list;
2706       *old_hsp_list_ptr = NULL;
2707       return 0;
2708    }
2709 
2710    /* Just append new list to the end of the old list, in case of
2711       multiple frames of the subject sequence */
2712    new_hspcnt = MIN(combined_hsp_list->hspcnt + hsp_list->hspcnt,
2713                     hsp_num_max);
2714    if (new_hspcnt > combined_hsp_list->allocated &&
2715        !combined_hsp_list->do_not_reallocate) {
2716       Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
2717       BlastHSP** new_hsp_array;
2718       new_hsp_array = (BlastHSP**)
2719          realloc(combined_hsp_list->hsp_array,
2720                  new_allocated*sizeof(BlastHSP*));
2721 
2722       if (new_hsp_array) {
2723          combined_hsp_list->allocated = new_allocated;
2724          combined_hsp_list->hsp_array = new_hsp_array;
2725       } else {
2726          combined_hsp_list->do_not_reallocate = TRUE;
2727          new_hspcnt = combined_hsp_list->allocated;
2728       }
2729    }
2730    if (combined_hsp_list->allocated == hsp_num_max)
2731       combined_hsp_list->do_not_reallocate = TRUE;
2732 
2733    s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
2734 
2735    hsp_list = Blast_HSPListFree(hsp_list);
2736    *old_hsp_list_ptr = NULL;
2737 
2738    return 0;
2739 }
2740 
Blast_HSPListsMerge(BlastHSPList ** hsp_list_ptr,BlastHSPList ** combined_hsp_list_ptr,Int4 hsp_num_max,Int4 * split_offsets,Int4 contexts_per_query,Int4 chunk_overlap_size,Boolean allow_gap,Boolean short_reads)2741 Int2 Blast_HSPListsMerge(BlastHSPList** hsp_list_ptr,
2742                    BlastHSPList** combined_hsp_list_ptr,
2743                    Int4 hsp_num_max, Int4 *split_offsets,
2744                    Int4 contexts_per_query, Int4 chunk_overlap_size,
2745                    Boolean allow_gap, Boolean short_reads)
2746 {
2747    BlastHSPList* combined_hsp_list = *combined_hsp_list_ptr;
2748    BlastHSPList* hsp_list = *hsp_list_ptr;
2749    BlastHSP* hsp1, *hsp2, *hsp_var;
2750    BlastHSP** hspp1,** hspp2;
2751    Int4 index1, index2;
2752    Int4 hspcnt1, hspcnt2, new_hspcnt = 0;
2753    Int4 start_diag, end_diag;
2754    Int4 offset_idx;
2755    BlastHSP** new_hsp_array;
2756 
2757    if (!hsp_list || hsp_list->hspcnt == 0)
2758       return 0;
2759 
2760    /* If no previous HSP list, just return a copy of the new one. */
2761    if (!combined_hsp_list) {
2762       *combined_hsp_list_ptr = hsp_list;
2763       *hsp_list_ptr = NULL;
2764       return 0;
2765    }
2766 
2767    /* Merge the two HSP lists for successive chunks of the subject sequence.
2768       First put all HSPs that intersect the overlap region at the front of
2769       the respective HSP arrays. Note that if the query sequence is
2770       assumed split, each context of the query sequence can have a
2771       different split point */
2772    hspcnt1 = hspcnt2 = 0;
2773 
2774    if (contexts_per_query < 0) {      /* subject seq is split */
2775       for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2776          hsp1 = combined_hsp_list->hsp_array[index1];
2777          if (hsp1->subject.end > split_offsets[0]) {
2778             /* At least part of this HSP lies in the overlap strip. */
2779             hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2780             combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2781             combined_hsp_list->hsp_array[index1] = hsp_var;
2782             ++hspcnt1;
2783          }
2784       }
2785       for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2786          hsp2 = hsp_list->hsp_array[index2];
2787          if (hsp2->subject.offset < split_offsets[0] + chunk_overlap_size) {
2788             /* At least part of this HSP lies in the overlap strip. */
2789             hsp_var = hsp_list->hsp_array[hspcnt2];
2790             hsp_list->hsp_array[hspcnt2] = hsp2;
2791             hsp_list->hsp_array[index2] = hsp_var;
2792             ++hspcnt2;
2793          }
2794       }
2795    }
2796    else {            /* query seq is split */
2797 
2798       /* An HSP can be a candidate for merging if it lies in the
2799          overlap region. Whether this is true depends on whether the
2800          HSP starts to the left of the split point, or ends to the
2801          right of the overlap region. A complication is that 'left'
2802          and 'right' have opposite meaning when the HSP is on the
2803          minus strand of the query sequence */
2804 
2805       for (index1 = 0; index1 < combined_hsp_list->hspcnt; index1++) {
2806          hsp1 = combined_hsp_list->hsp_array[index1];
2807          offset_idx = hsp1->context % contexts_per_query;
2808          if (split_offsets[offset_idx] < 0) continue;
2809          if ((hsp1->query.frame >= 0 && hsp1->query.end >
2810                          split_offsets[offset_idx]) ||
2811              (hsp1->query.frame < 0 && hsp1->query.offset <
2812                          split_offsets[offset_idx] + chunk_overlap_size)) {
2813             /* At least part of this HSP lies in the overlap strip. */
2814             hsp_var = combined_hsp_list->hsp_array[hspcnt1];
2815             combined_hsp_list->hsp_array[hspcnt1] = hsp1;
2816             combined_hsp_list->hsp_array[index1] = hsp_var;
2817             ++hspcnt1;
2818          }
2819       }
2820       for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
2821          hsp2 = hsp_list->hsp_array[index2];
2822          offset_idx = hsp2->context % contexts_per_query;
2823          if (split_offsets[offset_idx] < 0) continue;
2824          if ((hsp2->query.frame < 0 && hsp2->query.end >
2825                          split_offsets[offset_idx]) ||
2826              (hsp2->query.frame >= 0 && hsp2->query.offset <
2827                          split_offsets[offset_idx] + chunk_overlap_size)) {
2828             /* At least part of this HSP lies in the overlap strip. */
2829             hsp_var = hsp_list->hsp_array[hspcnt2];
2830             hsp_list->hsp_array[hspcnt2] = hsp2;
2831             hsp_list->hsp_array[index2] = hsp_var;
2832             ++hspcnt2;
2833          }
2834       }
2835    }
2836 
2837    /* the merge process is independent of whether merging happens
2838       between query chunks or subject chunks */
2839 
2840    if (hspcnt1 > 0 && hspcnt2 > 0) {
2841       hspp1 = combined_hsp_list->hsp_array;
2842       hspp2 = hsp_list->hsp_array;
2843 
2844       for (index1 = 0; index1 < hspcnt1; index1++) {
2845 
2846          hsp1 = hspp1[index1];
2847 
2848          for (index2 = 0; index2 < hspcnt2; index2++) {
2849 
2850             hsp2 = hspp2[index2];
2851 
2852             /* Skip already deleted HSPs, or HSPs from different contexts */
2853             if (!hsp2 || hsp1->context != hsp2->context)
2854                continue;
2855 
2856             /* Short read qureies are shorter than the overlap region and may
2857                already have a traceback */
2858             if (short_reads) {
2859                 hspp2[index2] = Blast_HSPFree(hsp2);
2860                 continue;
2861             }
2862 
2863             /* we have to determine the starting diagonal of one HSP
2864                and the ending diagonal of the other */
2865 
2866             if (contexts_per_query < 0 || hsp1->query.frame >= 0) {
2867                end_diag = s_HSPEndDiag(hsp1);
2868                start_diag = s_HSPStartDiag(hsp2);
2869             }
2870             else {
2871                end_diag = s_HSPEndDiag(hsp2);
2872                start_diag = s_HSPStartDiag(hsp1);
2873             }
2874 
2875             if (ABS(end_diag - start_diag) < OVERLAP_DIAG_CLOSE) {
2876                if (s_BlastMergeTwoHSPs(hsp1, hsp2, allow_gap)) {
2877                   /* Free the second HSP. */
2878                   hspp2[index2] = Blast_HSPFree(hsp2);
2879                }
2880             }
2881          }
2882       }
2883 
2884       /* Purge the nulled out HSPs from the new HSP list */
2885       Blast_HSPListPurgeNullHSPs(hsp_list);
2886    }
2887 
2888    /* The new number of HSPs is now the sum of the remaining counts in the
2889       two lists, but if there is a restriction on the number of HSPs to keep,
2890       it might have to be reduced. */
2891    new_hspcnt =
2892       MIN(hsp_list->hspcnt + combined_hsp_list->hspcnt, hsp_num_max);
2893 
2894    if (new_hspcnt >= combined_hsp_list->allocated-1 &&
2895        combined_hsp_list->do_not_reallocate == FALSE) {
2896       Int4 new_allocated = MIN(2*new_hspcnt, hsp_num_max);
2897       if (new_allocated > combined_hsp_list->allocated) {
2898          new_hsp_array = (BlastHSP**)
2899             realloc(combined_hsp_list->hsp_array,
2900                     new_allocated*sizeof(BlastHSP*));
2901          if (new_hsp_array == NULL) {
2902             combined_hsp_list->do_not_reallocate = TRUE;
2903          } else {
2904             combined_hsp_list->hsp_array = new_hsp_array;
2905             combined_hsp_list->allocated = new_allocated;
2906          }
2907       } else {
2908          combined_hsp_list->do_not_reallocate = TRUE;
2909       }
2910       new_hspcnt = MIN(new_hspcnt, combined_hsp_list->allocated);
2911    }
2912 
2913    s_BlastHSPListsCombineByScore(hsp_list, combined_hsp_list, new_hspcnt);
2914 
2915    hsp_list = Blast_HSPListFree(hsp_list);
2916    *hsp_list_ptr = NULL;
2917 
2918    return 0;
2919 }
2920 
Blast_HSPListAdjustOffsets(BlastHSPList * hsp_list,Int4 offset)2921 void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset)
2922 {
2923    Int4 index;
2924 
2925    if (offset == 0) {
2926        return;
2927    }
2928 
2929    for (index=0; index<hsp_list->hspcnt; index++) {
2930       BlastHSP* hsp = hsp_list->hsp_array[index];
2931       hsp->subject.offset += offset;
2932       hsp->subject.end += offset;
2933       hsp->subject.gapped_start += offset;
2934    }
2935 }
2936 
Blast_HSPListAdjustOddBlastnScores(BlastHSPList * hsp_list,Boolean gapped_calculation,const BlastScoreBlk * sbp)2937 void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list,
2938                                         Boolean gapped_calculation,
2939                                         const BlastScoreBlk* sbp)
2940 {
2941     int index;
2942 
2943     if (!hsp_list || hsp_list->hspcnt == 0 ||
2944         gapped_calculation == FALSE ||
2945         sbp->round_down == FALSE)
2946        return;
2947 
2948     for (index = 0; index < hsp_list->hspcnt; ++index)
2949         hsp_list->hsp_array[index]->score &= ~1;
2950 
2951     /* Sort the HSPs again, since the order may have to be different now. */
2952     Blast_HSPListSortByScore(hsp_list);
2953 }
2954 
2955 /** Callback for sorting hsp lists by their best evalue/score;
2956  * Evalues are compared with the condition that if both are close enough to
2957  * zero (currently < 1.0e-180), they are considered equal.
2958  * It is assumed that the HSP arrays in each hit list are already sorted by
2959  * e-value/score.
2960 */
2961 static int
s_EvalueCompareHSPLists(const void * v1,const void * v2)2962 s_EvalueCompareHSPLists(const void* v1, const void* v2)
2963 {
2964    BlastHSPList* h1,* h2;
2965    int retval = 0;
2966 
2967    h1 = *(BlastHSPList**) v1;
2968    h2 = *(BlastHSPList**) v2;
2969 
2970    /* If any of the HSP lists is empty, it is considered "worse" than the
2971       other, unless the other is also empty. */
2972    if (h1->hspcnt == 0 && h2->hspcnt == 0)
2973       return 0;
2974    else if (h1->hspcnt == 0)
2975       return 1;
2976    else if (h2->hspcnt == 0)
2977       return -1;
2978 
2979    if ((retval = s_EvalueComp(h1->best_evalue,
2980                                    h2->best_evalue)) != 0)
2981       return retval;
2982 
2983    if (h1->hsp_array[0]->score > h2->hsp_array[0]->score)
2984       return -1;
2985    if (h1->hsp_array[0]->score < h2->hsp_array[0]->score)
2986       return 1;
2987 
2988    /* In case of equal best E-values and scores, order will be determined
2989       by ordinal ids of the subject sequences */
2990    return BLAST_CMP(h2->oid, h1->oid);
2991 }
2992 
2993 /** Callback for sorting hsp lists by their best e-value/score, in
2994  * reverse order - from higher e-value to lower (lower score to higher).
2995 */
2996 static int
s_EvalueCompareHSPListsRev(const void * v1,const void * v2)2997 s_EvalueCompareHSPListsRev(const void* v1, const void* v2)
2998 {
2999    return s_EvalueCompareHSPLists(v2, v1);
3000 }
3001 
3002 /********************************************************************************
3003           Functions manipulating BlastHitList's
3004 ********************************************************************************/
3005 
3006 /*
3007    description in blast_hits.h
3008  */
Blast_HitListNew(Int4 hitlist_size)3009 BlastHitList* Blast_HitListNew(Int4 hitlist_size)
3010 {
3011    BlastHitList* new_hitlist = (BlastHitList*)
3012       calloc(1, sizeof(BlastHitList));
3013    new_hitlist->hsplist_max = hitlist_size;
3014    new_hitlist->low_score = INT4_MAX;
3015    new_hitlist->hsplist_count = 0;
3016    new_hitlist->hsplist_current = 0;
3017    return new_hitlist;
3018 }
3019 
3020 /*
3021    description in blast_hits.h
3022 */
Blast_HitListFree(BlastHitList * hitlist)3023 BlastHitList* Blast_HitListFree(BlastHitList* hitlist)
3024 {
3025    if (!hitlist)
3026       return NULL;
3027 
3028    Blast_HitListHSPListsFree(hitlist);
3029    sfree(hitlist);
3030    return NULL;
3031 }
3032 
3033 /* description in blast_hits.h */
Blast_HitListHSPListsFree(BlastHitList * hitlist)3034 Int2 Blast_HitListHSPListsFree(BlastHitList* hitlist)
3035 {
3036    Int4 index;
3037 
3038    if (!hitlist)
3039 	return 0;
3040 
3041    for (index = 0; index < hitlist->hsplist_count; ++index)
3042       hitlist->hsplist_array[index] =
3043          Blast_HSPListFree(hitlist->hsplist_array[index]);
3044 
3045    sfree(hitlist->hsplist_array);
3046 
3047    hitlist->hsplist_count = 0;
3048 
3049    return 0;
3050 }
3051 
3052 /** Purge a BlastHitList of empty HSP lists.
3053  * @param hit_list BLAST hit list structure. [in] [out]
3054  */
3055 static void
s_BlastHitListPurge(BlastHitList * hit_list)3056 s_BlastHitListPurge(BlastHitList* hit_list)
3057 {
3058    Int4 index, hsplist_count;
3059 
3060    if (!hit_list)
3061       return;
3062 
3063    hsplist_count = hit_list->hsplist_count;
3064    for (index = 0; index < hsplist_count &&
3065            hit_list->hsplist_array[index]->hspcnt > 0; ++index);
3066 
3067    hit_list->hsplist_count = index;
3068    /* Free all empty HSP lists */
3069    for ( ; index < hsplist_count; ++index) {
3070       Blast_HSPListFree(hit_list->hsplist_array[index]);
3071    }
3072 }
3073 
3074 /** Given a BlastHitList* with a heapified HSP list array, remove
3075  *  the worst scoring HSP list and insert the new HSP list in the heap
3076  * @param hit_list Contains all HSP lists for a given query [in] [out]
3077  * @param hsp_list A new HSP list to be inserted into the hit list [in]
3078  */
3079 static void
s_BlastHitListInsertHSPListInHeap(BlastHitList * hit_list,BlastHSPList * hsp_list)3080 s_BlastHitListInsertHSPListInHeap(BlastHitList* hit_list,
3081                                  BlastHSPList* hsp_list)
3082 {
3083       Blast_HSPListFree(hit_list->hsplist_array[0]);
3084       hit_list->hsplist_array[0] = hsp_list;
3085       if (hit_list->hsplist_count >= 2) {
3086          s_Heapify((char*)hit_list->hsplist_array, (char*)hit_list->hsplist_array,
3087                  (char*)&hit_list->hsplist_array[hit_list->hsplist_count/2 - 1],
3088                  (char*)&hit_list->hsplist_array[hit_list->hsplist_count-1],
3089                  sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3090       }
3091       hit_list->worst_evalue =
3092          hit_list->hsplist_array[0]->best_evalue;
3093       hit_list->low_score = hit_list->hsplist_array[0]->hsp_array[0]->score;
3094 }
3095 
3096 /** Given a BlastHitList pointer this function makes the
3097  * hsplist_array larger, up to a maximum size.
3098  * These incremental increases are mostly an issue for users who
3099  * put in a very large number for number of hits to save, but only save a few.
3100  * @param hit_list object containing the hsplist_array to grow [in]
3101  * @return zero on success, 1 if full already.
3102  */
s_Blast_HitListGrowHSPListArray(BlastHitList * hit_list)3103 static Int2 s_Blast_HitListGrowHSPListArray(BlastHitList* hit_list)
3104 
3105 {
3106     const int kStartValue = 100; /* default number of hsplist_array to start with. */
3107 
3108     ASSERT(hit_list);
3109 
3110     if (hit_list->hsplist_current >= hit_list->hsplist_max)
3111        return 1;
3112 
3113     if (hit_list->hsplist_current <= 0)
3114        hit_list->hsplist_current = kStartValue;
3115     else
3116        hit_list->hsplist_current = MIN(2*hit_list->hsplist_current, hit_list->hsplist_max);
3117 
3118     hit_list->hsplist_array =
3119        (BlastHSPList**) realloc(hit_list->hsplist_array, hit_list->hsplist_current*sizeof(BlastHSPList*));
3120 
3121     if (hit_list->hsplist_array == NULL)
3122        return BLASTERR_MEMORY;
3123 
3124     return 0;
3125 }
3126 
Blast_HitListUpdate(BlastHitList * hit_list,BlastHSPList * hsp_list)3127 Int2 Blast_HitListUpdate(BlastHitList* hit_list,
3128                          BlastHSPList* hsp_list)
3129 {
3130    hsp_list->best_evalue = s_BlastGetBestEvalue(hsp_list);
3131 
3132 #ifndef NDEBUG
3133    ASSERT(s_BlastCheckBestEvalue(hsp_list) == TRUE); /* NCBI_FAKE_WARNING */
3134 #endif /* _DEBUG */
3135 
3136    if (hit_list->hsplist_count < hit_list->hsplist_max) {
3137       /* If the array of HSP lists for this query is not yet allocated,
3138          do it here */
3139       if (hit_list->hsplist_current == hit_list->hsplist_count)
3140       {
3141          Int2 status = s_Blast_HitListGrowHSPListArray(hit_list);
3142          if (status)
3143            return status;
3144       }
3145       /* Just add to the end; sort later */
3146       hit_list->hsplist_array[hit_list->hsplist_count++] = hsp_list;
3147       hit_list->worst_evalue =
3148          MAX(hsp_list->best_evalue, hit_list->worst_evalue);
3149       hit_list->low_score =
3150          MIN(hsp_list->hsp_array[0]->score, hit_list->low_score);
3151    } else {
3152       int evalue_order = 0;
3153       /* make sure the hsp_list is sorted.  We actually do not need to sort
3154          the full list: all that we need is the best score.   However, the
3155          following code assumes hsp_list->hsp_array[0] has the best score. */
3156       Blast_HSPListSortByEvalue(hsp_list);
3157       /* Compare e-values only with a certain precision */
3158       evalue_order = s_EvalueComp(hsp_list->best_evalue,
3159                                        hit_list->worst_evalue);
3160       if (evalue_order > 0 ||
3161           (evalue_order == 0 &&
3162            (hsp_list->hsp_array[0]->score < hit_list->low_score))) {
3163          /* This hit list is less significant than any of those already saved;
3164             discard it. Note that newer hits with score and e-value both equal
3165             to the current worst will be saved, at the expense of some older
3166             hit.
3167          */
3168          Blast_HSPListFree(hsp_list);
3169       } else {
3170          if (!hit_list->heapified) {
3171             /* make sure all hsp_list is sorted */
3172             int index;
3173             for (index =0; index < hit_list->hsplist_count; index++)
3174                 Blast_HSPListSortByEvalue(hit_list->hsplist_array[index]);
3175 
3176             s_CreateHeap(hit_list->hsplist_array, hit_list->hsplist_count,
3177                        sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3178             hit_list->heapified = TRUE;
3179          }
3180          s_BlastHitListInsertHSPListInHeap(hit_list, hsp_list);
3181       }
3182    }
3183    return 0;
3184 }
3185 
3186 Int2
Blast_HitListPurgeNullHSPLists(BlastHitList * hit_list)3187 Blast_HitListPurgeNullHSPLists(BlastHitList* hit_list)
3188 {
3189    Int4 index, index1; /* loop indices. */
3190    Int4 hsplist_count; /* total number of HSPList's to iterate over. */
3191    BlastHSPList** hsplist_array;  /* hsplist_array to purge. */
3192 
3193    if (hit_list == NULL || hit_list->hsplist_count == 0)
3194       return 0;
3195 
3196    hsplist_array = hit_list->hsplist_array;
3197    hsplist_count = hit_list->hsplist_count;
3198 
3199    index1 = 0;
3200    for (index=0; index<hsplist_count; index++) {
3201       if (hsplist_array[index]) {
3202          hsplist_array[index1] = hsplist_array[index];
3203          index1++;
3204       }
3205    }
3206 
3207    for (index=index1; index<hsplist_count; index++) {
3208       hsplist_array[index] = NULL;
3209    }
3210 
3211    hit_list->hsplist_count = index1;
3212 
3213    return 0;
3214 }
3215 
Blast_HitListSortByEvalue(BlastHitList * hit_list)3216 Int2 Blast_HitListSortByEvalue(BlastHitList* hit_list)
3217 {
3218       if (hit_list && hit_list->hsplist_count > 1) {
3219          qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3220                   sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3221       }
3222       s_BlastHitListPurge(hit_list);
3223    return 0;
3224 }
3225 
3226 
3227 /********************************************************************************
3228           Functions manipulating BlastHSPResults
3229 ********************************************************************************/
3230 
Blast_HSPResultsNew(Int4 num_queries)3231 BlastHSPResults* Blast_HSPResultsNew(Int4 num_queries)
3232 {
3233    BlastHSPResults* retval = NULL;
3234 
3235    retval = (BlastHSPResults*) calloc(1, sizeof(BlastHSPResults));
3236    if ( !retval ) {
3237        return NULL;
3238    }
3239 
3240    retval->num_queries = num_queries;
3241    retval->hitlist_array = (BlastHitList**)
3242       calloc(num_queries, sizeof(BlastHitList*));
3243 
3244    if ( !retval->hitlist_array ) {
3245        return Blast_HSPResultsFree(retval);
3246    }
3247 
3248    return retval;
3249 }
3250 
Blast_HSPResultsFree(BlastHSPResults * results)3251 BlastHSPResults* Blast_HSPResultsFree(BlastHSPResults* results)
3252 {
3253    Int4 index;
3254 
3255    if (!results)
3256        return NULL;
3257 
3258    if (results->hitlist_array)
3259    {
3260    	for (index = 0; index < results->num_queries; ++index)
3261       		Blast_HitListFree(results->hitlist_array[index]);
3262    	sfree(results->hitlist_array);
3263    }
3264    sfree(results);
3265    return NULL;
3266 }
3267 
Blast_HSPResultsSortByEvalue(BlastHSPResults * results)3268 Int2 Blast_HSPResultsSortByEvalue(BlastHSPResults* results)
3269 {
3270    Int4 index;
3271    BlastHitList* hit_list;
3272 
3273    if (!results)
3274        return 0;
3275 
3276    for (index = 0; index < results->num_queries; ++index) {
3277       hit_list = results->hitlist_array[index];
3278       if (hit_list != NULL
3279               && hit_list->hsplist_count > 1
3280               && hit_list->hsplist_array != NULL) {
3281          qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3282                   sizeof(BlastHSPList*), s_EvalueCompareHSPLists);
3283       }
3284       s_BlastHitListPurge(hit_list);
3285    }
3286    return 0;
3287 }
3288 
Blast_HSPResultsReverseSort(BlastHSPResults * results)3289 Int2 Blast_HSPResultsReverseSort(BlastHSPResults* results)
3290 {
3291    Int4 index;
3292    BlastHitList* hit_list;
3293 
3294    for (index = 0; index < results->num_queries; ++index) {
3295       hit_list = results->hitlist_array[index];
3296       if (hit_list && hit_list->hsplist_count > 1) {
3297          qsort(hit_list->hsplist_array, hit_list->hsplist_count,
3298                sizeof(BlastHSPList*), s_EvalueCompareHSPListsRev);
3299       }
3300       s_BlastHitListPurge(hit_list);
3301    }
3302    return 0;
3303 }
3304 
Blast_HSPResultsReverseOrder(BlastHSPResults * results)3305 Int2 Blast_HSPResultsReverseOrder(BlastHSPResults* results)
3306 {
3307    Int4 index;
3308    BlastHitList* hit_list;
3309 
3310    for (index = 0; index < results->num_queries; ++index) {
3311       hit_list = results->hitlist_array[index];
3312       if (hit_list && hit_list->hsplist_count > 1) {
3313 	 BlastHSPList* hsplist;
3314 	 Int4 index1;
3315 	 /* Swap HSP lists: first with last; second with second from the end,
3316 	    etc. */
3317 	 for (index1 = 0; index1 < hit_list->hsplist_count/2; ++index1) {
3318 	    hsplist = hit_list->hsplist_array[index1];
3319 	    hit_list->hsplist_array[index1] =
3320 	       hit_list->hsplist_array[hit_list->hsplist_count-index1-1];
3321 	    hit_list->hsplist_array[hit_list->hsplist_count-index1-1] =
3322 	       hsplist;
3323 	 }
3324       }
3325    }
3326    return 0;
3327 }
3328 
3329 /** Auxiliary structure for sorting HSPs */
3330 typedef struct SHspWrap {
3331     BlastHSPList *hsplist;  /**< The HSPList to which this HSP belongs */
3332     BlastHSP *hsp;          /**< HSP described by this structure */
3333 } SHspWrap;
3334 
3335 /** callback used to sort a list of encapsulated HSP
3336  *  structures in order of decreasing raw score
3337  *  -RMH-
3338  */
s_SortHspWrapRawScore(const void * x,const void * y)3339 static int s_SortHspWrapRawScore(const void *x, const void *y)
3340 {
3341     SHspWrap *wrap1 = (SHspWrap *)x;
3342     SHspWrap *wrap2 = (SHspWrap *)y;
3343     if (wrap1->hsp->score > wrap2->hsp->score)
3344         return -1;
3345     if (wrap1->hsp->score < wrap2->hsp->score)
3346         return 1;
3347 
3348     return 0;
3349 }
3350 
3351 // Masklevel filtering for rmblastn. -RMH-
Blast_HSPResultsApplyMasklevel(BlastHSPResults * results,const BlastQueryInfo * query_info,Int4 masklevel,Int4 query_length)3352 Int2 Blast_HSPResultsApplyMasklevel(BlastHSPResults *results,
3353                                     const BlastQueryInfo *query_info,
3354                                     Int4 masklevel, Int4 query_length)
3355 {
3356    Int4 i, j, k, m;
3357    Int4 hsp_count;
3358    SHspWrap *hsp_array;
3359    BlastIntervalTree *tree;
3360 
3361    /* set up the interval tree; subject offsets are not needed */
3362 
3363    tree = Blast_IntervalTreeInit(0, query_length + 1, 0, 0);
3364 
3365    for (i = 0; i < results->num_queries; i++) {
3366       BlastHitList *hitlist = results->hitlist_array[i];
3367       if (hitlist == NULL)
3368          continue;
3369 
3370       for (j = hsp_count = 0; j < hitlist->hsplist_count; j++) {
3371          BlastHSPList *hsplist = hitlist->hsplist_array[j];
3372          hsp_count += hsplist->hspcnt;
3373       }
3374 
3375       /* empty each HSP into a combined HSP array, then
3376          sort the array by raw score */
3377 
3378       hsp_array = (SHspWrap *)malloc(hsp_count * sizeof(SHspWrap));
3379 
3380       for (j = k = 0; j < hitlist->hsplist_count; j++) {
3381          BlastHSPList *hsplist = hitlist->hsplist_array[j];
3382          for (m = 0; m < hsplist->hspcnt; k++, m++) {
3383             BlastHSP *hsp = hsplist->hsp_array[m];
3384             hsp_array[k].hsplist = hsplist;
3385             hsp_array[k].hsp = hsp;
3386          }
3387          hsplist->hspcnt = 0;
3388       }
3389 
3390       qsort(hsp_array, hsp_count, sizeof(SHspWrap), s_SortHspWrapRawScore);
3391 
3392       /* Starting with the best HSP, use the interval tree to
3393          check that the query range of each HSP in the list has
3394          not already been enveloped by too many higher-scoring
3395          HSPs. If this is not the case, add the HSP back into results */
3396 
3397       Blast_IntervalTreeReset(tree);
3398 
3399       for (j = 0; j < hsp_count; j++) {
3400          BlastHSPList *hsplist = hsp_array[j].hsplist;
3401          BlastHSP *hsp = hsp_array[j].hsp;
3402 
3403          if (BlastIntervalTreeMasksHSP(tree,
3404                          hsp, query_info, 0, masklevel)) {
3405              Blast_HSPFree(hsp);
3406          }
3407          else {
3408              BlastIntervalTreeAddHSP(hsp, tree, query_info, eQueryOnlyStrandIndifferent);
3409              Blast_HSPListSaveHSP(hsplist, hsp);
3410 
3411              /* the first HSP added back into an HSPList
3412                 automatically has the best e-value */
3413              // RMH: hmmmmmmm
3414              if (hsplist->hspcnt == 1)
3415                  hsplist->best_evalue = hsp->evalue;
3416          }
3417       }
3418       sfree(hsp_array);
3419 
3420       /* remove any HSPLists that are still empty after the
3421          culling process. Sort any remaining lists by score */
3422       for (j = 0; j < hitlist->hsplist_count; j++) {
3423          BlastHSPList *hsplist = hitlist->hsplist_array[j];
3424          if (hsplist->hspcnt == 0) {
3425              hitlist->hsplist_array[j] =
3426                    Blast_HSPListFree(hitlist->hsplist_array[j]);
3427          }
3428          else {
3429              Blast_HSPListSortByScore(hitlist->hsplist_array[j]);
3430          }
3431       }
3432       Blast_HitListPurgeNullHSPLists(hitlist);
3433    }
3434 
3435    tree = Blast_IntervalTreeFree(tree);
3436    return 0;
3437 }
3438 
Blast_HSPResultsInsertHSPList(BlastHSPResults * results,BlastHSPList * hsp_list,Int4 hitlist_size)3439 Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results,
3440         BlastHSPList* hsp_list, Int4 hitlist_size)
3441 {
3442    if (!hsp_list || hsp_list->hspcnt == 0)
3443       return 0;
3444 
3445    ASSERT(hsp_list->query_index < results->num_queries);
3446 
3447    if (!results->hitlist_array[hsp_list->query_index]) {
3448        results->hitlist_array[hsp_list->query_index] =
3449            Blast_HitListNew(hitlist_size);
3450    }
3451    Blast_HitListUpdate(results->hitlist_array[hsp_list->query_index],
3452                        hsp_list);
3453    return 0;
3454 }
3455 
3456 BlastHSPResults**
PHIBlast_HSPResultsSplit(const BlastHSPResults * results,const SPHIQueryInfo * pattern_info)3457 PHIBlast_HSPResultsSplit(const BlastHSPResults* results,
3458                          const SPHIQueryInfo* pattern_info)
3459 {
3460     BlastHSPResults* *phi_results = NULL;
3461     int pattern_index, hit_index;
3462     int num_patterns;
3463     BlastHitList* hit_list = NULL;
3464     BlastHSPList** hsplist_array; /* Temporary per-pattern HSP lists. */
3465 
3466     if (!pattern_info || pattern_info->num_patterns == 0)
3467         return NULL;
3468 
3469     num_patterns = pattern_info->num_patterns;
3470 
3471     phi_results =
3472         (BlastHSPResults**) calloc(num_patterns, sizeof(BlastHSPResults*));
3473 
3474     if (!results || !results->hitlist_array[0])
3475         return phi_results;   /* An empty results set is expected if no hits. */
3476 
3477     hsplist_array = (BlastHSPList**) calloc(num_patterns, sizeof(BlastHSPList*));
3478     hit_list = results->hitlist_array[0];
3479 
3480     for (hit_index = 0; hit_index < hit_list->hsplist_count; ++hit_index) {
3481         BlastHSPList* hsp_list = hit_list->hsplist_array[hit_index];
3482         int hsp_index;
3483         /* Copy HSPs corresponding to different pattern occurrences into
3484            separate HSP lists. */
3485         for (hsp_index = 0; hsp_index < hsp_list->hspcnt; ++hsp_index) {
3486             BlastHSP* hsp = s_BlastHSPCopy(hsp_list->hsp_array[hsp_index]);
3487             pattern_index = hsp->pat_info->index;
3488             if (!hsplist_array[pattern_index])
3489                 hsplist_array[pattern_index] = Blast_HSPListNew(0);
3490             hsplist_array[pattern_index]->oid = hsp_list->oid;
3491             Blast_HSPListSaveHSP(hsplist_array[pattern_index], hsp);
3492         }
3493 
3494         /* Save HSP lists corresponding to different pattern occurrences
3495            in separate results structures. */
3496         for (pattern_index = 0; pattern_index < num_patterns;
3497              ++pattern_index) {
3498             if (hsplist_array[pattern_index]) {
3499                 if (!phi_results[pattern_index])
3500                     phi_results[pattern_index] = Blast_HSPResultsNew(1);
3501                 Blast_HSPResultsInsertHSPList(phi_results[pattern_index],
3502                                               hsplist_array[pattern_index],
3503                                               hit_list->hsplist_max);
3504                 hsplist_array[pattern_index] = NULL;
3505             }
3506         }
3507     }
3508 
3509     sfree(hsplist_array);
3510 
3511     /* Sort HSPLists in each of the results structures by e-value. */
3512     for (pattern_index = 0; pattern_index < num_patterns; ++pattern_index) {
3513         Blast_HSPResultsSortByEvalue(phi_results[pattern_index]);
3514     }
3515 
3516     return phi_results;
3517 }
3518 
3519 BlastHSPResults*
Blast_HSPResultsFromHSPStream(BlastHSPStream * hsp_stream,size_t num_queries,SBlastHitsParameters * bhp)3520 Blast_HSPResultsFromHSPStream(BlastHSPStream* hsp_stream,
3521                               size_t num_queries,
3522                               SBlastHitsParameters* bhp)
3523 {
3524     BlastHSPResults* retval = NULL;
3525     BlastHSPList* hsp_list = NULL;
3526 
3527     retval = Blast_HSPResultsNew((Int4) num_queries);
3528 
3529     while (BlastHSPStreamRead(hsp_stream, &hsp_list) != kBlastHSPStream_Eof) {
3530         Blast_HSPResultsInsertHSPList(retval, hsp_list,
3531                                       bhp->prelim_hitlist_size);
3532     }
3533     SBlastHitsParametersFree(bhp);
3534     return retval;
3535 }
3536 
3537 /** Comparison function for sorting HSP lists in increasing order of the
3538  * number of HSPs in a hit. Needed for s_TrimResultsByTotalHSPLimit below.
3539  * @param v1 Pointer to the first HSP list [in]
3540  * @param v2 Pointer to the second HSP list [in]
3541  */
3542 static int
s_CompareHsplistHspcnt(const void * v1,const void * v2)3543 s_CompareHsplistHspcnt(const void* v1, const void* v2)
3544 {
3545    BlastHSPList* r1 = *((BlastHSPList**) v1);
3546    BlastHSPList* r2 = *((BlastHSPList**) v2);
3547 
3548    if (r1->hspcnt < r2->hspcnt)
3549       return -1;
3550    else if (r1->hspcnt > r2->hspcnt)
3551       return 1;
3552    else
3553       return 0;
3554 }
3555 
3556 /** Removes extra results if a limit is imposed on the total number of HSPs
3557  * returned. If the search involves multiple query sequences, the total HSP
3558  * limit is applied separately to each query.
3559  * The trimming algorithm makes sure that at least 1 HSP is returned for each
3560  * database sequence hit. Suppose results for a given query consist of HSP
3561  * lists for N database sequences, and the limit is T. HSP lists are sorted in
3562  * order of increasing number of HSPs in each list. Then the algorithm proceeds
3563  * by leaving at most i*T/N HSPs for the first i HSP lists, for every
3564  * i = 1, 2, ..., N.
3565  * @param results Results after preliminary stage of a BLAST search [in|out]
3566  * @param total_hsp_limit Limit on total number of HSPs [in]
3567  * @return TRUE if HSP limit has been exceeded, FALSE otherwise.
3568  */
3569 static Boolean
s_TrimResultsByTotalHSPLimit(BlastHSPResults * results,Uint4 total_hsp_limit)3570 s_TrimResultsByTotalHSPLimit(BlastHSPResults* results, Uint4 total_hsp_limit)
3571 {
3572     int query_index;
3573     Boolean hsp_limit_exceeded = FALSE;
3574 
3575     if (total_hsp_limit == 0) {
3576         return hsp_limit_exceeded;
3577     }
3578 
3579     for (query_index = 0; query_index < results->num_queries; ++query_index) {
3580         BlastHitList* hit_list = NULL;
3581         BlastHSPList** hsplist_array = NULL;
3582         Int4 hsplist_count = 0;
3583         int subj_index;
3584 
3585         if ( !(hit_list = results->hitlist_array[query_index]) )
3586             continue;
3587         /* The count of HSPs is separate for each query. */
3588         hsplist_count = hit_list->hsplist_count;
3589 
3590         hsplist_array = (BlastHSPList**)
3591             malloc(hsplist_count*sizeof(BlastHSPList*));
3592 
3593         for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3594             hsplist_array[subj_index] = hit_list->hsplist_array[subj_index];
3595         }
3596 
3597         qsort((void*)hsplist_array, hsplist_count,
3598               sizeof(BlastHSPList*), s_CompareHsplistHspcnt);
3599 
3600         {
3601             Int4 tot_hsps = 0;  /* total number of HSPs */
3602             Uint4 hsp_per_seq = MAX(1, total_hsp_limit/hsplist_count);
3603             for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3604                 Int4 allowed_hsp_num = ((subj_index+1)*hsp_per_seq) - tot_hsps;
3605                 BlastHSPList* hsp_list = hsplist_array[subj_index];
3606                 if (hsp_list->hspcnt > allowed_hsp_num) {
3607                     /* Free the extra HSPs */
3608                     int hsp_index;
3609                     for (hsp_index = allowed_hsp_num;
3610                          hsp_index < hsp_list->hspcnt; ++hsp_index) {
3611                         Blast_HSPFree(hsp_list->hsp_array[hsp_index]);
3612                     }
3613                     hsp_list->hspcnt = allowed_hsp_num;
3614                     hsp_limit_exceeded = TRUE;
3615                 }
3616                 tot_hsps += hsp_list->hspcnt;
3617             }
3618         }
3619         sfree(hsplist_array);
3620     }
3621 
3622     return hsp_limit_exceeded;
3623 }
3624 
3625 typedef struct BlastHSPwOid {
3626 	BlastHSP *  hsp;
3627 	Int4 		oid;
3628 } BlastHSPwOid;
3629 
3630 static int
s_CompareScoreHSPwOid(const void * v1,const void * v2)3631 s_CompareScoreHSPwOid(const void* v1, const void* v2)
3632 {
3633    BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3634    BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3635    return (s_EvalueCompareHSPs(&(r1->hsp), &(r2->hsp)));
3636 }
3637 
3638 static int
s_CompareOidHSPwOid(const void * v1,const void * v2)3639 s_CompareOidHSPwOid(const void* v1, const void* v2)
3640 {
3641    BlastHSPwOid * r1 = (BlastHSPwOid *) v1;
3642    BlastHSPwOid * r2 = (BlastHSPwOid *) v2;
3643    return (r1->oid > r2->oid);
3644 }
3645 
3646 
3647 /* extended version of the above function. Provides information about query number
3648  * which exceeded number of HSP.
3649  * The hsp_limit_exceeded is of results->num_queries size guarantied.
3650  */
3651 static Boolean
s_TrimResultsByTotalHSPLimitEx(BlastHSPResults * results,Uint4 total_hsp_limit,Boolean * hsp_limit_exceeded)3652 s_TrimResultsByTotalHSPLimitEx(BlastHSPResults* results,
3653 	                       Uint4 total_hsp_limit,
3654 			       Boolean *hsp_limit_exceeded)
3655 {
3656     int query_index;
3657     Boolean  any_hsp_limit_exceeded = FALSE;
3658 
3659     if (total_hsp_limit == 0) {
3660         return any_hsp_limit_exceeded;
3661     }
3662 
3663     for (query_index = 0; query_index < results->num_queries; ++query_index) {
3664         BlastHitList* hit_list = NULL;
3665         Int4 hsplist_count = 0;
3666         int subj_index;
3667         Int4 total_hsps = 0;  /* total number of HSPs */
3668 
3669         if( hsp_limit_exceeded) hsp_limit_exceeded[query_index]  = FALSE;
3670 
3671         if ( !(hit_list = results->hitlist_array[query_index]) )
3672             continue;
3673         /* The count of HSPs is separate for each query. */
3674         hsplist_count = hit_list->hsplist_count;
3675 
3676         for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3677         	total_hsps += hit_list->hsplist_array[subj_index]->hspcnt;
3678         }
3679 
3680         if(total_hsps > total_hsp_limit)
3681         {
3682         	 BlastHSPwOid *  everything_list = (BlastHSPwOid *) malloc(total_hsps * sizeof(BlastHSPwOid));
3683         	 BlastHSPList * subj_list;
3684        		 int hsp_counter = 0;
3685        		 int max_hit_list_size = hit_list->hsplist_max;
3686         	 if( hsp_limit_exceeded) {
3687         		 hsp_limit_exceeded[query_index]  = TRUE;
3688         		 any_hsp_limit_exceeded = TRUE;
3689         	 }
3690         	 for (subj_index = 0; subj_index < hsplist_count; ++subj_index) {
3691         		 int subj_hsp;
3692         		 BlastHSP ** hsps_per_subj;
3693         		 subj_list = hit_list->hsplist_array[subj_index];
3694         		 hsps_per_subj = subj_list->hsp_array;
3695         		 for(subj_hsp=0; subj_hsp < subj_list->hspcnt; ++subj_hsp) {
3696         			 everything_list[hsp_counter].hsp = hsps_per_subj[subj_hsp];
3697         			 everything_list[hsp_counter].oid = subj_list->oid;
3698         			 hsps_per_subj[subj_hsp] = NULL;
3699         			 hsp_counter ++;
3700         		 }
3701 
3702         	 }
3703         	 results->hitlist_array[query_index] = Blast_HitListFree(hit_list);
3704         	 qsort((void*)everything_list, total_hsps, sizeof(BlastHSPwOid), s_CompareScoreHSPwOid);
3705 
3706         	 for(hsp_counter = total_hsp_limit; hsp_counter < total_hsps ; ++hsp_counter) {
3707         		 everything_list[hsp_counter].hsp = Blast_HSPFree(everything_list[hsp_counter].hsp);
3708         		 everything_list[hsp_counter].oid = 0x7fffff;
3709         	 }
3710 
3711         	 qsort((void*)everything_list, total_hsp_limit, sizeof(BlastHSPwOid), s_CompareOidHSPwOid);
3712        		 subj_list = NULL;
3713         	 for(hsp_counter = 0; hsp_counter < total_hsp_limit; ++ hsp_counter)
3714         	 {
3715         		 int hsp_counter_start = hsp_counter;
3716         		 int hspcnt;
3717         		 int num_hsp;
3718 
3719         		 while ((everything_list[hsp_counter].oid == everything_list[hsp_counter+1].oid) &&
3720         				 (hsp_counter + 1 < total_hsp_limit)) {
3721         			 hsp_counter ++;
3722         		 }
3723         		 num_hsp = hsp_counter -hsp_counter_start + 1;
3724         		 subj_list = Blast_HSPListNew(num_hsp);
3725         		 subj_list->oid = everything_list[hsp_counter].oid;
3726         		 subj_list->query_index = query_index;
3727 
3728         		 for(hspcnt = 0; hspcnt < num_hsp; ++hspcnt) {
3729         			 Blast_HSPListSaveHSP(subj_list, everything_list[hsp_counter_start + hspcnt].hsp);
3730         		 }
3731 
3732         		 Blast_HSPResultsInsertHSPList(results,subj_list, max_hit_list_size );
3733         	 }
3734         	 free(everything_list);
3735         }
3736     }
3737 
3738     return any_hsp_limit_exceeded;
3739 }
3740 
3741 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3742 Blast_HSPResultsFromHSPStreamWithLimit(BlastHSPStream* hsp_stream,
3743                                    Uint4 num_queries,
3744                                    SBlastHitsParameters* hit_param,
3745                                    Uint4 max_num_hsps,
3746                                    Boolean* removed_hsps)
3747 {
3748     Boolean rm_hsps = FALSE;
3749     BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3750                                                             num_queries,
3751                                                             hit_param);
3752 
3753     rm_hsps = s_TrimResultsByTotalHSPLimit(retval, max_num_hsps);
3754     if (removed_hsps) {
3755         *removed_hsps = rm_hsps;
3756     }
3757     return retval;
3758 }
3759 BlastHSPResults*
Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream * hsp_stream,Uint4 num_queries,SBlastHitsParameters * hit_param,Uint4 max_num_hsps,Boolean * removed_hsps)3760 Blast_HSPResultsFromHSPStreamWithLimitEx(BlastHSPStream* hsp_stream,
3761                                    Uint4 num_queries,
3762                                    SBlastHitsParameters* hit_param,
3763                                    Uint4 max_num_hsps,
3764 				   Boolean *removed_hsps)
3765 {
3766     Boolean rm_hsps = FALSE;
3767     BlastHSPResults* retval = Blast_HSPResultsFromHSPStream(hsp_stream,
3768                                                             num_queries,
3769                                                             hit_param);
3770 
3771     rm_hsps = s_TrimResultsByTotalHSPLimitEx(retval, max_num_hsps,removed_hsps);
3772     if (removed_hsps) {
3773         *removed_hsps = rm_hsps;
3774     }
3775     return retval;
3776 }
3777 
3778 
Blast_HSPChainNew(void)3779 BlastHSPChain* Blast_HSPChainNew(void)
3780 {
3781     BlastHSPChain* chain = calloc(1, sizeof(BlastHSPChain));
3782     return chain;
3783 }
3784 
Blast_HSPChainFree(BlastHSPChain * ch)3785 BlastHSPChain* Blast_HSPChainFree(BlastHSPChain* ch)
3786 {
3787     Int4 i;
3788 
3789     if (!ch) {
3790         return NULL;
3791     }
3792 
3793     for (i = 0;i < ch->num_hsps;i++) {
3794         Blast_HSPFree(ch->hsp_array[i]);
3795     }
3796     sfree(ch->hsp_array);
3797     sfree(ch);
3798 
3799     return NULL;
3800 }
3801 
Blast_MappingResultsNew(void)3802 BlastMappingResults* Blast_MappingResultsNew(void)
3803 {
3804     BlastMappingResults* retval = calloc(1, sizeof(BlastMappingResults));
3805     return retval;
3806 }
3807 
Blast_MappingResultsFree(BlastMappingResults * results)3808 BlastMappingResults* Blast_MappingResultsFree(BlastMappingResults* results)
3809 {
3810     if (!results) {
3811         return NULL;
3812     }
3813 
3814     if (results->chain_array) {
3815         Int4 i;
3816         for (i = 0;i < results->num_results;i++) {
3817             Blast_HSPChainFree(results->chain_array[i]);
3818         }
3819         sfree(results->chain_array);
3820     }
3821     sfree(results);
3822 
3823     return NULL;
3824 }
3825