1 /* $Id: blast_gapalign.c 581681 2019-03-04 21:01:49Z boratyng $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  *
27  */
28 
29 /** @file blast_gapalign.c
30  * Functions to perform gapped alignment
31  */
32 
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_gapalign.h>
35 #include <algo/blast/core/blast_util.h> /* for NCBI2NA_UNPACK_BASE macros */
36 #include <algo/blast/core/greedy_align.h>
37 #include "blast_gapalign_priv.h"
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41 
42 static Int2 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
43    BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
44    const BlastScoringParameters* score_params, BlastInitHSP* init_hsp);
45 static Int4 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
46    Int4* pej, Int4* pei, BlastGapAlignStruct* gap_align,
47    const BlastScoringParameters* score_params, Boolean reverse_sequence,
48    Int4 x_dropoff);
49 
50 static Int2 s_BlastProtGappedAlignment(EBlastProgramType program,
51    BLAST_SequenceBlk* query_in, BLAST_SequenceBlk* subject_in,
52    BlastGapAlignStruct* gap_align,
53    const BlastScoringParameters* score_params,
54    BlastInitHSP* init_hsp, Boolean restricted_alignment,
55    Boolean * fence_hit);
56 
57 /** Lower bound for scores. Divide by two to prevent underflows. */
58 #define MININT INT4_MIN/2
59 
60 /** Minimal size of a chunk for state array allocation. */
61 #define	CHUNKSIZE	2097152
62 
63 /** Retrieve the state structure corresponding to a given length
64  * @param head Pointer to the first element of the state structures
65  *        array [in]
66  * @param length The length for which the state structure has to be
67  *        found [in]
68  * @return The found or created state structure
69  */
70 static GapStateArrayStruct*
s_GapGetState(GapStateArrayStruct ** head,Int4 length)71 s_GapGetState(GapStateArrayStruct** head, Int4 length)
72 
73 {
74    GapStateArrayStruct*	retval,* var,* last;
75    Int4	chunksize = MAX(CHUNKSIZE, length + length/3);
76 
77    length += length/3;	/* Add on about 30% so the end will get reused. */
78    retval = NULL;
79    if (*head == NULL) {
80       retval = (GapStateArrayStruct*)
81          malloc(sizeof(GapStateArrayStruct));
82       retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
83       retval->length = chunksize;
84       retval->used = 0;
85       retval->next = NULL;
86       *head = retval;
87    } else {
88       var = *head;
89       last = *head;
90       while (var) {
91          if (length < (var->length - var->used)) {
92             retval = var;
93             break;
94          } else if (var->used == 0) {
95             /* If it's empty and too small, replace. */
96             sfree(var->state_array);
97             var->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
98             var->length = chunksize;
99             retval = var;
100             break;
101          }
102          last = var;
103          var = var->next;
104       }
105 
106       if (var == NULL)
107       {
108          retval = (GapStateArrayStruct*) malloc(sizeof(GapStateArrayStruct));
109          retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
110          retval->length = chunksize;
111          retval->used = 0;
112          retval->next = NULL;
113          last->next = retval;
114       }
115    }
116 
117 #ifdef ERR_POST_EX_DEFINED
118    if (retval->state_array == NULL)
119       ErrPostEx(SEV_ERROR, 0, 0, "state array is NULL");
120 #endif
121 
122    return retval;
123 
124 }
125 
126 /** Remove a state from a state structure */
127 static Boolean
s_GapPurgeState(GapStateArrayStruct * state_struct)128 s_GapPurgeState(GapStateArrayStruct* state_struct)
129 {
130    while (state_struct)
131    {
132       /*
133 	memset(state_struct->state_array, 0, state_struct->used);
134       */
135       state_struct->used = 0;
136       state_struct = state_struct->next;
137    }
138 
139    return TRUE;
140 }
141 
142 /** Deallocate the memory for greedy gapped alignment */
143 static SGreedyAlignMem*
s_BlastGreedyAlignsFree(SGreedyAlignMem * gamp)144 s_BlastGreedyAlignsFree(SGreedyAlignMem* gamp)
145 {
146    if (gamp->last_seq2_off) {
147       sfree(gamp->last_seq2_off[0]);
148       sfree(gamp->last_seq2_off);
149    } else {
150       if (gamp->last_seq2_off_affine) {
151          sfree(gamp->last_seq2_off_affine[0]);
152          sfree(gamp->last_seq2_off_affine);
153       }
154       sfree(gamp->diag_bounds);
155    }
156    sfree(gamp->max_score);
157    if (gamp->space)
158       MBSpaceFree(gamp->space);
159    sfree(gamp);
160    return NULL;
161 }
162 
163 /** Allocate memory for the greedy gapped alignment algorithm
164  * @param score_params Parameters related to scoring [in]
165  * @param ext_params Options and parameters related to the extension [in]
166  * @param max_d The maximum distance to search [in]
167  * @param Xdrop The Xdrop value [in]
168  * @return The allocated SGreedyAlignMem structure
169  */
170 static SGreedyAlignMem*
s_BlastGreedyAlignMemAlloc(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Int4 max_d,Int4 Xdrop)171 s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params,
172 		       const BlastExtensionParameters* ext_params,
173 		       Int4 max_d, Int4 Xdrop)
174 {
175    SGreedyAlignMem* gamp;
176    Int4 max_d_1, d_diff, max_cost, gd, i;
177    Int4 reward, penalty, gap_open, gap_extend;
178    Int4 Mis_cost, GE_cost;
179 
180    if (score_params == NULL || (!ext_params && !Xdrop))
181       return NULL;
182 
183    if (score_params->reward % 2 == 1) {
184       reward = 2*score_params->reward;
185       penalty = -2*score_params->penalty;
186       if (!Xdrop)
187           Xdrop = 2*MAX(ext_params->gap_x_dropoff,
188                     ext_params->gap_x_dropoff_final);
189       gap_open = 2*score_params->gap_open;
190       gap_extend = 2*score_params->gap_extend;
191    } else {
192       reward = score_params->reward;
193       penalty = -score_params->penalty;
194       if (!Xdrop)
195           Xdrop = MAX(ext_params->gap_x_dropoff,
196                   ext_params->gap_x_dropoff_final);
197       gap_open = score_params->gap_open;
198       gap_extend = score_params->gap_extend;
199    }
200 
201    if (gap_open == 0 && gap_extend == 0)
202       gap_extend = reward / 2 + penalty;
203 
204    gamp = (SGreedyAlignMem*) calloc(1, sizeof(SGreedyAlignMem));
205    gamp->max_dist = max_d;
206    gamp->xdrop = Xdrop;
207 
208    if (score_params->gap_open==0 && score_params->gap_extend==0) {
209       d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
210 
211       gamp->last_seq2_off = (Int4**) malloc((max_d + 2) * sizeof(Int4*));
212       if (gamp->last_seq2_off == NULL) {
213          sfree(gamp);
214          return NULL;
215       }
216       gamp->last_seq2_off[0] =
217          (Int4*) malloc((max_d + max_d + 6) * sizeof(Int4) * 2);
218       if (gamp->last_seq2_off[0] == NULL) {
219 #ifdef ERR_POST_EX_DEFINED
220 	 ErrPostEx(SEV_WARNING, 0, 0,
221               "Failed to allocate %ld bytes for greedy alignment",
222               (max_d + max_d + 6) * sizeof(Int4) * 2);
223 #endif
224          s_BlastGreedyAlignsFree(gamp);
225          return NULL;
226       }
227 
228       gamp->last_seq2_off[1] = gamp->last_seq2_off[0] + max_d + max_d + 6;
229       gamp->last_seq2_off_affine = NULL;
230       gamp->diag_bounds = NULL;
231    } else {
232       gamp->last_seq2_off = NULL;
233       Mis_cost = reward + penalty;
234       GE_cost = gap_extend+reward/2;
235       max_d_1 = max_d;
236       max_d *= GE_cost;
237       max_cost = MAX(Mis_cost, gap_open+GE_cost);
238       gd = BLAST_Gdb3(&Mis_cost, &gap_open, &GE_cost);
239       d_diff = (Xdrop+reward/2)/gd+1;
240       gamp->diag_bounds = (Int4*) calloc(2*(max_d+1+max_cost), sizeof(Int4));
241       gamp->last_seq2_off_affine = (SGreedyOffset**)
242 	 malloc((MAX(max_d, max_cost) + 2) * sizeof(SGreedyOffset*));
243       if (!gamp->diag_bounds || !gamp->last_seq2_off_affine) {
244          s_BlastGreedyAlignsFree(gamp);
245          return NULL;
246       }
247       gamp->last_seq2_off_affine[0] = (SGreedyOffset*)
248 	 calloc((2*max_d_1 + 6) , sizeof(SGreedyOffset) * (max_cost+1));
249       for (i = 1; i <= max_cost; i++)
250 	 gamp->last_seq2_off_affine[i] =
251 	    gamp->last_seq2_off_affine[i-1] + 2*max_d_1 + 6;
252       if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0]) {
253          s_BlastGreedyAlignsFree(gamp);
254          return NULL;
255       }
256    }
257    gamp->max_score = (Int4*) malloc(sizeof(Int4) * (max_d + 1 + d_diff));
258 
259    gamp->space = MBSpaceNew(0);
260    if (!gamp->max_score || !gamp->space) {
261        /* Failure in one of the memory allocations */
262        s_BlastGreedyAlignsFree(gamp);
263        return NULL;
264    }
265 
266    return gamp;
267 }
268 
269 /* Documented in blast_gapalign.h */
270 BlastGapAlignStruct*
BLAST_GapAlignStructFree(BlastGapAlignStruct * gap_align)271 BLAST_GapAlignStructFree(BlastGapAlignStruct* gap_align)
272 {
273    if (!gap_align)
274       return NULL;
275 
276    GapEditScriptDelete(gap_align->edit_script);
277    GapPrelimEditBlockFree(gap_align->fwd_prelim_tback);
278    GapPrelimEditBlockFree(gap_align->rev_prelim_tback);
279    if (gap_align->greedy_align_mem)
280       s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
281    GapStateFree(gap_align->state_struct);
282    sfree(gap_align->dp_mem);
283    JumperGapAlignFree(gap_align->jumper);
284 
285    sfree(gap_align);
286    return NULL;
287 }
288 
289 /* Documented in blast_gapalign.h */
290 Int2
BLAST_GapAlignStructNew(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Uint4 max_subject_length,BlastScoreBlk * sbp,BlastGapAlignStruct ** gap_align_ptr)291 BLAST_GapAlignStructNew(const BlastScoringParameters* score_params,
292    const BlastExtensionParameters* ext_params,
293    Uint4 max_subject_length,
294    BlastScoreBlk* sbp, BlastGapAlignStruct** gap_align_ptr)
295 {
296    Int2 status = 0;
297    BlastGapAlignStruct* gap_align;
298 
299    if (!gap_align_ptr || !sbp || !score_params || !ext_params)
300       return -1;
301 
302    gap_align = (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
303 
304    *gap_align_ptr = gap_align;
305 
306    gap_align->sbp = sbp;
307 
308    gap_align->gap_x_dropoff = ext_params->gap_x_dropoff;
309    gap_align->max_mismatches = ext_params->options->max_mismatches;
310    gap_align->mismatch_window = ext_params->options->mismatch_window;
311 
312    /* allocate memory either for dynamic programming or jumper */
313    if (ext_params->options->ePrelimGapExt != eJumperWithTraceback) {
314        if (ext_params->options->ePrelimGapExt == eDynProgScoreOnly) {
315 	   /* allocate structures for ordinary dynamic programming */
316 	   gap_align->dp_mem_alloc = 1000;
317 	   gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
318 						    sizeof(BlastGapDP));
319 	   if (!gap_align->dp_mem)
320 	       gap_align = BLAST_GapAlignStructFree(gap_align);
321        }
322        else {
323 	   /* allocate structures for greedy dynamic programming */
324 	   max_subject_length = MIN(max_subject_length, MAX_DBSEQ_LEN);
325 	   max_subject_length = MIN(GREEDY_MAX_COST,
326 			    max_subject_length / GREEDY_MAX_COST_FRACTION + 1);
327 	   gap_align->greedy_align_mem =
328 	       s_BlastGreedyAlignMemAlloc(score_params, ext_params,
329 					  max_subject_length, 0);
330 	   if (!gap_align->greedy_align_mem)
331 	       gap_align = BLAST_GapAlignStructFree(gap_align);
332        }
333    }
334    else {
335        gap_align->jumper = JumperGapAlignNew(200);
336        if (ext_params->gap_x_dropoff == 0) {
337            gap_align->gap_x_dropoff = 3 * MAX(-score_params->penalty,
338                                               score_params->gap_open +
339                                               score_params->gap_extend);
340        }
341    }
342 
343    if (!gap_align)
344       return -1;
345 
346    gap_align->positionBased = (sbp->psi_matrix != NULL);
347 
348    gap_align->fwd_prelim_tback = GapPrelimEditBlockNew();
349    gap_align->rev_prelim_tback = GapPrelimEditBlockNew();
350 
351    return status;
352 }
353 
354 /** Values for the editing script operations in traceback */
355 enum {
356     SCRIPT_SUB           = eGapAlignSub,     /**< Substitution */
357     SCRIPT_GAP_IN_A      = eGapAlignDel,     /**< Deletion */
358     SCRIPT_GAP_IN_B      = eGapAlignIns,     /**< Insertion */
359     SCRIPT_OP_MASK       = 0x07, /**< Mask for edit script operations */
360 
361     SCRIPT_EXTEND_GAP_A  = 0x10, /**< continue a gap in A */
362     SCRIPT_EXTEND_GAP_B  = 0x40  /**< continue a gap in B */
363 };
364 
365 Int4
ALIGN_EX(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)366 ALIGN_EX(const Uint1* A, const Uint1* B, Int4 M, Int4 N, Int4* a_offset,
367 	Int4* b_offset, GapPrelimEditBlock *edit_block,
368         BlastGapAlignStruct* gap_align,
369         const BlastScoringParameters* score_params, Int4 query_offset,
370         Boolean reversed, Boolean reverse_sequence,
371         Boolean * fence_hit)
372 {
373     /* See Blast_SemiGappedAlign for more general comments on
374        what this code is doing; comments in this function
375        only apply to the traceback computations */
376 
377     Int4 i;
378     Int4 a_index;
379     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
380     const Uint1* b_ptr;
381 
382     BlastGapDP* score_array;
383 
384     Int4 gap_open;
385     Int4 gap_extend;
386     Int4 gap_open_extend;
387     Int4 x_dropoff;
388     Int4 best_score;
389 
390     Int4** matrix = NULL;
391     Int4** pssm = NULL;
392     Int4* matrix_row = NULL;
393 
394     Int4 score;
395     Int4 score_gap_row;
396     Int4 score_gap_col;
397     Int4 next_score;
398 
399     GapStateArrayStruct* state_struct;
400     Uint1* edit_script_row;
401     Uint1** edit_script;
402     Int4 *edit_start_offset;
403     Int4 edit_script_num_rows;
404     Int4 orig_b_index;
405     Uint1 script, next_script, script_row, script_col;
406     Int4 num_extra_cells;
407 
408     matrix = gap_align->sbp->matrix->data;
409     if (gap_align->positionBased) {
410         pssm = gap_align->sbp->psi_matrix->pssm->data;
411     }
412 
413     *a_offset = 0;
414     *b_offset = 0;
415     gap_open = score_params->gap_open;
416     gap_extend = score_params->gap_extend;
417     gap_open_extend = gap_open + gap_extend;
418     x_dropoff = gap_align->gap_x_dropoff;
419 
420     if (x_dropoff < gap_open_extend)
421         x_dropoff = gap_open_extend;
422 
423     if(N <= 0 || M <= 0)
424         return 0;
425 
426     /* Initialize traceback information. edit_script[] is
427        a 2-D array which is filled in row by row as the
428        dynamic programming takes place */
429 
430     s_GapPurgeState(gap_align->state_struct);
431 
432     /* Conceptually, traceback requires a byte array of size
433        MxN. To save on memory, each row of the array is allocated
434        separately. edit_script[i] points to the storage reserved
435        for row i, and edit_start_offset[i] gives the offset into
436        the B sequence corresponding to element 0 of edit_script[i].
437 
438        Also make the number of edit script rows grow dynamically */
439 
440     edit_script_num_rows = 100;
441     edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
442     edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
443 
444     /* allocate storage for the first row of the traceback
445        array. Because row elements correspond to gaps in A,
446        the alignment can only go x_dropoff/gap_extend positions
447        at most before failing the X dropoff criterion */
448 
449     if (gap_extend > 0)
450         num_extra_cells = x_dropoff / gap_extend + 3;
451     else
452         num_extra_cells = N + 3;
453 
454     if (num_extra_cells > gap_align->dp_mem_alloc) {
455         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
456                                       2 * gap_align->dp_mem_alloc);
457         sfree(gap_align->dp_mem);
458         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
459                                                   sizeof(BlastGapDP));
460     }
461 
462     state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
463 
464     edit_script[0] = state_struct->state_array;
465     edit_start_offset[0] = 0;
466     edit_script_row = state_struct->state_array;
467 
468     score = -gap_open_extend;
469     score_array = gap_align->dp_mem;
470     score_array[0].best = 0;
471     score_array[0].best_gap = -gap_open_extend;
472 
473     for (i = 1; i <= N; i++) {
474         if (score < -x_dropoff)
475             break;
476 
477         score_array[i].best = score;
478         score_array[i].best_gap = score - gap_open_extend;
479         score -= gap_extend;
480         edit_script_row[i] = SCRIPT_GAP_IN_A;
481     }
482     state_struct->used = i + 1;
483 
484     b_size = i;
485     best_score = 0;
486     first_b_index = 0;
487     if (reverse_sequence)
488         b_increment = -1;
489     else
490         b_increment = 1;
491 
492     for (a_index = 1; a_index <= M; a_index++) {
493 
494         /* Set up the next row of the edit script; this involves
495            allocating memory for the row, then pointing to it.
496            It is not necessary to allocate space for offsets less
497            than first_b_index (since the inner loop will never
498            look at them);
499 
500            It is unknown at this point how far to the right the
501            current traceback row will extend; all that's known for
502            sure is that the previous row fails the X-dropoff test
503            after b_size cells, and that the current row can go at
504            most num_extra_cells beyond that before failing the test */
505 
506         if (gap_extend > 0)
507             state_struct = s_GapGetState(&gap_align->state_struct,
508                            b_size - first_b_index + num_extra_cells);
509         else
510             state_struct = s_GapGetState(&gap_align->state_struct,
511                                         N + 3 - first_b_index);
512 
513         if (a_index == edit_script_num_rows) {
514             edit_script_num_rows = edit_script_num_rows * 2;
515             edit_script = (Uint1 **)realloc(edit_script,
516                                             edit_script_num_rows *
517                                             sizeof(Uint1 *));
518             edit_start_offset = (Int4 *)realloc(edit_start_offset,
519                                                 edit_script_num_rows *
520                                                 sizeof(Int4));
521         }
522 
523         edit_script[a_index] = state_struct->state_array +
524                                 state_struct->used + 1;
525         edit_start_offset[a_index] = first_b_index;
526 
527         /* the inner loop assumes the current traceback
528            row begins at offset zero of B */
529 
530         edit_script_row = edit_script[a_index] - first_b_index;
531         orig_b_index = first_b_index;
532 
533         if (!(gap_align->positionBased)) {
534             if(reverse_sequence)
535                 matrix_row = matrix[ A[ M - a_index ] ];
536             else
537                 matrix_row = matrix[ A[ a_index ] ];
538         }
539         else {
540             if(reversed || reverse_sequence)
541                 matrix_row = pssm[M - a_index];
542             else
543                 matrix_row = pssm[a_index + query_offset];
544         }
545 
546         if(reverse_sequence)
547             b_ptr = &B[N - first_b_index];
548         else
549             b_ptr = &B[first_b_index];
550 
551         score = MININT;
552         score_gap_row = MININT;
553         last_b_index = first_b_index;
554 
555         for (b_index = first_b_index; b_index < b_size; b_index++) {
556             int matrix_index = 0;
557 
558             b_ptr += b_increment;
559             score_gap_col = score_array[b_index].best_gap;
560 
561             matrix_index = *b_ptr;
562 
563             if (matrix_index == FENCE_SENTRY) {
564                 if (fence_hit) {
565                     *fence_hit = 1;
566                 }
567                 break;
568             }
569 
570             next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
571 
572             /* script, script_row and script_col contain the
573                actions specified by the dynamic programming.
574                when the inner loop has finished, 'script' con-
575                tains all of the actions to perform, and is
576                written to edit_script[a_index][b_index]. Otherwise,
577                this inner loop is exactly the same as the one
578                in Blast_SemiGappedAlign() */
579 
580             script = SCRIPT_SUB;
581             script_col = SCRIPT_EXTEND_GAP_B;
582             script_row = SCRIPT_EXTEND_GAP_A;
583 
584             if (score < score_gap_col) {
585                 script = SCRIPT_GAP_IN_B;
586                 score = score_gap_col;
587             }
588             if (score < score_gap_row) {
589                 script = SCRIPT_GAP_IN_A;
590                 score = score_gap_row;
591             }
592 
593             if (best_score - score > x_dropoff) {
594 
595                 if (first_b_index == b_index)
596                     first_b_index++;
597                 else
598                     score_array[b_index].best = MININT;
599             }
600             else {
601                 last_b_index = b_index;
602                 if (score > best_score) {
603                     best_score = score;
604                     *a_offset = a_index;
605                     *b_offset = b_index;
606                 }
607 
608                 score_gap_row -= gap_extend;
609                 score_gap_col -= gap_extend;
610                 if (score_gap_col < (score - gap_open_extend)) {
611                     score_array[b_index].best_gap = score - gap_open_extend;
612                 }
613                 else {
614                     score_array[b_index].best_gap = score_gap_col;
615                     script += script_col;
616                 }
617 
618                 if (score_gap_row < (score - gap_open_extend))
619                     score_gap_row = score - gap_open_extend;
620                 else
621                     script += script_row;
622 
623                 score_array[b_index].best = score;
624             }
625 
626             score = next_score;
627             edit_script_row[b_index] = script;
628         }
629 
630         if (first_b_index == b_size || (fence_hit && *fence_hit))
631             break;
632 
633         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
634 
635             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
636                                           2 * gap_align->dp_mem_alloc);
637             score_array = (BlastGapDP *)realloc(score_array,
638                                                gap_align->dp_mem_alloc *
639                                                sizeof(BlastGapDP));
640             gap_align->dp_mem = score_array;
641         }
642 
643 
644         if (last_b_index < b_size - 1) {
645             b_size = last_b_index + 1;
646         }
647         else {
648             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
649 
650                 score_array[b_size].best = score_gap_row;
651                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
652                 score_gap_row -= gap_extend;
653                 edit_script_row[b_size] = SCRIPT_GAP_IN_A;
654                 b_size++;
655             }
656         }
657 
658         /* update the memory allocator to reflect the exact number
659            of traceback cells this row needed */
660 
661         state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
662 
663         if (b_size <= N) {
664             score_array[b_size].best = MININT;
665             score_array[b_size].best_gap = MININT;
666             b_size++;
667         }
668     }
669 
670     /* Pick the optimal path through the now complete
671        edit_script[][]. This is equivalent to flattening
672        the 2-D array into a 1-D list of actions. */
673 
674     a_index = *a_offset;
675     b_index = *b_offset;
676     script = SCRIPT_SUB;
677 
678     if (fence_hit && *fence_hit)
679         goto done;
680 
681     while (a_index > 0 || b_index > 0) {
682         /* Retrieve the next action to perform. Rows of
683            the traceback array do not necessarily start
684            at offset zero of B, so a correction is needed
685            to point to the correct position */
686 
687         next_script =
688             edit_script[a_index][b_index - edit_start_offset[a_index]];
689 
690         switch(script) {
691         case SCRIPT_GAP_IN_A:
692             script = next_script & SCRIPT_OP_MASK;
693             if (next_script & SCRIPT_EXTEND_GAP_A)
694                 script = SCRIPT_GAP_IN_A;
695             break;
696 
697         case SCRIPT_GAP_IN_B:
698             script = next_script & SCRIPT_OP_MASK;
699             if (next_script & SCRIPT_EXTEND_GAP_B)
700                 script = SCRIPT_GAP_IN_B;
701             break;
702 
703         default:
704             script = next_script & SCRIPT_OP_MASK;
705             break;
706         }
707 
708         if (script == SCRIPT_GAP_IN_A) {
709             b_index--;
710         }
711         else if (script == SCRIPT_GAP_IN_B) {
712             a_index--;
713         }
714         else {
715             a_index--;
716             b_index--;
717         }
718         GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
719     }
720 
721  done:
722     sfree(edit_start_offset);
723     sfree(edit_script);
724     return best_score;
725 }
726 
727 Int4
Blast_SemiGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)728 Blast_SemiGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
729    Int4* a_offset, Int4* b_offset, Boolean score_only,
730    GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
731    const BlastScoringParameters* score_params,
732    Int4 query_offset, Boolean reversed, Boolean reverse_sequence,
733    Boolean * fence_hit)
734 {
735     Int4 i;                     /* sequence pointers and indices */
736     Int4 a_index;
737     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
738     const Uint1* b_ptr;
739 
740     BlastGapDP* score_array;
741 
742     Int4 gap_open;              /* alignment penalty variables */
743     Int4 gap_extend;
744     Int4 gap_open_extend;
745     Int4 x_dropoff;
746 
747     Int4** matrix = NULL;       /* pointers to the score matrix */
748     Int4** pssm = NULL;
749     Int4* matrix_row = NULL;
750 
751     Int4 score;                 /* score tracking variables */
752     Int4 score_gap_row;
753     Int4 score_gap_col;
754     Int4 next_score;
755     Int4 best_score;
756     Int4 num_extra_cells;
757 
758     if (!score_only) {
759         return ALIGN_EX(A, B, M, N, a_offset, b_offset, edit_block, gap_align,
760                         score_params, query_offset, reversed, reverse_sequence,
761                         fence_hit);
762     }
763 
764     /* do initialization and sanity-checking */
765 
766     matrix = gap_align->sbp->matrix->data;
767     if (gap_align->positionBased) {
768         pssm = gap_align->sbp->psi_matrix->pssm->data;
769     }
770     *a_offset = 0;
771     *b_offset = 0;
772     gap_open = score_params->gap_open;
773     gap_extend = score_params->gap_extend;
774     gap_open_extend = gap_open + gap_extend;
775     x_dropoff = gap_align->gap_x_dropoff;
776 
777     if (x_dropoff < gap_open_extend)
778         x_dropoff = gap_open_extend;
779 
780     if(N <= 0 || M <= 0)
781         return 0;
782 
783     /* Allocate and fill in the auxiliary bookeeping structures.
784        Since A and B could be very large, maintain a window
785        of auxiliary structures only large enough to contain to current
786        set of DP computations. The initial window size is determined
787        by the number of cells needed to fail the x-dropoff test */
788 
789     if (gap_extend > 0)
790         num_extra_cells = x_dropoff / gap_extend + 3;
791     else
792         num_extra_cells = N + 3;
793 
794     if (num_extra_cells > gap_align->dp_mem_alloc) {
795         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
796                                       2 * gap_align->dp_mem_alloc);
797         sfree(gap_align->dp_mem);
798         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
799                                                   sizeof(BlastGapDP));
800     }
801 
802     score_array = gap_align->dp_mem;
803     score = -gap_open_extend;
804     score_array[0].best = 0;
805     score_array[0].best_gap = -gap_open_extend;
806 
807     for (i = 1; i <= N; i++) {
808         if (score < -x_dropoff)
809             break;
810 
811         score_array[i].best = score;
812         score_array[i].best_gap = score - gap_open_extend;
813         score -= gap_extend;
814     }
815 
816     /* The inner loop below examines letters of B from
817        index 'first_b_index' to 'b_size' */
818 
819     b_size = i;
820     best_score = 0;
821     first_b_index = 0;
822     if (reverse_sequence)
823         b_increment = -1;
824     else
825         b_increment = 1;
826 
827     for (a_index = 1; a_index <= M; a_index++) {
828         /* pick out the row of the score matrix
829            appropriate for A[a_index] */
830 
831         if (!(gap_align->positionBased)) {
832             if(reverse_sequence)
833                 matrix_row = matrix[ A[ M - a_index ] ];
834             else
835                 matrix_row = matrix[ A[ a_index ] ];
836         }
837         else {
838             if(reversed || reverse_sequence)
839                 matrix_row = pssm[M - a_index];
840             else
841                 matrix_row = pssm[a_index + query_offset];
842         }
843 
844         if(reverse_sequence)
845             b_ptr = &B[N - first_b_index];
846         else
847             b_ptr = &B[first_b_index];
848 
849         /* initialize running-score variables */
850         score = MININT;
851         score_gap_row = MININT;
852         last_b_index = first_b_index;
853 
854         for (b_index = first_b_index; b_index < b_size; b_index++) {
855 
856             b_ptr += b_increment;
857             score_gap_col = score_array[b_index].best_gap;
858             next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
859 
860             if (score < score_gap_col)
861                 score = score_gap_col;
862 
863             if (score < score_gap_row)
864                 score = score_gap_row;
865 
866             if (best_score - score > x_dropoff) {
867 
868                 /* the current best score failed the X-dropoff
869                    criterion. Note that this does not stop the
870                    inner loop, only forces future iterations to
871                    skip this column of B.
872 
873                    Also, if the very first letter of B that was
874                    tested failed the X dropoff criterion, make
875                    sure future inner loops start one letter to
876                    the right */
877 
878                 if (b_index == first_b_index)
879                     first_b_index++;
880                 else
881                     score_array[b_index].best = MININT;
882             }
883             else {
884                 last_b_index = b_index;
885                 if (score > best_score) {
886                     best_score = score;
887                     *a_offset = a_index;
888                     *b_offset = b_index;
889                 }
890 
891                 /* If starting a gap at this position will improve
892                    the best row, or column, score, update them to
893                    reflect that. */
894 
895                 score_gap_row -= gap_extend;
896                 score_gap_col -= gap_extend;
897                 score_array[b_index].best_gap = MAX(score - gap_open_extend,
898                                                     score_gap_col);
899                 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
900                 score_array[b_index].best = score;
901             }
902 
903             score = next_score;
904         }
905 
906         /* Finish aligning if the best scores for all positions
907            of B will fail the X-dropoff test, i.e. the inner loop
908            bounds have converged to each other */
909 
910         if (first_b_index == b_size)
911             break;
912 
913         /* enlarge the window for score data if necessary */
914 
915         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
916 
917             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
918                                           2 * gap_align->dp_mem_alloc);
919             score_array = (BlastGapDP *)realloc(score_array,
920                                                gap_align->dp_mem_alloc *
921                                                sizeof(BlastGapDP));
922             gap_align->dp_mem = score_array;
923         }
924 
925         if (last_b_index < b_size - 1) {
926             /* This row failed the X-dropoff test earlier than
927                the last row did; just shorten the loop bounds
928                before doing the next row */
929 
930             b_size = last_b_index + 1;
931         }
932         else {
933             /* The inner loop finished without failing the X-dropoff
934                test; initialize extra bookkeeping structures until
935                the X dropoff test fails or we run out of letters in B.
936                The next inner loop will have larger bounds */
937 
938             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
939                 score_array[b_size].best = score_gap_row;
940                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
941                 score_gap_row -= gap_extend;
942                 b_size++;
943             }
944         }
945 
946         if (b_size <= N) {
947             score_array[b_size].best = MININT;
948             score_array[b_size].best_gap = MININT;
949             b_size++;
950         }
951     }
952 
953     return best_score;
954 }
955 
956 /** For restricted gapped alignment, gaps may only start once in
957     this many sequence offsets */
958 #define RESTRICT_SIZE 10
959 
960 /** Low level function to perform score-only gapped extension
961  *  in one direction. Gaps in one sequence or the other are only
962  *  allowed to start at sequence offsets that are a multiple of
963  *  RESTRICT_SIZE from the alignment start point. For more details see
964  * <PRE>
965  * Michael Cameron, Hugh E. Williams, Adam Cannane, "Improved
966  * Gapped Alignment in BLAST". IEEE/ACM Transactions on Computational
967  * Biology and Bioinformatics, vol 1 #3 (2004) pp 116-129
968  * </PRE>
969  * @param A The query sequence [in]
970  * @param B The subject sequence [in]
971  * @param M Maximal extension length in query [in]
972  * @param N Maximal extension length in subject [in]
973  * @param a_offset Resulting starting offset in query [out]
974  * @param b_offset Resulting starting offset in subject [out]
975  * @param gap_align Structure holding various information and allocated
976  *        memory for the gapped alignment [in]
977  * @param score_params Parameters related to scoring [in]
978  * @param query_offset The starting offset in query [in]
979  * @param reverse_sequence Do reverse the sequence [in]
980  * @return The best alignment score found.
981  */
982 static Int4
s_RestrictedGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reverse_sequence)983 s_RestrictedGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
984    Int4* a_offset, Int4* b_offset,
985    BlastGapAlignStruct* gap_align,
986    const BlastScoringParameters* score_params,
987    Int4 query_offset, Boolean reverse_sequence)
988 {
989     /* see Blast_SemiGappedAlign for general details */
990 
991     Int4 i;
992     Int4 a_index;
993     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
994     const Uint1* b_ptr;
995     Int4 b_gap;
996 
997     BlastGapDP* score_array;
998 
999     Int4 gap_open;
1000     Int4 gap_extend;
1001     Int4 gap_open_extend;
1002     Int4 x_dropoff;
1003 
1004     Int4** matrix = NULL;
1005     Int4** pssm = NULL;
1006     Int4* matrix_row = NULL;
1007 
1008     Int4 score;
1009     Int4 score_gap_row;
1010     Int4 score_gap_col;
1011     Int4 next_score;
1012     Int4 best_score;
1013     Int4 num_extra_cells;
1014 
1015     matrix = gap_align->sbp->matrix->data;
1016     if (gap_align->positionBased) {
1017         pssm = gap_align->sbp->psi_matrix->pssm->data;
1018     }
1019     *a_offset = 0;
1020     *b_offset = 0;
1021     gap_open = score_params->gap_open;
1022     gap_extend = score_params->gap_extend;
1023     gap_open_extend = gap_open + gap_extend;
1024     x_dropoff = gap_align->gap_x_dropoff;
1025 
1026     if (x_dropoff < gap_open_extend)
1027         x_dropoff = gap_open_extend;
1028 
1029     if(N <= 0 || M <= 0)
1030         return 0;
1031 
1032     if (gap_extend > 0)
1033         num_extra_cells = x_dropoff / gap_extend + 3;
1034     else
1035         num_extra_cells = N + 3;
1036 
1037     if (num_extra_cells > gap_align->dp_mem_alloc) {
1038         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1039                                       2 * gap_align->dp_mem_alloc);
1040         sfree(gap_align->dp_mem);
1041         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1042                                                   sizeof(BlastGapDP));
1043     }
1044 
1045     score_array = gap_align->dp_mem;
1046     score = -gap_open_extend;
1047     score_array[0].best = 0;
1048     score_array[0].best_gap = -gap_open_extend;
1049 
1050     for (i = 1; i <= N; i++) {
1051         if (score < -x_dropoff)
1052             break;
1053 
1054         score_array[i].best = score;
1055         score_array[i].best_gap = score - gap_open_extend;
1056         score -= gap_extend;
1057     }
1058 
1059     b_size = i;
1060     best_score = 0;
1061     first_b_index = 0;
1062     b_gap = 0;
1063     if (reverse_sequence)
1064         b_increment = -1;
1065     else
1066         b_increment = 1;
1067 
1068     for (a_index = 1; a_index <= M; a_index++) {
1069 
1070         if (!(gap_align->positionBased)) {
1071             if(reverse_sequence)
1072                 matrix_row = matrix[ A[ M - a_index ] ];
1073             else
1074                 matrix_row = matrix[ A[ a_index ] ];
1075         }
1076         else {
1077             if(reverse_sequence)
1078                 matrix_row = pssm[M - a_index];
1079             else
1080                 matrix_row = pssm[a_index + query_offset];
1081         }
1082 
1083         if(reverse_sequence)
1084             b_ptr = &B[N - first_b_index];
1085         else
1086             b_ptr = &B[first_b_index];
1087 
1088         score = MININT;
1089         score_gap_row = MININT;
1090         last_b_index = first_b_index;
1091 
1092         /* The double loop that computes the alignment is essentially
1093            unchanged from that of Blast_SemiGappedAlign. The only
1094            real difference is that a gap in A or B is not allowed to
1095            start unless the offset of the gap is divisible by
1096            RESTRICT_SIZE. This allows the ordinary dynamic programming
1097            recurrence relations to be simplified */
1098 
1099         if (a_index % RESTRICT_SIZE != 0) {
1100 
1101             /* a gap will never start in A; do not bother checking
1102                or updating score_gap_row */
1103 
1104             for (b_index = first_b_index; b_index < b_size; b_index++) {
1105 
1106                 b_ptr += b_increment;
1107                 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1108 
1109                 if (b_index != b_gap) {
1110 
1111                     /* the majority of cases fall here; a gap
1112                        may not start in either A or B */
1113 
1114                     if (best_score - score > x_dropoff) {
1115                         score_array[b_index].best = MININT;
1116                         if (b_index == first_b_index)
1117                             first_b_index++;
1118                     }
1119                     else {
1120                         last_b_index = b_index;
1121                         if (score > best_score) {
1122                             best_score = score;
1123                             *a_offset = a_index;
1124                             *b_offset = b_index;
1125                         }
1126                         score_array[b_index].best = score;
1127                     }
1128                 }
1129                 else {
1130 
1131                     /* a gap may start in B. Update b_gap, the
1132                        offset when this will next happen, and
1133                        compute the two-term recurrence */
1134 
1135                     b_gap += RESTRICT_SIZE;
1136                     score_gap_col = score_array[b_index].best_gap;
1137 
1138                     if (score < score_gap_col)
1139                         score = score_gap_col;
1140 
1141                     if (best_score - score > x_dropoff) {
1142                         score_array[b_index].best = MININT;
1143                         if (b_index == first_b_index)
1144                             first_b_index++;
1145                     }
1146                     else {
1147                         last_b_index = b_index;
1148                         if (score > best_score) {
1149                             best_score = score;
1150                             *a_offset = a_index;
1151                             *b_offset = b_index;
1152                         }
1153 
1154                         score_gap_col -= gap_extend;
1155                         score_array[b_index].best_gap =
1156                                    MAX(score - gap_open_extend, score_gap_col);
1157                         score_array[b_index].best = score;
1158                     }
1159                 }
1160                 score = next_score;
1161             }
1162 
1163             score_gap_row = score;
1164         }
1165         else {
1166 
1167             /* gap may start in A */
1168 
1169             for (b_index = first_b_index; b_index < b_size; b_index++) {
1170 
1171                 b_ptr += b_increment;
1172                 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1173 
1174                 if (b_index != b_gap) {
1175 
1176                     /* gap may not start in B. Compute
1177                        the resulting two-term recurrence */
1178 
1179                     if (score < score_gap_row)
1180                         score = score_gap_row;
1181 
1182                     if (best_score - score > x_dropoff) {
1183                         score_array[b_index].best = MININT;
1184                         if (b_index == first_b_index)
1185                             first_b_index++;
1186                     }
1187                     else {
1188                         last_b_index = b_index;
1189                         if (score > best_score) {
1190                             best_score = score;
1191                             *a_offset = a_index;
1192                             *b_offset = b_index;
1193                         }
1194 
1195                         score_gap_row -= gap_extend;
1196                         score_gap_row = MAX(score - gap_open_extend,
1197                                             score_gap_row);
1198                         score_array[b_index].best = score;
1199                     }
1200                 }
1201                 else {
1202 
1203                     /* the ordinary case: a gap may start in A
1204                        or B and the full three-term recurrence
1205                        must be computed. This happens once every
1206                        RESTRICT_SIZE*RESTRICT_SIZE cells */
1207 
1208                     b_gap += RESTRICT_SIZE;
1209                     score_gap_col = score_array[b_index].best_gap;
1210 
1211                     if (score < score_gap_col)
1212                         score = score_gap_col;
1213                     if (score < score_gap_row)
1214                         score = score_gap_row;
1215 
1216                     if (best_score - score > x_dropoff) {
1217                         score_array[b_index].best = MININT;
1218                         if (b_index == first_b_index)
1219                             first_b_index++;
1220                     }
1221                     else {
1222                         last_b_index = b_index;
1223                         if (score > best_score) {
1224                             best_score = score;
1225                             *a_offset = a_index;
1226                             *b_offset = b_index;
1227                         }
1228 
1229                         score_gap_row -= gap_extend;
1230                         score_gap_col -= gap_extend;
1231                         score_array[b_index].best_gap =
1232                                  MAX(score - gap_open_extend, score_gap_col);
1233                         score_gap_row = MAX(score - gap_open_extend,
1234                                             score_gap_row);
1235                         score_array[b_index].best = score;
1236                     }
1237                 }
1238                 score = next_score;
1239             }
1240         }
1241 
1242         if (first_b_index == b_size)
1243             break;
1244 
1245         /* compute the first offset of the next row where a
1246            gap in B may start */
1247 
1248         b_index = first_b_index % RESTRICT_SIZE;
1249         b_gap = first_b_index;
1250         if (b_index > 0)
1251             b_gap += RESTRICT_SIZE - b_index;
1252 
1253         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
1254 
1255             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1256                                           2 * gap_align->dp_mem_alloc);
1257             score_array = (BlastGapDP *)realloc(score_array,
1258                                                gap_align->dp_mem_alloc *
1259                                                sizeof(BlastGapDP));
1260             gap_align->dp_mem = score_array;
1261         }
1262 
1263         if (last_b_index < b_size - 1) {
1264             b_size = last_b_index + 1;
1265         }
1266         else {
1267             /* The inner loop finished without failing the X-dropoff
1268                test; initialize extra bookkeeping structures until
1269                the X dropoff test fails or we run out of letters in B.
1270                The next inner loop will have larger bounds.
1271 
1272                Note that if gaps in A are not allowed for this row, then
1273                score_gap_row is -infinity and the loop below will not
1274                extend the region to explore */
1275 
1276             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
1277                 score_array[b_size].best = score_gap_row;
1278                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
1279                 score_gap_row -= gap_extend;
1280                 b_size++;
1281             }
1282         }
1283 
1284         if (b_size <= N) {
1285             score_array[b_size].best = MININT;
1286             score_array[b_size].best_gap = MININT;
1287             b_size++;
1288         }
1289     }
1290 
1291     return best_score;
1292 }
1293 
1294 /** Editing script operations for out-of-frame traceback */
1295 enum {
1296     SCRIPT_AHEAD_ONE_FRAME       = 1, /**< Shift 1 frame in sequence A
1297                                          (gap 2 nucleotides) */
1298     SCRIPT_AHEAD_TWO_FRAMES      = 2, /**< Shift 2 frames in sequence A
1299                                          (gap 1 nucleotide) */
1300     SCRIPT_NEXT_IN_FRAME         = 3, /**< Shift to next base (substitution) */
1301     SCRIPT_NEXT_PLUS_ONE_FRAME   = 4, /**< Shift to next base plus 1 frame
1302                                          (gap 1 nucleotide in sequence B) */
1303     SCRIPT_NEXT_PLUS_TWO_FRAMES  = 5, /**< Shift to next base plus 2 frames
1304                                          (gap 2 nucleotides in sequence B) */
1305     SCRIPT_OOF_OPEN_GAP          = 0x08 /**< Opening a gap */
1306 };
1307 
1308 /** Low level function to perform gapped extension with out-of-frame
1309  * gapping with traceback.
1310  * @param A The query sequence [in]
1311  * @param B The subject sequence [in]
1312  * @param M Maximal extension length in query [in]
1313  * @param N Maximal extension length in subject [in]
1314  * @param a_offset Resulting starting offset in query [out]
1315  * @param b_offset Resulting starting offset in subject [out]
1316  * @param edit_block Structure to hold generated traceback [out]
1317  * @param gap_align Structure holding various information and allocated
1318  *        memory for the gapped alignment [in]
1319  * @param score_params Parameters related to scoring [in]
1320  * @param query_offset The starting offset in query [in]
1321  * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1322  * @return The best alignment score found.
1323  */
1324 static Int4
s_OutOfFrameAlignWithTraceback(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1325 s_OutOfFrameAlignWithTraceback(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1326    Int4* a_offset, Int4* b_offset, GapPrelimEditBlock *edit_block,
1327    BlastGapAlignStruct* gap_align, const BlastScoringParameters* score_params,
1328    Int4 query_offset, Boolean reversed)
1329 {
1330     Int4 i, increment;          /* sequence pointers and indices */
1331     Int4 a_index;
1332     Int4 b_index, b_size, first_b_index, last_b_index;
1333 
1334     BlastGapDP* score_array;
1335 
1336     Int4 gap_open;              /* alignment penalty variables */
1337     Int4 gap_extend;
1338     Int4 gap_open_extend;
1339     Int4 shift_penalty;
1340     Int4 x_dropoff;
1341 
1342     Int4** matrix = NULL;       /* pointers to the score matrix */
1343     Int4** pssm = NULL;
1344     Int4* matrix_row = NULL;
1345 
1346     Int4 score;                 /* score tracking variables */
1347     Int4 score_row1;
1348     Int4 score_row2;
1349     Int4 score_row3;
1350     Int4 score_gap_col;
1351     Int4 score_col1;
1352     Int4 score_col2;
1353     Int4 score_col3;
1354     Int4 score_other_frame1;
1355     Int4 score_other_frame2;
1356     Int4 best_score;
1357 
1358     GapStateArrayStruct* state_struct;
1359     Uint1** edit_script;
1360     Uint1* edit_script_row;
1361     Int4 *edit_start_offset;
1362     Int4 edit_script_num_rows;
1363     Int4 orig_b_index;
1364     Int1 script, next_script;
1365     Int4 num_extra_cells;
1366 
1367     /* do initialization and sanity-checking */
1368 
1369     matrix = gap_align->sbp->matrix->data;
1370     if (gap_align->positionBased) {
1371         pssm = gap_align->sbp->psi_matrix->pssm->data;
1372     }
1373     *a_offset = 0;
1374     *b_offset = -2;
1375     gap_open = score_params->gap_open;
1376     gap_extend = score_params->gap_extend;
1377     gap_open_extend = gap_open + gap_extend;
1378     shift_penalty = score_params->shift_pen;
1379     x_dropoff = gap_align->gap_x_dropoff;
1380 
1381     if (x_dropoff < gap_open_extend)
1382         x_dropoff = gap_open_extend;
1383 
1384     if(N <= 0 || M <= 0)
1385         return 0;
1386 
1387     /* Initialize traceback information. edit_script[] is
1388        a 2-D array which is filled in row by row as the
1389        dynamic programming takes place */
1390 
1391     s_GapPurgeState(gap_align->state_struct);
1392 
1393     /* Conceptually, traceback requires a byte array of size
1394        MxN. To save on memory, each row of the array is allocated
1395        separately. edit_script[i] points to the storage reserved
1396        for row i, and edit_start_offset[i] gives the offset into
1397        the B sequence corresponding to element 0 of edit_script[i]
1398 
1399        Also make the number of edit script rows grow dynamically */
1400 
1401     edit_script_num_rows = 100;
1402     edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
1403     edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
1404 
1405     /* allocate storage for the first row of the traceback
1406        array. Because row elements correspond to gaps in A,
1407        the alignment can only go at most x_dropoff/gap_extend
1408        positions, in all three frames, before failing the
1409        X dropoff criterion */
1410 
1411     if (gap_extend > 0)
1412         num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
1413     else
1414         num_extra_cells = N + 5;
1415 
1416     if (num_extra_cells > gap_align->dp_mem_alloc) {
1417         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1418                                       2 * gap_align->dp_mem_alloc);
1419         sfree(gap_align->dp_mem);
1420         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1421                                                   sizeof(BlastGapDP));
1422     }
1423 
1424     state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
1425 
1426     edit_script[0] = state_struct->state_array;
1427     edit_start_offset[0] = 0;
1428     edit_script_row = state_struct->state_array;
1429 
1430     score_array = gap_align->dp_mem;
1431     score = -gap_open_extend;
1432     score_array[0].best = 0;
1433     score_array[0].best_gap = -gap_open_extend;
1434 
1435     for (i = 3; i <= N + 2; i += 3) {
1436         score_array[i].best = score;
1437         score_array[i].best_gap = score - gap_open_extend;
1438         edit_script_row[i] = SCRIPT_GAP_IN_B;
1439 
1440         score_array[i-1].best = MININT;
1441         score_array[i-1].best_gap = MININT;
1442         edit_script_row[i-1] = SCRIPT_GAP_IN_B;
1443 
1444         score_array[i-2].best = MININT;
1445         score_array[i-2].best_gap = MININT;
1446         edit_script_row[i-2] = SCRIPT_GAP_IN_B;
1447 
1448         if (score < -x_dropoff)
1449             break;
1450         score -= gap_extend;
1451     }
1452 
1453     /* The inner loop below examines letters of B from
1454        index 'first_b_index' to 'b_size' */
1455 
1456     score_array[i].best = MININT;
1457     score_array[i].best_gap = MININT;
1458     b_size = i - 2;
1459     state_struct->used = b_size + 1;
1460 
1461     best_score = 0;
1462     first_b_index = 0;
1463     if (reversed) {
1464         increment = -1;
1465     }
1466     else {
1467         /* Allow for a backwards frame shift */
1468         B -= 2;
1469         increment = 1;
1470     }
1471 
1472     for (a_index = 1; a_index <= M; a_index++) {
1473 
1474         /* Set up the next row of the edit script; this involves
1475            allocating memory for the row, then pointing to it.
1476            It is not necessary to allocate space for offsets less
1477            than first_b_index (since the inner loop will never
1478            look at them).
1479 
1480            It is unknown at this point how far to the right the
1481            current traceback row will extend; all that's known for
1482            sure is that the previous row fails the X-dropoff test
1483            after b_size cells, and that the current row can go at
1484            most num_extra_cells beyond that before failing the test */
1485 
1486         if (gap_extend > 0)
1487             state_struct = s_GapGetState(&gap_align->state_struct,
1488                            b_size - first_b_index + num_extra_cells);
1489         else
1490             state_struct = s_GapGetState(&gap_align->state_struct,
1491                                         N + 5 - first_b_index);
1492 
1493         if (a_index == edit_script_num_rows) {
1494             edit_script_num_rows = edit_script_num_rows * 2;
1495             edit_script = (Uint1 **)realloc(edit_script,
1496                                             edit_script_num_rows *
1497                                             sizeof(Uint1 *));
1498             edit_start_offset = (Int4 *)realloc(edit_start_offset,
1499                                                 edit_script_num_rows *
1500                                                 sizeof(Int4));
1501         }
1502 
1503         edit_script[a_index] = state_struct->state_array +
1504                                 state_struct->used + 1;
1505         edit_start_offset[a_index] = first_b_index;
1506 
1507         /* the inner loop assumes the current traceback
1508            row begins at offset zero of B */
1509 
1510         edit_script_row = edit_script[a_index] - first_b_index;
1511         orig_b_index = first_b_index;
1512 
1513         if (!(gap_align->positionBased)) {
1514             matrix_row = matrix[ A[ a_index * increment ] ];
1515         }
1516         else {
1517             if(reversed)
1518                 matrix_row = pssm[M - a_index];
1519             else
1520                 matrix_row = pssm[a_index + query_offset];
1521         }
1522 
1523         score = MININT;
1524         score_row1 = MININT;
1525         score_row2 = MININT;
1526         score_row3 = MININT;
1527         score_gap_col = MININT;
1528         score_col1 = MININT;
1529         score_col2 = MININT;
1530         score_col3 = MININT;
1531         score_other_frame1 = MININT;
1532         score_other_frame2 = MININT;
1533         last_b_index = first_b_index;
1534         b_index = first_b_index;
1535 
1536         /* The inner loop is identical to that of OOF_SEMI_G_ALIGN,
1537            except that traceback operations are sprinkled throughout. */
1538 
1539         while (b_index < b_size) {
1540 
1541             /* FRAME 0 */
1542 
1543             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1544             score = MAX(score, score_col1);
1545             if (score == score_col1) {
1546                 script = SCRIPT_NEXT_IN_FRAME;
1547             }
1548             else if (score + shift_penalty == score_other_frame1) {
1549                 if (score_other_frame1 == score_col2)
1550                     script = SCRIPT_AHEAD_TWO_FRAMES;
1551                 else
1552                     script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1553             }
1554             else {
1555                 if (score_other_frame2 == score_col3)
1556                     script = SCRIPT_AHEAD_ONE_FRAME;
1557                 else
1558                     script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1559             }
1560             score += matrix_row[ B[ b_index * increment ] ];
1561 
1562             score_other_frame1 = MAX(score_col1, score_array[b_index].best);
1563             score_col1 = score_array[b_index].best;
1564             score_gap_col = score_array[b_index].best_gap;
1565 
1566             if (score < MAX(score_gap_col, score_row1)) {
1567                 if (score_gap_col > score_row1) {
1568                     score = score_gap_col;
1569                     script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1570                 }
1571                 else {
1572                     score = score_row1;
1573                     script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1574                 }
1575 
1576                 if (best_score - score > x_dropoff) {
1577                     if (first_b_index == b_index)
1578                         first_b_index = b_index + 1;
1579                     else
1580                         score_array[b_index].best = MININT;
1581                 }
1582                 else {
1583                     last_b_index = b_index;
1584                     score_array[b_index].best = score;
1585                     score_array[b_index].best_gap = score_gap_col - gap_extend;
1586                     score_row1 -= gap_extend;
1587                 }
1588             }
1589             else {
1590                 if (best_score - score > x_dropoff) {
1591                     if (first_b_index == b_index)
1592                         first_b_index = b_index + 1;
1593                     else
1594                         score_array[b_index].best = MININT;
1595                 }
1596                 else {
1597                     last_b_index = b_index;
1598                     score_array[b_index].best = score;
1599                     if (score > best_score) {
1600                         best_score = score;
1601                         *a_offset = a_index;
1602                         *b_offset = b_index;
1603                     }
1604 
1605                     score -= gap_open_extend;
1606                     score_row1 -= gap_extend;
1607                     if (score > score_row1)
1608                         score_row1 = score;
1609                     else
1610                         script |= SCRIPT_EXTEND_GAP_B;
1611 
1612                     score_gap_col -= gap_extend;
1613                     if (score < score_gap_col) {
1614                         score_array[b_index].best_gap = score_gap_col;
1615                         script |= SCRIPT_EXTEND_GAP_A;
1616                     }
1617                     else {
1618                         score_array[b_index].best_gap = score;
1619                     }
1620                 }
1621             }
1622 
1623             edit_script_row[b_index++] = script;
1624             if (b_index >= b_size) {
1625                 score = score_row1;
1626                 score_row1 = score_row2;
1627                 score_row2 = score_row3;
1628                 score_row3 = score;
1629                 break;
1630             }
1631 
1632             /* FRAME 1 */
1633 
1634             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1635             score = MAX(score, score_col2);
1636             if (score == score_col2) {
1637                 script = SCRIPT_NEXT_IN_FRAME;
1638             }
1639             else if (score + shift_penalty == score_other_frame1) {
1640                 if (score_other_frame1 == score_col1)
1641                     script = SCRIPT_AHEAD_ONE_FRAME;
1642                 else
1643                     script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1644             }
1645             else {
1646                 if (score_other_frame2 == score_col3)
1647                     script = SCRIPT_AHEAD_TWO_FRAMES;
1648                 else
1649                     script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1650             }
1651             score += matrix_row[ B[ b_index * increment ] ];
1652             score_other_frame2 = MAX(score_col2, score_array[b_index].best);
1653             score_col2 = score_array[b_index].best;
1654             score_gap_col = score_array[b_index].best_gap;
1655 
1656             if (score < MAX(score_gap_col, score_row2)) {
1657                 score = MAX(score_gap_col, score_row2);
1658                 if (best_score - score > x_dropoff) {
1659                     if (first_b_index == b_index)
1660                         first_b_index = b_index + 1;
1661                     else
1662                         score_array[b_index].best = MININT;
1663                 }
1664                 else {
1665                     if (score == score_gap_col)
1666                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1667                     else
1668                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1669 
1670                     last_b_index = b_index;
1671                     score_array[b_index].best = score;
1672                     score_array[b_index].best_gap = score_gap_col - gap_extend;
1673                     score_row2 -= gap_extend;
1674                 }
1675             }
1676             else {
1677                 if (best_score - score > x_dropoff) {
1678                     if (first_b_index == b_index)
1679                         first_b_index = b_index + 1;
1680                     else
1681                         score_array[b_index].best = MININT;
1682                 }
1683                 else {
1684                     last_b_index = b_index;
1685                     score_array[b_index].best = score;
1686                     if (score > best_score) {
1687                         best_score = score;
1688                         *a_offset = a_index;
1689                         *b_offset = b_index;
1690                     }
1691                     score -= gap_open_extend;
1692                     score_row2 -= gap_extend;
1693                     if (score > score_row2)
1694                         score_row2 = score;
1695                     else
1696                         script |= SCRIPT_EXTEND_GAP_B;
1697 
1698                     score_gap_col -= gap_extend;
1699                     if (score < score_gap_col) {
1700                         score_array[b_index].best_gap = score_gap_col;
1701                         script |= SCRIPT_EXTEND_GAP_A;
1702                     }
1703                     else {
1704                         score_array[b_index].best_gap = score;
1705                     }
1706                 }
1707             }
1708 
1709             edit_script_row[b_index++] = script;
1710             if (b_index >= b_size) {
1711                 score = score_row2;
1712                 score_row2 = score_row1;
1713                 score_row1 = score_row3;
1714                 score_row3 = score;
1715                 break;
1716             }
1717 
1718             /* FRAME 2 */
1719 
1720             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1721             score = MAX(score, score_col3);
1722             if (score == score_col3) {
1723                 script = SCRIPT_NEXT_IN_FRAME;
1724             }
1725             else if (score + shift_penalty == score_other_frame1) {
1726                 if (score_other_frame1 == score_col1)
1727                     script = SCRIPT_AHEAD_TWO_FRAMES;
1728                 else
1729                     script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1730             }
1731             else {
1732                 if (score_other_frame2 == score_col2)
1733                     script = SCRIPT_AHEAD_ONE_FRAME;
1734                 else
1735                     script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1736             }
1737             score += matrix_row[ B[ b_index * increment ] ];
1738             score_other_frame1 = score_other_frame2;
1739             score_other_frame2 = MAX(score_col3, score_array[b_index].best);
1740             score_col3 = score_array[b_index].best;
1741             score_gap_col = score_array[b_index].best_gap;
1742 
1743             if (score < MAX(score_gap_col, score_row3)) {
1744                 score = MAX(score_gap_col, score_row3);
1745                 if (best_score - score > x_dropoff) {
1746                     if (first_b_index == b_index)
1747                         first_b_index = b_index + 1;
1748                     else
1749                         score_array[b_index].best = MININT;
1750                 }
1751                 else {
1752                     if (score == score_gap_col)
1753                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1754                     else
1755                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1756 
1757                     last_b_index = b_index;
1758                     score_array[b_index].best = score;
1759                     score_array[b_index].best_gap = score_gap_col - gap_extend;
1760                     score_row3 -= gap_extend;
1761                 }
1762             }
1763             else {
1764                 if (best_score - score > x_dropoff) {
1765                     if (first_b_index == b_index)
1766                         first_b_index = b_index + 1;
1767                     else
1768                         score_array[b_index].best = MININT;
1769                 }
1770                 else {
1771                     last_b_index = b_index;
1772                     score_array[b_index].best = score;
1773                     if (score > best_score) {
1774                         best_score = score;
1775                         *a_offset = a_index;
1776                         *b_offset = b_index;
1777                     }
1778                     score -= gap_open_extend;
1779                     score_row3 -= gap_extend;
1780                     if (score > score_row3)
1781                         score_row3 = score;
1782                     else
1783                         script |= SCRIPT_EXTEND_GAP_B;
1784 
1785                     score_gap_col -= gap_extend;
1786                     if (score < score_gap_col) {
1787                         score_array[b_index].best_gap = score_gap_col;
1788                         script |= SCRIPT_EXTEND_GAP_A;
1789                     }
1790                     else {
1791                         score_array[b_index].best_gap = score;
1792                     }
1793                 }
1794             }
1795             edit_script_row[b_index++] = script;
1796         }
1797 
1798         /* Finish aligning if the best scores for all positions
1799            of B will fail the X-dropoff test, i.e. the inner loop
1800            bounds have converged to each other */
1801 
1802         if (first_b_index == b_size)
1803             break;
1804 
1805         /* Enlarge the window for score data if necessary */
1806 
1807         if (last_b_index + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
1808 
1809             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1810                                           2 * gap_align->dp_mem_alloc);
1811             score_array = (BlastGapDP *)realloc(score_array,
1812                                                gap_align->dp_mem_alloc *
1813                                                sizeof(BlastGapDP));
1814             gap_align->dp_mem = score_array;
1815         }
1816 
1817         if (last_b_index < b_size - 1) {
1818             /* This row failed the X-dropoff test earlier than
1819                the last row did; just shorten the loop bounds
1820                before doing the next row */
1821 
1822             b_size = last_b_index + 1;
1823         }
1824         else {
1825             /* The inner loop finished without failing the X-dropoff
1826                test; initialize extra bookkeeping structures until
1827                the X dropoff test fails or we run out of letters in B.
1828                The next inner loop will have larger bounds.
1829 
1830                Keep initializing extra structures until the X dropoff
1831                test fails in all frames for this row */
1832 
1833             score = MAX(score_row1, score_row2);
1834             score = MAX(score, score_row3);
1835             while (score >= (best_score - x_dropoff) && b_size < N + 1) {
1836 
1837                 score_array[b_size].best = score_row1;
1838                 score_array[b_size].best_gap = score_row1 - gap_open_extend;
1839                 score_row1 -= gap_extend;
1840                 edit_script_row[b_size] = SCRIPT_OOF_OPEN_GAP |
1841                                           SCRIPT_GAP_IN_B;
1842 
1843                 score_array[b_size+1].best = score_row2;
1844                 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
1845                 score_row2 -= gap_extend;
1846                 edit_script_row[b_size+1] = SCRIPT_OOF_OPEN_GAP |
1847                                             SCRIPT_GAP_IN_B;
1848 
1849                 score_array[b_size+2].best = score_row3;
1850                 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
1851                 score_row3 -= gap_extend;
1852                 edit_script_row[b_size+2] = SCRIPT_OOF_OPEN_GAP |
1853                                             SCRIPT_GAP_IN_B;
1854                 b_size += 3;
1855                 score -= gap_extend;
1856             }
1857         }
1858 
1859         /* update the memory allocator to reflect the exact number
1860            of traceback cells this row needed */
1861 
1862         b_size = MIN(b_size, N + 1);
1863         state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
1864 
1865         /* chop off the best score in each frame */
1866 
1867         last_b_index = MIN(b_size + 4, N + 3);
1868         while (b_size < last_b_index) {
1869             score_array[b_size].best = MININT;
1870             score_array[b_size].best_gap = MININT;
1871             b_size++;
1872         }
1873     }
1874 
1875     a_index = *a_offset;
1876     b_index = *b_offset;
1877     script = SCRIPT_AHEAD_ONE_FRAME;
1878 
1879     while (a_index > 0 || b_index > 0) {
1880         /* Retrieve the next action to perform. Rows of
1881            the traceback array do not necessarily start
1882            at offset zero of B, so a correction is needed
1883            to point to the correct position */
1884 
1885         next_script =
1886             edit_script[a_index][b_index - edit_start_offset[a_index]];
1887 
1888         switch (script) {
1889         case SCRIPT_GAP_IN_A:
1890             script = next_script & SCRIPT_OP_MASK;
1891             if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_A))
1892                 script = SCRIPT_GAP_IN_A;
1893             break;
1894         case SCRIPT_GAP_IN_B:
1895             script = next_script & SCRIPT_OP_MASK;
1896             if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_B))
1897                 script = SCRIPT_GAP_IN_B;
1898             break;
1899         default:
1900             script = next_script & SCRIPT_OP_MASK;
1901             break;
1902         }
1903 
1904         if (script == SCRIPT_GAP_IN_B) {
1905             b_index -= 3;
1906         }
1907         else {
1908             b_index -= script;
1909             a_index--;
1910         }
1911 
1912         GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
1913     }
1914 
1915     sfree(edit_start_offset);
1916     sfree(edit_script);
1917 
1918     if (!reversed)
1919         *b_offset -= 2;
1920     return best_score;
1921 }
1922 
1923 /** Low level function to perform gapped extension with out-of-frame
1924  * gapping with or without traceback.
1925  * @param A The query sequence [in]
1926  * @param B The subject sequence [in]
1927  * @param M Maximal extension length in query [in]
1928  * @param N Maximal extension length in subject [in]
1929  * @param a_offset Resulting starting offset in query [out]
1930  * @param b_offset Resulting starting offset in subject [out]
1931  * @param score_only Only find the score, without saving traceback [in]
1932  * @param edit_block Structure to hold generated traceback [out]
1933  * @param gap_align Structure holding various information and allocated
1934  *        memory for the gapped alignment [in]
1935  * @param score_params Parameters related to scoring [in]
1936  * @param query_offset The starting offset in query [in]
1937  * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1938  * @return The best alignment score found.
1939  */
1940 static Int4
s_OutOfFrameGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1941 s_OutOfFrameGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1942    Int4* a_offset, Int4* b_offset, Boolean score_only,
1943    GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
1944    const BlastScoringParameters* score_params,
1945    Int4 query_offset, Boolean reversed)
1946 {
1947     Int4 i, increment;          /* sequence pointers and indices */
1948     Int4 a_index;
1949     Int4 b_index, b_size, first_b_index, last_b_index;
1950 
1951     Int4 gap_open;              /* alignment penalty variables */
1952     Int4 gap_extend;
1953     Int4 gap_open_extend;
1954     Int4 shift_penalty;
1955     Int4 x_dropoff;
1956 
1957     BlastGapDP* score_array;
1958     Int4 num_extra_cells;
1959 
1960     Int4** matrix = NULL;       /* pointers to the score matrix */
1961     Int4** pssm = NULL;
1962     Int4* matrix_row = NULL;
1963 
1964     Int4 score;                 /* score tracking variables */
1965     Int4 score_row1;
1966     Int4 score_row2;
1967     Int4 score_row3;
1968     Int4 score_gap_col;
1969     Int4 score_col1;
1970     Int4 score_col2;
1971     Int4 score_col3;
1972     Int4 score_other_frame1;
1973     Int4 score_other_frame2;
1974     Int4 best_score;
1975 
1976     if (!score_only) {
1977         return s_OutOfFrameAlignWithTraceback(A, B, M, N, a_offset, b_offset,
1978                                               edit_block, gap_align,
1979                                               score_params, query_offset,
1980                                               reversed);
1981     }
1982 
1983     /* do initialization and sanity-checking */
1984 
1985     matrix = gap_align->sbp->matrix->data;
1986     if (gap_align->positionBased) {
1987         pssm = gap_align->sbp->psi_matrix->pssm->data;
1988     }
1989     *a_offset = 0;
1990     *b_offset = -2;
1991     gap_open = score_params->gap_open;
1992     gap_extend = score_params->gap_extend;
1993     gap_open_extend = gap_open + gap_extend;
1994     shift_penalty = score_params->shift_pen;
1995     x_dropoff = gap_align->gap_x_dropoff;
1996 
1997     if (x_dropoff < gap_open_extend)
1998         x_dropoff = gap_open_extend;
1999 
2000     if(N <= 0 || M <= 0)
2001         return 0;
2002 
2003     /* Allocate and fill in the auxiliary bookeeping structures.
2004        Since A and B could be very large, maintain a window
2005        of auxiliary structures only large enough to contain to current
2006        set of DP computations. The initial window size is determined
2007        by the number of cells needed to fail the x-dropoff test */
2008 
2009     if (gap_extend > 0)
2010         num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
2011     else
2012         num_extra_cells = N + 5;
2013 
2014     if (num_extra_cells > gap_align->dp_mem_alloc) {
2015         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
2016                                       2 * gap_align->dp_mem_alloc);
2017         sfree(gap_align->dp_mem);
2018         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
2019                                                   sizeof(BlastGapDP));
2020     }
2021 
2022     score_array = gap_align->dp_mem;
2023     score = -gap_open_extend;
2024     score_array[0].best = 0;
2025     score_array[0].best_gap = -gap_open_extend;
2026 
2027     for (i = 3; i <= N + 2; i += 3) {
2028         score_array[i].best = score;
2029         score_array[i].best_gap = score - gap_open_extend;
2030         score_array[i-1].best = MININT;
2031         score_array[i-1].best_gap = MININT;
2032         score_array[i-2].best = MININT;
2033         score_array[i-2].best_gap = MININT;
2034         if (score < -x_dropoff)
2035             break;
2036         score -= gap_extend;
2037     }
2038 
2039     /* The inner loop below examines letters of B from
2040        index 'first_b_index' to 'b_size' */
2041 
2042     b_size = i - 2;
2043     score_array[i].best = MININT;
2044     score_array[i].best_gap = MININT;
2045 
2046     best_score = 0;
2047     first_b_index = 0;
2048     if (reversed) {
2049         increment = -1;
2050     }
2051     else {
2052         /* Allow for a backwards frame shift */
2053         B -= 2;
2054         increment = 1;
2055     }
2056 
2057     for (a_index = 1; a_index <= M; a_index++) {
2058 
2059         /* pick out the row of the score matrix
2060            appropriate for A[a_index] */
2061 
2062         if (!(gap_align->positionBased)) {
2063             matrix_row = matrix[ A[ a_index * increment ] ];
2064         }
2065         else {
2066             if(reversed)
2067                 matrix_row = pssm[M - a_index];
2068             else
2069                 matrix_row = pssm[a_index + query_offset];
2070         }
2071 
2072         /* initialize running-score variables */
2073         score = MININT;
2074         score_row1 = MININT;
2075         score_row2 = MININT;
2076         score_row3 = MININT;
2077         score_gap_col = MININT;
2078         score_col1 = MININT;
2079         score_col2 = MININT;
2080         score_col3 = MININT;
2081         score_other_frame1 = MININT;
2082         score_other_frame2 = MININT;
2083         last_b_index = first_b_index;
2084         b_index = first_b_index;
2085 
2086         while (b_index < b_size) {
2087 
2088             /* FRAME 0 */
2089 
2090             /* Pick the best score among all frames */
2091             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2092             score = MAX(score, score_col1) +
2093                                 matrix_row[ B[ b_index * increment ] ];
2094             score_other_frame1 = MAX(score_col1, score_array[b_index].best);
2095             score_col1 = score_array[b_index].best;
2096             score_gap_col = score_array[b_index].best_gap;
2097 
2098             /* Use the row and column scores if they improve
2099                the score overall */
2100 
2101             if (score < MAX(score_gap_col, score_row1)) {
2102                 score = MAX(score_gap_col, score_row1);
2103                 if (best_score - score > x_dropoff) {
2104 
2105                    /* the current best score failed the X-dropoff
2106                       criterion. Note that this does not stop the
2107                       inner loop, only forces future iterations to
2108                       skip this column of B.
2109 
2110                       Also, if the very first letter of B that was
2111                       tested failed the X dropoff criterion, make
2112                       sure future inner loops start one letter to
2113                       the right */
2114 
2115                     if (first_b_index == b_index)
2116                         first_b_index = b_index + 1;
2117                     else
2118                         score_array[b_index].best = MININT;
2119                 }
2120                 else {
2121                     /* update the row and column running scores */
2122                     last_b_index = b_index;
2123                     score_array[b_index].best = score;
2124                     score_array[b_index].best_gap = score_gap_col - gap_extend;
2125                     score_row1 -= gap_extend;
2126                 }
2127             }
2128             else {
2129                 if (best_score - score > x_dropoff) {
2130 
2131                    /* the current best score failed the X-dropoff
2132                       criterion. */
2133 
2134                     if (first_b_index == b_index)
2135                         first_b_index = b_index + 1;
2136                     else
2137                         score_array[b_index].best = MININT;
2138                 }
2139                 else {
2140                     /* The current best score exceeds the
2141                        row and column scores, and thus may
2142                        improve on the current optimal score */
2143 
2144                     last_b_index = b_index;
2145                     score_array[b_index].best = score;
2146                     if (score > best_score) {
2147                         best_score = score;
2148                         *a_offset = a_index;
2149                         *b_offset = b_index;
2150                     }
2151 
2152                     /* compute the best scores that include gaps
2153                        or gap extensions */
2154 
2155                     score -= gap_open_extend;
2156                     score_row1 -= gap_extend;
2157                     score_row1 = MAX(score, score_row1);
2158                     score_array[b_index].best_gap = MAX(score,
2159                                                   score_gap_col - gap_extend);
2160                 }
2161             }
2162 
2163             /* If this was the last letter of B checked, rotate
2164                the row scores so that code beyond the inner loop
2165                works correctly */
2166 
2167             if (++b_index >= b_size) {
2168                 score = score_row1;
2169                 score_row1 = score_row2;
2170                 score_row2 = score_row3;
2171                 score_row3 = score;
2172                 break;
2173             }
2174 
2175             /* FRAME 1 */
2176 
2177             /* This code, and that for Frame 2, are essentially the
2178                same as the preceeding code. The only real difference
2179                is the updating of the other_frame best scores */
2180 
2181             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2182             score = MAX(score, score_col2) +
2183                                 matrix_row[ B[ b_index * increment ] ];
2184             score_other_frame2 = MAX(score_col2, score_array[b_index].best);
2185             score_col2 = score_array[b_index].best;
2186             score_gap_col = score_array[b_index].best_gap;
2187 
2188             if (score < MAX(score_gap_col, score_row2)) {
2189                 score = MAX(score_gap_col, score_row2);
2190                 if (best_score - score > x_dropoff) {
2191                     if (first_b_index == b_index)
2192                         first_b_index = b_index + 1;
2193                     else
2194                         score_array[b_index].best = MININT;
2195                 }
2196                 else {
2197                     last_b_index = b_index;
2198                     score_array[b_index].best = score;
2199                     score_array[b_index].best_gap = score_gap_col - gap_extend;
2200                     score_row2 -= gap_extend;
2201                 }
2202             }
2203             else {
2204                 if (best_score - score > x_dropoff) {
2205                     if (first_b_index == b_index)
2206                         first_b_index = b_index + 1;
2207                     else
2208                         score_array[b_index].best = MININT;
2209                 }
2210                 else {
2211                     last_b_index = b_index;
2212                     score_array[b_index].best = score;
2213                     if (score > best_score) {
2214                         best_score = score;
2215                         *a_offset = a_index;
2216                         *b_offset = b_index;
2217                     }
2218                     score -= gap_open_extend;
2219                     score_row2 -= gap_extend;
2220                     score_row2 = MAX(score, score_row2);
2221                     score_array[b_index].best_gap = MAX(score,
2222                                                   score_gap_col - gap_extend);
2223                 }
2224             }
2225 
2226             if (++b_index >= b_size) {
2227                 score = score_row2;
2228                 score_row2 = score_row1;
2229                 score_row1 = score_row3;
2230                 score_row3 = score;
2231                 break;
2232             }
2233 
2234             /* FRAME 2 */
2235 
2236             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2237             score = MAX(score, score_col3) +
2238                                 matrix_row[ B[ b_index * increment ] ];
2239             score_other_frame1 = score_other_frame2;
2240             score_other_frame2 = MAX(score_col3, score_array[b_index].best);
2241             score_col3 = score_array[b_index].best;
2242             score_gap_col = score_array[b_index].best_gap;
2243 
2244             if (score < MAX(score_gap_col, score_row3)) {
2245                 score = MAX(score_gap_col, score_row3);
2246                 if (best_score - score > x_dropoff) {
2247                     if (first_b_index == b_index)
2248                         first_b_index = b_index + 1;
2249                     else
2250                         score_array[b_index].best = MININT;
2251                 }
2252                 else {
2253                     last_b_index = b_index;
2254                     score_array[b_index].best = score;
2255                     score_array[b_index].best_gap = score_gap_col - gap_extend;
2256                     score_row3 -= gap_extend;
2257                 }
2258             }
2259             else {
2260                 if (best_score - score > x_dropoff) {
2261                     if (first_b_index == b_index)
2262                         first_b_index = b_index + 1;
2263                     else
2264                         score_array[b_index].best = MININT;
2265                 }
2266                 else {
2267                     last_b_index = b_index;
2268                     score_array[b_index].best = score;
2269                     if (score > best_score) {
2270                         best_score = score;
2271                         *a_offset = a_index;
2272                         *b_offset = b_index;
2273                     }
2274                     score -= gap_open_extend;
2275                     score_row3 -= gap_extend;
2276                     score_row3 = MAX(score, score_row3);
2277                     score_array[b_index].best_gap = MAX(score,
2278                                                   score_gap_col - gap_extend);
2279                 }
2280             }
2281             b_index++;
2282         }
2283 
2284         /* Finish aligning if the best scores for all positions
2285            of B will fail the X-dropoff test, i.e. the inner loop
2286            bounds have converged to each other */
2287 
2288         if (first_b_index == b_size)
2289             break;
2290 
2291         /* Enlarge the window for score data, if necessary */
2292 
2293         if (b_size + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
2294 
2295             gap_align->dp_mem_alloc = MAX(b_size + num_extra_cells + 100,
2296                                           2 * gap_align->dp_mem_alloc);
2297             score_array = (BlastGapDP *)realloc(score_array,
2298                                                gap_align->dp_mem_alloc *
2299                                                sizeof(BlastGapDP));
2300             gap_align->dp_mem = score_array;
2301         }
2302 
2303         if (last_b_index < b_size - 1) {
2304             /* This row failed the X-dropoff test earlier than
2305                the last row did; just shorten the loop bounds
2306                before doing the next row */
2307 
2308             b_size = last_b_index + 1;
2309         }
2310         else {
2311             /* The inner loop finished without failing the X-dropoff
2312                test; initialize extra bookkeeping structures until
2313                the X dropoff test fails or we run out of letters in B.
2314                The next inner loop will have larger bounds.
2315 
2316                Keep initializing extra structures until the X dropoff
2317                test fails in all frames for this row */
2318 
2319             score = MAX(score_row1, score_row2);
2320             score = MAX(score, score_row3);
2321             while (score >= (best_score - x_dropoff) && b_size < N + 1) {
2322 
2323                 score_array[b_size].best = score_row1;
2324                 score_array[b_size].best_gap = score_row1 - gap_open_extend;
2325                 score_row1 -= gap_extend;
2326 
2327                 score_array[b_size+1].best = score_row2;
2328                 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
2329                 score_row2 -= gap_extend;
2330 
2331                 score_array[b_size+2].best = score_row3;
2332                 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
2333                 score_row3 -= gap_extend;
2334 
2335                 b_size += 3;
2336                 score -= gap_extend;
2337             }
2338         }
2339 
2340         /* chop off the best score in each frame */
2341         b_size = MIN(b_size, N + 1);
2342         last_b_index = MIN(b_size + 4, N + 3);
2343         while (b_size < last_b_index) {
2344             score_array[b_size].best = MININT;
2345             score_array[b_size].best_gap = MININT;
2346             b_size++;
2347         }
2348     }
2349 
2350     if (!reversed) {
2351         /* The sequence was shifted, so length should be adjusted as well */
2352         *b_offset -= 2;
2353     }
2354     return best_score;
2355 }
2356 
2357 /** Simple wrapper around binary search function for BlastQueryInfo structure
2358  * to obtain the context in which this initial ungapped HSP is found
2359  * @param query_info Query information structure [in]
2360  * @param init_hsp Initial HSP [in]
2361  */
2362 static NCBI_INLINE Int4
s_GetUngappedHSPContext(const BlastQueryInfo * query_info,const BlastInitHSP * init_hsp)2363 s_GetUngappedHSPContext(const BlastQueryInfo* query_info,
2364                         const BlastInitHSP* init_hsp)
2365 {
2366     return BSearchContextInfo(init_hsp->offsets.qs_offsets.q_off, query_info);
2367 }
2368 
2369 /** Adjust the HSP offsets in the initial ungapped HSP structure given the
2370  * query start
2371  * @param init_hsp Initial HSP [in|out]
2372  * @param query_start Starting offset of the query sequence in the query info
2373  * structure [in]
2374  */
2375 static NCBI_INLINE void
s_AdjustInitialHSPOffsets(BlastInitHSP * init_hsp,Int4 query_start)2376 s_AdjustInitialHSPOffsets(BlastInitHSP* init_hsp, Int4 query_start)
2377 {
2378     init_hsp->offsets.qs_offsets.q_off -= query_start;
2379     if (init_hsp->ungapped_data) {
2380         init_hsp->ungapped_data->q_start -= query_start;
2381     }
2382     ASSERT(init_hsp->ungapped_data == NULL ||
2383            init_hsp->ungapped_data->q_start >= 0);
2384 }
2385 
2386 /** Set up a BLAST_SequenceBlk structure for a single query sequence
2387  * @param concatenated_query Query sequence block for concatenated query [in]
2388  * @param query_info Query information structure for concatenated query [in]
2389  * @param context Context in which the HSP ocurred [in]
2390  * @param single_query query Output sequence block which will contain adjusted
2391  * sequence pointer and sequence length [out]
2392  * @param query_start Starting offset of the single query sequence in the
2393  * concatenated query [out]
2394   */
2395 static void
s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk * concatenated_query,const BlastQueryInfo * query_info,Int4 context,BLAST_SequenceBlk * single_query,Int4 * query_start)2396 s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk* concatenated_query,
2397                              const BlastQueryInfo* query_info, Int4 context,
2398                              BLAST_SequenceBlk* single_query,
2399                              Int4* query_start)
2400 {
2401     Int4 query_length = 0;
2402     if (concatenated_query->oof_sequence) {
2403         /* Out-of-frame blastx case: all frames of the same parity are mixed
2404            together in a special sequence. */
2405         Int4 query_end_pt = 0;
2406         Int4 mixed_frame_context = context - context % CODON_LENGTH;
2407 
2408         *query_start = query_info->contexts[mixed_frame_context].query_offset;
2409         query_end_pt =
2410             query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_offset +
2411             query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_length;
2412         query_length = query_end_pt - *query_start;
2413         single_query->sequence = NULL;
2414         single_query->oof_sequence = concatenated_query->oof_sequence + *query_start;
2415     } else {
2416         *query_start  = query_info->contexts[context].query_offset;
2417         query_length = query_info->contexts[context].query_length;
2418         single_query->sequence = concatenated_query->sequence + *query_start;
2419         single_query->oof_sequence = NULL;
2420     }
2421     single_query->length = query_length;
2422 }
2423 
2424 
2425 /** Find the HSP offsets relative to the individual query sequence instead of
2426  * the concatenated sequence and retrieve relevant portions of the query
2427  * sequence data.
2428  * @param query Query sequence block [in]
2429  * @param query_info Query information structure, including context offsets
2430  *                   array [in]
2431  * @param init_hsp Initial HSP, this will be modified to hold the adjusted
2432  *                 offsets [in] [out]
2433  * @param query_out query sequence block with modified sequence
2434  *                  pointer and sequence length [out]
2435  * @param context Which context this HSP belongs to? [out]
2436  */
2437 static void
s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk * query,const BlastQueryInfo * query_info,BlastInitHSP * init_hsp,BLAST_SequenceBlk * query_out,Int4 * context)2438 s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk* query,
2439                                   const BlastQueryInfo* query_info,
2440                                   BlastInitHSP* init_hsp,
2441                                   BLAST_SequenceBlk* query_out,
2442                                   Int4* context)
2443 {
2444     Int4 query_start = 0;
2445 
2446     ASSERT(query);
2447     ASSERT(query_info);
2448     ASSERT(query_out);
2449     ASSERT(init_hsp);
2450     ASSERT(context);
2451 
2452     ASSERT(init_hsp->ungapped_data == NULL ||
2453            init_hsp->ungapped_data->q_start >= 0);
2454     *context = s_GetUngappedHSPContext(query_info, init_hsp);
2455     s_SetUpLocalBlastSequenceBlk(query, query_info, *context,
2456                                  query_out, &query_start);
2457     ASSERT(init_hsp->ungapped_data == NULL ||
2458            (init_hsp->ungapped_data->q_start-query_start) >= 0);
2459     s_AdjustInitialHSPOffsets(init_hsp, query_start);
2460 }
2461 
2462 
2463 /** Convert the initial list of traceback actions from a non-OOF
2464  *  gapped alignment into a blast edit script. Note that this routine
2465  *  assumes the input edit blocks have not been reversed or rearranged
2466  *  by calling code
2467  *  @param rev_prelim_tback Traceback from extension to the left [in]
2468  *  @param fwd_prelim_tback Traceback from extension to the right [in]
2469  *  @return Pointer to the resulting edit script, or NULL if there
2470  *          are no traceback actions specified
2471  */
2472 GapEditScript*
Blast_PrelimEditBlockToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback)2473 Blast_PrelimEditBlockToGapEditScript (GapPrelimEditBlock* rev_prelim_tback,
2474                                       GapPrelimEditBlock* fwd_prelim_tback)
2475 {
2476    Boolean merge_ops = FALSE;
2477    GapEditScript* esp;
2478    GapPrelimEditScript *op;
2479    Int4 i;
2480    Int4 index=0;
2481    Int4 size = 0;
2482 
2483    if (rev_prelim_tback == NULL || fwd_prelim_tback == NULL)
2484       return NULL;
2485 
2486    /* The fwd_prelim_tback script will get reversed here as the traceback started from the highest scoring point
2487      and worked backwards. The rev_prelim_tback script does NOT get reversed.  Since it was reversed when the
2488      traceback was produced it's already "forward" */
2489 
2490    if (fwd_prelim_tback->num_ops > 0 && rev_prelim_tback->num_ops > 0 &&
2491        fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].op_type ==
2492          rev_prelim_tback->edit_ops[(rev_prelim_tback->num_ops)-1].op_type)
2493      merge_ops = TRUE;
2494 
2495    size = fwd_prelim_tback->num_ops+rev_prelim_tback->num_ops;
2496    if (merge_ops)
2497      size--;
2498 
2499    esp = GapEditScriptNew(size);
2500 
2501    index = 0;
2502    for (i=0; i < rev_prelim_tback->num_ops; i++) {
2503       op = rev_prelim_tback->edit_ops + i;
2504       esp->op_type[index] = op->op_type;
2505       esp->num[index] = op->num;
2506       index++;
2507    }
2508 
2509    if (fwd_prelim_tback->num_ops == 0)
2510       return esp;
2511 
2512    if (merge_ops)
2513        esp->num[index-1] += fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].num;
2514 
2515    /* If we merge, then we skip the first one. */
2516    if (merge_ops)
2517       i = fwd_prelim_tback->num_ops - 2;
2518    else
2519       i = fwd_prelim_tback->num_ops - 1;
2520 
2521    for (; i >= 0; i--) {
2522       op = fwd_prelim_tback->edit_ops + i;
2523       esp->op_type[index] = op->op_type;
2524       esp->num[index] = op->num;
2525       index++;
2526    }
2527 
2528    return esp;
2529 }
2530 
2531 /** Fills the BlastGapAlignStruct structure with the results
2532  *  of a greedy gapped extension.
2533  * @param gap_align the initialized gapped alignment structure [in] [out]
2534  * @param q_start The starting offset in query [in]
2535  * @param s_start The starting offset in subject [in]
2536  * @param q_end The ending offset in query [in]
2537  * @param s_end The ending offset in subject [in]
2538  * @param q_seed_start The query offset of the alignment start point [in]
2539  * @param s_seed_start The subject offset of the alignment start point [in]
2540  * @param score The alignment score [in]
2541  * @param esp The edit script containing the traceback information [in]
2542  */
2543 static Int2
s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct * gap_align,Int4 q_start,Int4 s_start,Int4 q_end,Int4 s_end,Int4 q_seed_start,Int4 s_seed_start,Int4 score,GapEditScript * esp)2544 s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct* gap_align,
2545                                 Int4 q_start, Int4 s_start,
2546                                 Int4 q_end, Int4 s_end,
2547                                 Int4 q_seed_start, Int4 s_seed_start,
2548                                 Int4 score, GapEditScript* esp)
2549 {
2550    gap_align->query_start = q_start;
2551    gap_align->query_stop = q_end;
2552    gap_align->subject_start = s_start;
2553    gap_align->subject_stop = s_end;
2554    gap_align->greedy_query_seed_start = q_seed_start;
2555    gap_align->greedy_subject_seed_start = s_seed_start;
2556    gap_align->score = score;
2557 
2558    if (esp)
2559       gap_align->edit_script = esp;
2560 
2561    return 0;
2562 }
2563 
s_UpdateEditScript(GapEditScript * esp,int pos,int bf,int af)2564 static void s_UpdateEditScript(GapEditScript* esp, int pos, int bf, int af) {
2565    int op, qd, sd;
2566 
2567    if (bf > 0) {
2568       op = pos;
2569       qd = sd = bf;
2570       do {
2571           if (--op < 0) return;
2572           switch(esp->op_type[op]) {
2573           case eGapAlignSub:
2574               qd -= esp->num[op];
2575               sd -= esp->num[op];
2576               break;
2577           case eGapAlignIns:
2578               qd -= esp->num[op];
2579               break;
2580           case eGapAlignDel:
2581               sd -= esp->num[op];
2582           default:
2583               break;
2584           }
2585       } while (qd > 0 || sd > 0);
2586 
2587       esp->num[op] = -MAX(qd, sd);
2588       esp->op_type[op++] = eGapAlignSub;
2589       for (; op < pos-1; op++) esp->num[op] = 0;
2590       esp->num[pos] += bf;
2591       qd -= sd;
2592       esp->op_type[pos-1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2593       esp->num[pos-1] = (qd>0) ? qd : -qd;
2594    }
2595 
2596    if (af > 0) {
2597       op = pos;
2598       qd = sd = af;
2599       do {
2600           if (++op >= esp->size) return;
2601           switch(esp->op_type[op]) {
2602           case eGapAlignSub:
2603               qd -= esp->num[op];
2604               sd -= esp->num[op];
2605               break;
2606           case eGapAlignIns:
2607               qd -= esp->num[op];
2608               break;
2609           case eGapAlignDel:
2610               sd -= esp->num[op];
2611           default:
2612               break;
2613           }
2614       } while (qd > 0 || sd > 0);
2615 
2616       esp->num[op] = -MAX(qd, sd);
2617       esp->op_type[op--] = eGapAlignSub;
2618       for (; op > pos+1; op--) esp->num[op] = 0;
2619       esp->num[pos] += af;
2620       qd -= sd;
2621       esp->op_type[pos+1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2622       esp->num[pos+1] = (qd>0) ? qd : -qd;
2623    }
2624 }
2625 
s_RebuildEditScript(GapEditScript * esp)2626 static void s_RebuildEditScript(GapEditScript* esp) {
2627    int i, j;
2628    for (i=0, j=-1; i<esp->size; i++) {
2629        if (esp->num[i] == 0) continue;
2630        if (j>=0 && esp->op_type[i] == esp->op_type[j]) {
2631            esp->num[j] += esp->num[i];
2632        } else if (j==-1 || esp->op_type[i] == eGapAlignSub
2633            || esp->op_type[j] == eGapAlignSub) {
2634            esp->op_type[++j] = esp->op_type[i];
2635            esp->num[j] = esp->num[i];
2636        } else {
2637            int d = esp->num[j] - esp->num[i];
2638            if (d > 0) {
2639               esp->num[j-1] += esp->num[i];
2640               esp->num[j] = d;
2641            } else if (d < 0) {
2642               esp->num[j-1] += esp->num[j];
2643               esp->num[j] = -d;
2644               esp->op_type[j] = esp->op_type[i];
2645            } else {
2646               esp->num[j-1] += esp->num[j];
2647               --j;
2648            }
2649        }
2650    }
2651    esp->size = ++j;
2652 }
2653 
s_ReduceGaps(GapEditScript * esp,const Uint1 * q,const Uint1 * s,const Uint1 * qf,const Uint1 * sf)2654 static void s_ReduceGaps(GapEditScript* esp, const Uint1 *q, const Uint1 *s,
2655                                              const Uint1 *qf,const Uint1 *sf){
2656    int i, j, nm1, nm2, d;
2657    const Uint1 *q1, *s1;
2658 
2659    for (q1=q, s1=s, i=0; i<esp->size; i++) {
2660        if (esp->num[i] == 0) continue;
2661        if (esp->op_type[i] == eGapAlignSub) {
2662            if(esp->num[i] >= 12) {
2663                nm1 = 1;
2664                if (i > 0) {
2665                    while (q1-nm1>=q && (*(q1-nm1) == *(s1-nm1))) ++nm1;
2666                }
2667                q1 += esp->num[i];
2668                s1 += esp->num[i];
2669                nm2 = 0;
2670                if (i < esp->size -1) {
2671                    while ((q1+1<qf) && (s1+1<sf) && (*(q1++) == *(s1++))) ++nm2;
2672                }
2673                if (nm1>1 || nm2>0) s_UpdateEditScript(esp, i, nm1-1, nm2);
2674                q1--; s1--;
2675            } else {
2676                q1 += esp->num[i];
2677                s1 += esp->num[i];
2678            }
2679        } else if (esp->op_type[i] == eGapAlignIns) {
2680            q1 += esp->num[i];
2681        } else {
2682            s1 += esp->num[i];
2683        }
2684    }
2685    s_RebuildEditScript(esp);
2686 
2687    for (i=0; i<esp->size; i++) {
2688        if (esp->op_type[i] == eGapAlignSub) {
2689            q += esp->num[i];
2690            s += esp->num[i];
2691            continue;
2692        }
2693        if (i>1 && esp->op_type[i] != esp->op_type[i-2]
2694                && esp->num[i-2] > 0) {
2695            d = esp->num[i] + esp->num[i-1] + esp->num[i-2];
2696            if (d == 3) {
2697                /* special case, no need to do further testing */
2698                (esp->num[i-2]) = 0;
2699                (esp->num[i-1]) = 2;
2700                (esp->num[i]) = 0;
2701                if (esp->op_type[i] == eGapAlignIns) {
2702                    ++q;
2703                } else {
2704                    ++s;
2705                }
2706            } else if (d < 12) {
2707                /* Try reducing this sub... */
2708                nm1 = 0;
2709                nm2 = 0;
2710                d = MIN(esp->num[i], esp->num[i-2]);
2711                q -= esp->num[i-1];
2712                s -= esp->num[i-1];
2713                q1 = q;
2714                s1 = s;
2715                if (esp->op_type[i] == eGapAlignIns) {
2716                    s -= d;
2717                } else {
2718                    q -= d;
2719                }
2720                for (j=0; j<esp->num[i-1]; ++j, ++q1, ++s1, ++q, ++s) {
2721                    if (*q1 == *s1) nm1++;
2722                    if (*q == *s) nm2++;
2723                }
2724                for (j=0; j<d; ++j, ++q, ++s) {
2725                    if (*q == *s) nm2++;
2726                }
2727                if (nm2 >= nm1 - d) {
2728                    (esp->num[i-2]) -= d;
2729                    (esp->num[i-1]) += d;
2730                    (esp->num[i]) -= d;
2731                } else {
2732                    q = q1;
2733                    s = s1;
2734                }
2735            }
2736        }
2737        if (esp->op_type[i] == eGapAlignIns) {
2738            q += esp->num[i];
2739        } else {
2740            s += esp->num[i];
2741        }
2742    }
2743    s_RebuildEditScript(esp);
2744 }
2745 
2746 Int2
BLAST_GreedyGappedAlignment(const Uint1 * query,const Uint1 * subject,Int4 query_length,Int4 subject_length,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_off,Int4 s_off,Boolean compressed_subject,Boolean do_traceback,Boolean * fence_hit)2747 BLAST_GreedyGappedAlignment(const Uint1* query, const Uint1* subject,
2748    Int4 query_length, Int4 subject_length, BlastGapAlignStruct* gap_align,
2749    const BlastScoringParameters* score_params,
2750    Int4 q_off, Int4 s_off, Boolean compressed_subject, Boolean do_traceback,
2751    Boolean * fence_hit)
2752 {
2753    const Uint1* q;
2754    const Uint1* s;
2755    Int4 score;
2756    Int4 X;
2757    Int4 q_avail, s_avail;
2758    Int4 q_ext_l, q_ext_r, s_ext_l, s_ext_r;
2759    GapPrelimEditBlock *fwd_prelim_tback = NULL;
2760    GapPrelimEditBlock *rev_prelim_tback = NULL;
2761    SGreedySeed fwd_start_point;
2762    SGreedySeed rev_start_point;
2763    Uint1 rem;
2764    GapEditScript* esp = NULL;
2765    Int4 q_seed_start = q_off;
2766    Int4 s_seed_start = s_off;
2767 
2768    q_avail = query_length - q_off;
2769    s_avail = subject_length - s_off;
2770 
2771    q = query + q_off;
2772    if (!compressed_subject) {
2773       s = subject + s_off;
2774       rem = 4; /* Special value to indicate that sequence is already
2775                   uncompressed */
2776    } else {
2777       s = subject + s_off/4;
2778       rem = s_off % 4;
2779    }
2780 
2781    X = gap_align->gap_x_dropoff;
2782 
2783    if (do_traceback) {
2784       fwd_prelim_tback = gap_align->fwd_prelim_tback;
2785       rev_prelim_tback = gap_align->rev_prelim_tback;
2786       GapPrelimEditBlockReset(fwd_prelim_tback);
2787       GapPrelimEditBlockReset(rev_prelim_tback);
2788    }
2789 
2790    /* extend to the right */
2791    while (TRUE) {
2792        Int4 new_dist, xdrop;
2793        score = BLAST_AffineGreedyAlign(q, q_avail, s, s_avail, FALSE, X,
2794               score_params->reward, -score_params->penalty,
2795               score_params->gap_open, score_params->gap_extend,
2796               &q_ext_r, &s_ext_r, gap_align->greedy_align_mem,
2797               fwd_prelim_tback, rem, fence_hit, &fwd_start_point);
2798 
2799        if(fence_hit && *fence_hit) {
2800     	   return -1;
2801        }
2802 
2803        if (score >=0) break;
2804 
2805        /* double the max distance */
2806        new_dist = gap_align->greedy_align_mem->max_dist * 2;
2807        xdrop    = gap_align->greedy_align_mem->xdrop;
2808        s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2809        gap_align->greedy_align_mem =
2810           s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2811 
2812        if (!gap_align->greedy_align_mem) {
2813           gap_align = BLAST_GapAlignStructFree(gap_align);
2814           return -1;
2815        }
2816    }
2817 
2818    if (compressed_subject)
2819       rem = 0;
2820 
2821    /* extend to the left */
2822    while (TRUE) {
2823        Int4 new_dist, xdrop, score1;
2824        score1 = BLAST_AffineGreedyAlign(query, q_off,
2825                subject, s_off, TRUE, X,
2826                score_params->reward, -score_params->penalty,
2827                score_params->gap_open, score_params->gap_extend,
2828                &q_ext_l, &s_ext_l, gap_align->greedy_align_mem,
2829                rev_prelim_tback, rem, fence_hit, &rev_start_point);
2830        if(fence_hit && *fence_hit) {
2831     	   return -1;
2832        }
2833        if (score1 >=0) {
2834            score += score1;
2835            break;
2836        }
2837 
2838        /* double the max distance */
2839        new_dist = gap_align->greedy_align_mem->max_dist * 2;
2840        xdrop    = gap_align->greedy_align_mem->xdrop;
2841        s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2842        gap_align->greedy_align_mem =
2843           s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2844 
2845        if (!gap_align->greedy_align_mem) {
2846           gap_align = BLAST_GapAlignStructFree(gap_align);
2847           return -1;
2848        }
2849    }
2850 
2851    /* In basic case the greedy algorithm returns number of
2852       differences, hence we need to convert it to score */
2853    if (score_params->gap_open==0 && score_params->gap_extend==0) {
2854       score =
2855          (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*score_params->reward/2 -
2856          score*(score_params->reward - score_params->penalty);
2857    } else if (score_params->reward % 2 == 1) {
2858       score /= 2;
2859    }
2860 
2861    if (do_traceback) {
2862       esp = Blast_PrelimEditBlockToGapEditScript(rev_prelim_tback,
2863                                              fwd_prelim_tback);
2864       /* check for possible gap elimination */
2865       ASSERT(!compressed_subject);
2866       if (esp) s_ReduceGaps(esp, query+q_off-q_ext_l, subject+s_off-s_ext_l,
2867                                  query+q_off+q_ext_r, subject+s_off+s_ext_r);
2868    }
2869    else {
2870        /* estimate the best alignment start point. This is the middle
2871           of the longest run of exact matches found by the aligner,
2872           subject to the constraint that the start point lies in the
2873           box formed by the optimal alignment */
2874 
2875        Int4 q_box_l = q_off - q_ext_l;
2876        Int4 s_box_l = s_off - s_ext_l;
2877        Int4 q_box_r = q_off + q_ext_r;
2878        Int4 s_box_r = s_off + s_ext_r;
2879        Int4 q_seed_start_l = q_off - rev_start_point.start_q;
2880        Int4 s_seed_start_l = s_off - rev_start_point.start_s;
2881        Int4 q_seed_start_r = q_off + fwd_start_point.start_q;
2882        Int4 s_seed_start_r = s_off + fwd_start_point.start_s;
2883        Int4 valid_seed_len_l = 0;
2884        Int4 valid_seed_len_r = 0;
2885 
2886        if (q_seed_start_r < q_box_r && s_seed_start_r < s_box_r) {
2887            valid_seed_len_r = MIN(q_box_r - q_seed_start_r,
2888                                   s_box_r - s_seed_start_r);
2889            valid_seed_len_r = MIN(valid_seed_len_r,
2890                                   fwd_start_point.match_length) / 2;
2891        } else {
2892            q_seed_start_r = q_off;
2893            s_seed_start_r = s_off;
2894        }
2895 
2896        if (q_seed_start_l > q_box_l && s_seed_start_l > s_box_l) {
2897            valid_seed_len_l = MIN(q_seed_start_l - q_box_l,
2898                                   s_seed_start_l - s_box_l);
2899            valid_seed_len_l = MIN(valid_seed_len_l,
2900                                   rev_start_point.match_length) / 2;
2901        } else {
2902            q_seed_start_l = q_off;
2903            s_seed_start_l = s_off;
2904        }
2905 
2906        if (valid_seed_len_r > valid_seed_len_l) {
2907            q_seed_start = q_seed_start_r + valid_seed_len_r;
2908            s_seed_start = s_seed_start_r + valid_seed_len_r;
2909        }
2910        else {
2911            q_seed_start = q_seed_start_l - valid_seed_len_l;
2912            s_seed_start = s_seed_start_l - valid_seed_len_l;
2913        }
2914    }
2915 
2916    s_BlastGreedyGapAlignStructFill(gap_align,
2917                                    q_off-q_ext_l, s_off-s_ext_l,
2918                                    q_off+q_ext_r, s_off+s_ext_r,
2919                                    q_seed_start, s_seed_start,
2920                                    score, esp);
2921    return 0;
2922 }
2923 
2924 /** Performs dynamic programming style gapped extension, given an initial
2925  * HSP (offset pair), two sequence blocks and scoring and extension options.
2926  * Note: traceback is not done in this function.
2927  * @param query_blk The query sequence [in]
2928  * @param subject_blk The subject sequence [in]
2929  * @param gap_align The auxiliary structure for gapped alignment [in]
2930  * @param score_params Parameters related to scoring [in]
2931  * @param init_hsp The initial HSP that needs to be extended [in]
2932 */
2933 static Int2
s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp)2934 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
2935    BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
2936    const BlastScoringParameters* score_params, BlastInitHSP* init_hsp)
2937 {
2938    Int4 q_length, s_length;
2939    Int4 private_q_start, private_s_start;
2940    Int4 score_right = 0, score_left = 0;
2941    Uint1 offset_adjustment;
2942    Uint1* query = query_blk->sequence;
2943    Uint1* subject = subject_blk->sequence;
2944    Int4 x_dropoff =  gap_align->gap_x_dropoff;
2945 
2946    if(init_hsp->ungapped_data->score < x_dropoff) {
2947           x_dropoff = init_hsp->ungapped_data->score;
2948    }
2949 
2950    /* If subject offset is not at the start of a full byte,
2951       s_BlastAlignPackedNucl won't work, so shift the alignment start
2952       to the next multiple of 4 subject letters. Note that the
2953       shift amount is always nonzero, so a left extension always happens
2954       (and has a few presumed exact matches to start with). In the case
2955       of the smallest ungapped alignment (4 nucleotides), this can
2956       conceivably put the start point at the end of one sequence */
2957 
2958    offset_adjustment = COMPRESSION_RATIO -
2959             (init_hsp->offsets.qs_offsets.s_off % COMPRESSION_RATIO);
2960    q_length = init_hsp->offsets.qs_offsets.q_off + offset_adjustment;
2961    s_length = init_hsp->offsets.qs_offsets.s_off + offset_adjustment;
2962 
2963    /* This prevents the starting point from being at the end of a sequence
2964       as mentioned in the comment above. */
2965    if (q_length > query_blk->length || s_length > subject_blk->length)
2966    {
2967        q_length -= COMPRESSION_RATIO;
2968        s_length -= COMPRESSION_RATIO;
2969    }
2970 
2971    /* perform extension to left */
2972    score_left = s_BlastAlignPackedNucl(query, subject, q_length, s_length,
2973                       &private_q_start, &private_s_start, gap_align,
2974                       score_params, TRUE, x_dropoff);
2975    if (score_left < 0)
2976       return -1;
2977    gap_align->query_start = q_length - private_q_start;
2978    gap_align->subject_start = s_length - private_s_start;
2979 
2980    /* perform extension to right */
2981    if (q_length < query_blk->length &&
2982        s_length < subject_blk->length)
2983    {
2984       score_right = s_BlastAlignPackedNucl(query+q_length-1,
2985          subject+(s_length+3)/COMPRESSION_RATIO - 1,
2986          query_blk->length-q_length,
2987          subject_blk->length-s_length, &(gap_align->query_stop),
2988          &(gap_align->subject_stop), gap_align, score_params, FALSE, x_dropoff);
2989       if (score_right < 0)
2990          return -1;
2991       gap_align->query_stop += q_length;
2992       gap_align->subject_stop += s_length;
2993    }
2994    else {
2995       gap_align->query_stop = q_length;
2996       gap_align->subject_stop = s_length;
2997    }
2998 
2999    gap_align->score = score_right+score_left;
3000 
3001    return 0;
3002 }
3003 
3004 /** Aligns two nucleotide sequences, one (A) should be packed in the
3005  * same way as the BLAST databases, the other (B) should contain one
3006  * basepair/byte. Traceback is not done in this function.
3007  * @param B The query sequence [in]
3008  * @param A The subject sequence [in]
3009  * @param N Maximal extension length in query [in]
3010  * @param M Maximal extension length in subject [in]
3011  * @param b_offset Resulting starting offset in query [out]
3012  * @param a_offset Resulting starting offset in subject [out]
3013  * @param gap_align The auxiliary structure for gapped alignment [in]
3014  * @param score_params Parameters related to scoring [in]
3015  * @param reverse_sequence Reverse the sequence.
3016  * @return The best alignment score found.
3017 */
3018 static Int4
s_BlastAlignPackedNucl(Uint1 * B,Uint1 * A,Int4 N,Int4 M,Int4 * b_offset,Int4 * a_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Boolean reverse_sequence,Int4 x_dropoff)3019 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
3020 	Int4* b_offset, Int4* a_offset,
3021         BlastGapAlignStruct* gap_align,
3022         const BlastScoringParameters* score_params,
3023         Boolean reverse_sequence, Int4 x_dropoff)
3024 {
3025     Int4 i;                     /* sequence pointers and indices */
3026     Int4 a_index, a_base_pair;
3027     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
3028     Uint1* b_ptr;
3029 
3030     BlastGapDP* score_array;
3031     Int4 num_extra_cells;
3032 
3033     Int4 gap_open;              /* alignment penalty variables */
3034     Int4 gap_extend;
3035     Int4 gap_open_extend;
3036 
3037     Int4* *matrix;              /* pointers to the score matrix */
3038     Int4* matrix_row;
3039 
3040     Int4 score;                 /* score tracking variables */
3041     Int4 score_gap_row;
3042     Int4 score_gap_col;
3043     Int4 next_score;
3044     Int4 best_score;
3045 
3046     /* do initialization and sanity-checking */
3047 
3048     matrix = gap_align->sbp->matrix->data;
3049     *a_offset = 0;
3050     *b_offset = 0;
3051     gap_open = score_params->gap_open;
3052     gap_extend = score_params->gap_extend;
3053     gap_open_extend = gap_open + gap_extend;
3054 
3055     if (x_dropoff < gap_open_extend)
3056         x_dropoff = gap_open_extend;
3057 
3058     if(N <= 0 || M <= 0)
3059         return 0;
3060 
3061     /* Allocate and fill in the auxiliary bookeeping structures.
3062        Since A and B could be very large, maintain a window
3063        of auxiliary structures only large enough to contain the current
3064        set of DP computations. The initial window size is determined
3065        by the number of cells needed to fail the x-dropoff test */
3066 
3067     if (gap_extend > 0)
3068         num_extra_cells = x_dropoff / gap_extend + 3;
3069     else
3070         num_extra_cells = N + 3;
3071 
3072     if (num_extra_cells > gap_align->dp_mem_alloc) {
3073         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
3074                                       2 * gap_align->dp_mem_alloc);
3075         sfree(gap_align->dp_mem);
3076         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
3077                                                   sizeof(BlastGapDP));
3078     }
3079 
3080     score_array = gap_align->dp_mem;
3081     score = -gap_open_extend;
3082     score_array[0].best = 0;
3083     score_array[0].best_gap = -gap_open_extend;
3084 
3085     for (i = 1; i <= N; i++) {
3086         if (score < -x_dropoff)
3087             break;
3088 
3089         score_array[i].best = score;
3090         score_array[i].best_gap = score - gap_open_extend;
3091         score -= gap_extend;
3092     }
3093 
3094     /* The inner loop below examines letters of B from
3095        index 'first_b_index' to 'b_size' */
3096 
3097     b_size = i;
3098     best_score = 0;
3099     first_b_index = 0;
3100     if (reverse_sequence)
3101         b_increment = -1;
3102     else
3103         b_increment = 1;
3104 
3105     for (a_index = 1; a_index <= M; a_index++) {
3106 
3107         /* pick out the row of the score matrix
3108            appropriate for A[a_index] */
3109 
3110         if(reverse_sequence) {
3111             a_base_pair = NCBI2NA_UNPACK_BASE(A[(M-a_index)/4],
3112                                                ((a_index-1)%4));
3113             matrix_row = matrix[a_base_pair];
3114         }
3115         else {
3116             a_base_pair = NCBI2NA_UNPACK_BASE(A[1+((a_index-1)/4)],
3117                                                (3-((a_index-1)%4)));
3118             matrix_row = matrix[a_base_pair];
3119         }
3120 
3121         if(reverse_sequence)
3122             b_ptr = &B[N - first_b_index];
3123         else
3124             b_ptr = &B[first_b_index];
3125 
3126         /* initialize running-score variables */
3127         score = MININT;
3128         score_gap_row = MININT;
3129         last_b_index = first_b_index;
3130 
3131         for (b_index = first_b_index; b_index < b_size; b_index++) {
3132 
3133             b_ptr += b_increment;
3134             score_gap_col = score_array[b_index].best_gap;
3135             next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
3136 
3137             if (score < score_gap_col)
3138                 score = score_gap_col;
3139 
3140             if (score < score_gap_row)
3141                 score = score_gap_row;
3142 
3143             if (best_score - score > x_dropoff) {
3144 
3145                 /* the current best score failed the X-dropoff
3146                    criterion. Note that this does not stop the
3147                    inner loop, only forces future iterations to
3148                    skip this column of B.
3149 
3150                    Also, if the very first letter of B that was
3151                    tested failed the X dropoff criterion, make
3152                    sure future inner loops start one letter to
3153                    the right */
3154 
3155                 if (b_index == first_b_index)
3156                     first_b_index++;
3157                 else
3158                     score_array[b_index].best = MININT;
3159             }
3160             else {
3161                 last_b_index = b_index;
3162                 if (score > best_score) {
3163                     best_score = score;
3164                     *a_offset = a_index;
3165                     *b_offset = b_index;
3166                 }
3167 
3168                 /* If starting a gap at this position will improve
3169                    the best row or column score, update them to
3170                    reflect that. */
3171 
3172                 score_gap_row -= gap_extend;
3173                 score_gap_col -= gap_extend;
3174                 score_array[b_index].best_gap = MAX(score - gap_open_extend,
3175                                                     score_gap_col);
3176                 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
3177 
3178                 score_array[b_index].best = score;
3179             }
3180 
3181             score = next_score;
3182         }
3183 
3184         /* Finish aligning if the best scores for all positions
3185            of B will fail the X-dropoff test, i.e. the inner loop
3186            bounds have converged to each other */
3187 
3188         if (first_b_index == b_size)
3189             break;
3190 
3191         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
3192 
3193             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
3194                                           2 * gap_align->dp_mem_alloc);
3195             score_array = (BlastGapDP *)realloc(score_array,
3196                                                gap_align->dp_mem_alloc *
3197                                                sizeof(BlastGapDP));
3198             gap_align->dp_mem = score_array;
3199         }
3200 
3201         if (last_b_index < b_size - 1) {
3202             /* This row failed the X-dropoff test earlier than
3203                the last row did; just shorten the loop bounds
3204                before doing the next row */
3205 
3206             b_size = last_b_index + 1;
3207         }
3208         else {
3209             /* The inner loop finished without failing the X-dropoff
3210                test; initialize extra bookkeeping structures until
3211                the X dropoff test fails or we run out of letters in B.
3212                The next inner loop will have larger bounds */
3213 
3214             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
3215                 score_array[b_size].best = score_gap_row;
3216                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
3217                 score_gap_row -= gap_extend;
3218                 b_size++;
3219             }
3220         }
3221 
3222         if (b_size <= N) {
3223             score_array[b_size].best = MININT;
3224             score_array[b_size].best_gap = MININT;
3225             b_size++;
3226         }
3227     }
3228 
3229     return best_score;
3230 }
3231 
3232 Boolean
BlastGetOffsetsForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,BlastHSP * hsp,Int4 * q_retval,Int4 * s_retval)3233 BlastGetOffsetsForGappedAlignment (const Uint1* query, const Uint1* subject,
3234    const BlastScoreBlk* sbp, BlastHSP* hsp, Int4* q_retval, Int4* s_retval)
3235 {
3236     Int4 index1, max_offset, score, max_score, hsp_end;
3237     const Uint1* query_var,* subject_var;
3238     Boolean positionBased = (sbp->psi_matrix != NULL);
3239     Int4 q_length = hsp->query.end - hsp->query.offset;
3240     Int4 s_length = hsp->subject.end - hsp->subject.offset;
3241     int q_start = hsp->query.offset;
3242     int s_start = hsp->subject.offset;
3243 
3244     if (q_length <= HSP_MAX_WINDOW) {
3245         *q_retval = q_start + q_length/2;
3246         *s_retval = s_start + q_length/2;
3247         return TRUE;
3248     }
3249 
3250     hsp_end = q_start + HSP_MAX_WINDOW;
3251     query_var = query + q_start;
3252     subject_var = subject + s_start;
3253     score=0;
3254     for (index1=q_start; index1<hsp_end; index1++) {
3255         if (!(positionBased))
3256             score += sbp->matrix->data[*query_var][*subject_var];
3257         else
3258             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3259         query_var++; subject_var++;
3260     }
3261     max_score = score;
3262     max_offset = hsp_end - 1;
3263     hsp_end = q_start + MIN(q_length, s_length);
3264     for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3265         if (!(positionBased)) {
3266             score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3267             score += sbp->matrix->data[*query_var][*subject_var];
3268         } else {
3269             score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3270             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3271         }
3272         if (score > max_score) {
3273             max_score = score;
3274             max_offset = index1;
3275         }
3276         query_var++; subject_var++;
3277     }
3278 
3279     if (max_score > 0)
3280     {
3281         *q_retval = max_offset;
3282         *s_retval = (max_offset - q_start) + s_start;
3283         return TRUE;
3284     }
3285     else  /* Test the window around the ends of the HSP. */
3286     {
3287         score=0;
3288         query_var = query + q_start + q_length - HSP_MAX_WINDOW;
3289         subject_var = subject + s_start + s_length - HSP_MAX_WINDOW;
3290         for (index1=hsp->query.end-HSP_MAX_WINDOW; index1<hsp->query.end; index1++) {
3291             if (!(positionBased))
3292                 score += sbp->matrix->data[*query_var][*subject_var];
3293             else
3294                 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3295             query_var++; subject_var++;
3296         }
3297         if (score > 0)
3298         {
3299             *q_retval = hsp->query.end - HSP_MAX_WINDOW/2;
3300             *s_retval = hsp->subject.end - HSP_MAX_WINDOW/2;
3301             return TRUE;
3302         }
3303     }
3304     return FALSE;
3305 }
3306 
3307 void
BlastGetStartForGappedAlignmentNucl(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp)3308 BlastGetStartForGappedAlignmentNucl (const Uint1* query, const Uint1* subject,
3309    BlastHSP* hsp)
3310 {
3311     /* We will stop when the identity count reaches to this number */
3312     int hspMaxIdentRun = 10;
3313     const Uint1 *q, *s;
3314     Int4 index, max_offset, score, max_score, q_start, s_start, q_len;
3315     Boolean match, prev_match;
3316     Int4 offset = MIN(hsp->subject.gapped_start - hsp->subject.offset,
3317                       hsp->query.gapped_start - hsp->query.offset);
3318     /* first check if the old value is ok */
3319     q_start = hsp->query.gapped_start;
3320     s_start = hsp->subject.gapped_start;
3321     score = -1;
3322     q = query + q_start;
3323     s = subject + s_start;
3324     q_len = hsp->query.end;
3325     while ((q-query < q_len) && (*q++ == *s++)) {
3326         score++;
3327         if (score > hspMaxIdentRun) return;
3328     }
3329     q = query + q_start;
3330     s = subject + s_start;
3331     while ((q-query >= 0) && (*q-- == *s--)) {
3332         score++;
3333         if (score > hspMaxIdentRun) return;
3334     }
3335     hspMaxIdentRun *= 1.5;  /* Demand larger value if we move */
3336     /* if the old value is not ok, try to find a better point */
3337     q_start = hsp->query.gapped_start - offset;
3338     s_start = hsp->subject.gapped_start - offset;
3339     q_len = MIN(hsp->subject.end - s_start, hsp->query.end - q_start);
3340     q = query + q_start;
3341     s = subject + s_start;
3342     max_score = 0;
3343     max_offset = q_start;
3344     score = 0;
3345     match = FALSE;
3346     prev_match = FALSE;
3347     for (index = q_start; index < q_start + q_len; index++) {
3348         match = (*q++ == *s++);
3349         if (match != prev_match) {
3350             prev_match = match;
3351             if (match) {
3352                 score = 1;
3353             } else if (score > max_score) {
3354                 max_score = score;
3355                 max_offset = index - score/2;
3356             }
3357         } else if (match) {
3358             ++score;
3359             if (score > hspMaxIdentRun) {
3360                 max_offset = index - hspMaxIdentRun/2;
3361                 hsp->query.gapped_start = max_offset;
3362                 hsp->subject.gapped_start = max_offset + s_start - q_start;
3363                 return;
3364             }
3365         }
3366     }
3367     if (match && score > max_score) {
3368         max_score = score;
3369         max_offset = index - score/2;
3370     }
3371     if (max_score > 0) {
3372         hsp->query.gapped_start = max_offset;
3373         hsp->subject.gapped_start = max_offset + s_start - q_start;
3374     }
3375 }
3376 
3377 Int4
BlastGetStartForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,Uint4 q_start,Uint4 q_length,Uint4 s_start,Uint4 s_length)3378 BlastGetStartForGappedAlignment (const Uint1* query, const Uint1* subject,
3379    const BlastScoreBlk* sbp, Uint4 q_start, Uint4 q_length,
3380    Uint4 s_start, Uint4 s_length)
3381 {
3382     Int4 index1, max_offset, score, max_score, hsp_end;
3383     const Uint1* query_var,* subject_var;
3384     Boolean positionBased = (sbp->psi_matrix != NULL);
3385 
3386     if (q_length <= HSP_MAX_WINDOW) {
3387         max_offset = q_start + q_length/2;
3388         return max_offset;
3389     }
3390 
3391     hsp_end = q_start + HSP_MAX_WINDOW;
3392     query_var = query + q_start;
3393     subject_var = subject + s_start;
3394     score=0;
3395     for (index1=q_start; index1<hsp_end; index1++) {
3396         if (!(positionBased))
3397             score += sbp->matrix->data[*query_var][*subject_var];
3398         else
3399             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3400         query_var++; subject_var++;
3401     }
3402     max_score = score;
3403     max_offset = hsp_end - 1;
3404     hsp_end = q_start + MIN(q_length, s_length);
3405     for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3406         if (!(positionBased)) {
3407             score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3408             score += sbp->matrix->data[*query_var][*subject_var];
3409         } else {
3410             score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3411             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3412         }
3413         if (score > max_score) {
3414             max_score = score;
3415             max_offset = index1;
3416         }
3417         query_var++; subject_var++;
3418     }
3419     if (max_score > 0)
3420        max_offset -= HSP_MAX_WINDOW/2;
3421     else
3422        max_offset = q_start;
3423 
3424     return max_offset;
3425 }
3426 
BLAST_GetGappedScore(EBlastProgramType program_number,BLAST_SequenceBlk * query,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,const BlastHitSavingParameters * hit_params,BlastInitHitList * init_hitlist,BlastHSPList ** hsp_list_ptr,BlastGappedStats * gapped_stats,Boolean * fence_hit)3427 Int2 BLAST_GetGappedScore (EBlastProgramType program_number,
3428         BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
3429         BLAST_SequenceBlk* subject,
3430         BlastGapAlignStruct* gap_align,
3431         const BlastScoringParameters* score_params,
3432         const BlastExtensionParameters* ext_params,
3433         const BlastHitSavingParameters* hit_params,
3434         BlastInitHitList* init_hitlist,
3435         BlastHSPList** hsp_list_ptr, BlastGappedStats* gapped_stats,
3436         Boolean * fence_hit)
3437 {
3438    Int4 index;
3439    BlastInitHSP* init_hsp = NULL;
3440    BlastInitHSP* init_hsp_array;
3441    Int4 q_start, s_start, q_end, s_end;
3442    Boolean is_prot;
3443    Boolean* restricted_align_array = NULL; /* if allocated, restricted
3444                                               alignment option per query */
3445    Boolean is_greedy;
3446    Boolean is_rpsblast = Blast_ProgramIsRpsBlast(program_number);
3447    Int4 max_offset;
3448    Int4 rps_cutoff_index = 0;
3449    Int2 status = 0;
3450    BlastHSPList* hsp_list = NULL;
3451    const BlastHitSavingOptions* hit_options = hit_params->options;
3452    BLAST_SequenceBlk query_tmp;
3453    Int4 context;
3454    BlastIntervalTree *tree;
3455    Int4 score;
3456    Int4* found_high_score = NULL;
3457    Int4 **rpsblast_pssms = NULL;   /* Pointer to concatenated PSSMs in
3458                                        RPS-BLAST database */
3459    const int kHspNumMax = BlastHspNumMax(TRUE, hit_options);
3460    const double kRestrictedMult = 0.68;/* fraction of the ordinary cutoff score
3461                                         used for approximate gapped alignment */
3462    Int4 redo_index = -1; /* these are used for recomputing alignmnets if the */
3463    Int4 redo_query = -1; /* approximate alignment score is inconclusive */
3464 
3465    if (!query || !subject || !gap_align || !score_params || !ext_params ||
3466        !hit_params || !init_hitlist || !hsp_list_ptr)
3467       return 1;
3468 
3469    if (init_hitlist->total == 0)
3470       return 0;
3471 
3472    is_prot = (program_number != eBlastTypeBlastn &&
3473               program_number != eBlastTypePhiBlastn &&
3474               program_number != eBlastTypeMapping);
3475    is_greedy = (ext_params->options->ePrelimGapExt == eGreedyScoreOnly);
3476 
3477    /* turn on approximate gapped alignment if 1) the search
3478       specifies it, and 2) the first, highest-scoring ungapped
3479       alignment in init_hitlist scores below a reduced cutoff.
3480       The second condition is used to avoid computing approximate
3481       alignments that experiment shows are likely to be thrown
3482       away and recomputed optimally */
3483 
3484    if (hit_params->restricted_align && !score_params->options->is_ooframe) {
3485        int i;
3486        Boolean* found = calloc(query_info->last_context + 1, sizeof(Boolean));
3487 
3488        restricted_align_array =
3489            (Boolean*)calloc(Blast_GetQueryIndexFromContext(
3490                                                query_info->last_context,
3491                                                program_number) + 1,
3492                             sizeof(Boolean));
3493 
3494        /* find the first ungapped alignment for each query and determine
3495           whetherto use approximate gapped alignment for the query */
3496        for (i = 0;i < init_hitlist->total;i++) {
3497            int contxt; /* context */
3498            int query_index = -1;
3499            Int4 q_off;
3500 
3501            init_hsp = &init_hitlist->init_hsp_array[i];
3502            /* find query index */
3503            q_off = init_hsp->offsets.qs_offsets.q_off;
3504            for (contxt = query_info->first_context;
3505                 contxt <= query_info->last_context; contxt++) {
3506 
3507                if (q_off >= query_info->contexts[contxt].query_offset &&
3508                    q_off < query_info->contexts[contxt].query_offset +
3509                    query_info->contexts[contxt].query_length) {
3510 
3511                    query_index = Blast_GetQueryIndexFromContext(contxt,
3512                                                                 program_number);
3513                    break;
3514                }
3515            }
3516            ASSERT(query_index >= 0);
3517 
3518            /* if this is not the first initial hit for the query, disregard it */
3519            if (found[query_index]) {
3520                continue;
3521            }
3522 
3523            found[query_index] = TRUE;
3524 
3525            /* use approximate gapped alignment if the highest scoring ungapped
3526               alignment scores below the reduced cutoff */
3527            if (init_hsp->ungapped_data && init_hsp->ungapped_data->score <
3528                (Int4)(kRestrictedMult *
3529                       hit_params->cutoffs[contxt].cutoff_score)) {
3530 
3531                restricted_align_array[query_index] = TRUE;
3532            }
3533            else {
3534                restricted_align_array[query_index] = FALSE;
3535            }
3536        }
3537        if (found) {
3538            free(found);
3539        }
3540    }
3541 
3542    if (is_rpsblast) {
3543       rpsblast_pssms = gap_align->sbp->psi_matrix->pssm->data;
3544       if (program_number == eBlastTypeRpsBlast)
3545          rps_cutoff_index = subject->oid;
3546       else
3547          rps_cutoff_index = subject->oid * NUM_FRAMES +
3548                             BLAST_FrameToContext(subject->frame,
3549                                                  program_number);
3550    }
3551 
3552    ASSERT(Blast_InitHitListIsSortedByScore(init_hitlist));
3553 
3554    if (*hsp_list_ptr == NULL)
3555       *hsp_list_ptr = hsp_list = Blast_HSPListNew(kHspNumMax);
3556    else
3557       hsp_list = *hsp_list_ptr;
3558 
3559    /* Initialize the interval tree with the maximum possible
3560       query and subject offsets. For query sequences this is always
3561       query->length, and for subject sequences it is subject->length
3562       except for out-of-frame tblastn. In that case, subject->length
3563       is the length of one strand of the subject sequence, but the
3564       subject offsets of each ungapped alignment are wrt all six subject
3565       frames concatenated together. Finally, add 1 to the maximum,
3566       to account for HSPs that include the end of a sequence */
3567 
3568    if (Blast_SubjectIsTranslated(program_number) &&
3569        score_params->options->is_ooframe) {
3570       /* OOF search only supported for tblastn and blastx. (blastx
3571 	 does not have a translated subject, so can't get here.) */
3572       ASSERT(program_number == eBlastTypeTblastn);
3573 
3574       tree = Blast_IntervalTreeInit(0, query->length+1,
3575                                     0, 2*(subject->length + CODON_LENGTH)+1);
3576    }
3577    else {
3578       tree = Blast_IntervalTreeInit(0, query->length+1,
3579                                     0, subject->length+1);
3580    }
3581    if (!tree)
3582      return BLASTERR_MEMORY;
3583 
3584 
3585    init_hsp_array = init_hitlist->init_hsp_array;
3586    found_high_score = (Int4*) calloc(query_info->num_queries, sizeof(Int4));
3587    if (hit_params->low_score)
3588    {
3589    	for (index=0; index<init_hitlist->total; index++)
3590    	{
3591       	    int query_index =
3592        		  Blast_GetQueryIndexFromQueryOffset(init_hsp_array[index].offsets.qs_offsets.q_off, program_number, query_info);
3593       	    if (init_hsp_array[index].ungapped_data->score > hit_params->low_score[query_index])
3594         	found_high_score[query_index] = 1;
3595         }
3596    }
3597 
3598    for (index=0; index<init_hitlist->total; index++)
3599    {
3600       BlastHSP tmp_hsp;
3601       BlastInitHSP tmp_init_hsp;
3602       BlastUngappedData tmp_ungapped_data;
3603       int query_index=0;
3604       Boolean restricted_alignment = FALSE;
3605 
3606       /* make a local copy of the initial HSP */
3607 
3608       tmp_init_hsp = init_hsp_array[index];
3609       if (tmp_init_hsp.ungapped_data) {
3610           tmp_ungapped_data = *(init_hsp_array[index].ungapped_data);
3611           tmp_init_hsp.ungapped_data = &tmp_ungapped_data;
3612       }
3613       init_hsp = &tmp_init_hsp;
3614 
3615       s_AdjustHspOffsetsAndGetQueryData(query, query_info, init_hsp,
3616                                         &query_tmp, &context);
3617 
3618       query_index = Blast_GetQueryIndexFromContext(context, program_number);
3619 
3620       /* If redo_index > -1 and redo_query > -1, the main loop is recomputing
3621          gappaed alignments for a single query, becuase the approximate
3622          alignment score was inconclusive. This recomputing was triggered when
3623          index was equal to redo_index. Until index reaches redo_index again,
3624          skip all concatenated queries with index different from redo_query */
3625 
3626       if (index < redo_index && query_index != redo_query) {
3627           continue;
3628       }
3629 
3630       if (restricted_align_array && restricted_align_array[query_index]) {
3631           restricted_alignment = TRUE;
3632       }
3633 
3634       if (hit_params->low_score)
3635       {
3636       	if (!found_high_score[query_index])
3637          continue;
3638       }
3639 
3640       if (rpsblast_pssms)
3641          gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms +
3642              query_info->contexts[context].query_offset;
3643 
3644       if (!init_hsp->ungapped_data) {
3645          q_start = q_end = init_hsp->offsets.qs_offsets.q_off;
3646          s_start = s_end = init_hsp->offsets.qs_offsets.s_off;
3647          score = INT4_MIN;
3648       } else {
3649          q_start = init_hsp->ungapped_data->q_start;
3650          q_end = q_start + init_hsp->ungapped_data->length;
3651          s_start = init_hsp->ungapped_data->s_start;
3652          s_end = s_start + init_hsp->ungapped_data->length;
3653          score = init_hsp->ungapped_data->score;
3654       }
3655 
3656       tmp_hsp.score = score;
3657       tmp_hsp.context = context;
3658       tmp_hsp.query.offset = q_start;
3659       tmp_hsp.query.end = q_end;
3660       tmp_hsp.query.frame = query_info->contexts[context].frame;
3661       tmp_hsp.subject.offset = s_start;
3662       tmp_hsp.subject.end = s_end;
3663       tmp_hsp.subject.frame = subject->frame;
3664 
3665       /* use priate interval tree when recomputing alignments */
3666       if (!BlastIntervalTreeContainsHSP(tree, &tmp_hsp, query_info,
3667                                         hit_options->min_diag_separation))
3668       {
3669          BlastHSP* new_hsp;
3670          Int4 cutoff, restricted_cutoff = 0;
3671 
3672          if (is_rpsblast)
3673             cutoff = hit_params->cutoffs[rps_cutoff_index].cutoff_score;
3674          else
3675             cutoff = hit_params->cutoffs[context].cutoff_score;
3676 
3677          if (restricted_alignment)
3678              restricted_cutoff = (Int4)(kRestrictedMult * cutoff);
3679 
3680          if (gapped_stats)
3681             ++gapped_stats->extensions;
3682 
3683          if(is_prot && !score_params->options->is_ooframe) {
3684             max_offset =
3685                BlastGetStartForGappedAlignment(query_tmp.sequence,
3686                   subject->sequence, gap_align->sbp,
3687                   init_hsp->ungapped_data->q_start,
3688                   init_hsp->ungapped_data->length,
3689                   init_hsp->ungapped_data->s_start,
3690                   init_hsp->ungapped_data->length);
3691             init_hsp->offsets.qs_offsets.s_off +=
3692                 max_offset - init_hsp->offsets.qs_offsets.q_off;
3693             init_hsp->offsets.qs_offsets.q_off = max_offset;
3694          }
3695 
3696          if (is_prot) {
3697             status = s_BlastProtGappedAlignment(program_number, &query_tmp,
3698                                                 subject, gap_align,
3699                                                 score_params, init_hsp,
3700                                                 restricted_alignment,
3701                                                 fence_hit);
3702 
3703             if (restricted_alignment &&
3704                 gap_align->score < cutoff &&
3705                 gap_align->score >= restricted_cutoff) {
3706 
3707                 /* the alignment score is inconclusive; it may really be
3708                    low-scoring, or the approximate gapped alignment
3709                    could be highly suboptimal. Recomputing the alignment
3710                    would resolve the ambiguity, but when the final list of
3711                    HSPs contains a mix of approximate and exact gapped
3712                    alignments it's possible for a highly suboptimal
3713                    alignment to make it to the traceback phase, where
3714                    its start point is reused. We sidestep this problem
3715                    by throwing away all the previously computed gapped
3716                    alignments and recomputing them exactly. */
3717 
3718                 Int4 index2;
3719                 BlastHSPList* new_hsp_list = Blast_HSPListNew(kHspNumMax);
3720                 ASSERT(restricted_align_array);
3721                 restricted_align_array[query_index] = FALSE;
3722 
3723                 Blast_IntervalTreeReset(tree);
3724 
3725                 /* remove all HSPs computed for the current query */
3726                 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
3727                     Int4 context2 = hsp_list->hsp_array[index2]->context;
3728                     Int4 q_index2 = Blast_GetQueryIndexFromContext(context2,
3729                                                                program_number);
3730 
3731                     if (q_index2 != query_index) {
3732 
3733                         Blast_HSPListSaveHSP(new_hsp_list,
3734                                              hsp_list->hsp_array[index2]);
3735 
3736                         status = BlastIntervalTreeAddHSP(
3737                                                 hsp_list->hsp_array[index2],
3738                                                 tree,
3739                                                 query_info,
3740                                                 eQueryAndSubject);
3741 
3742                         hsp_list->hsp_array[index2] = NULL;
3743 
3744                     }
3745                     else {
3746                         hsp_list->hsp_array[index2] =
3747                             Blast_HSPFree(hsp_list->hsp_array[index2]);
3748                     }
3749                 }
3750                 Blast_HSPListFree(hsp_list);
3751                 hsp_list = new_hsp_list;
3752 
3753                 /* restart the loop over initial hits to recompute
3754                    alignments for the current query */
3755                 redo_index = index;
3756                 redo_query = query_index;
3757                 index = -1;
3758                 continue;
3759             }
3760          } else if (is_greedy) {
3761             if (init_hsp->ungapped_data) {
3762                 init_hsp->offsets.qs_offsets.q_off =
3763                     init_hsp->ungapped_data->q_start + init_hsp->ungapped_data->length/2;
3764                 init_hsp->offsets.qs_offsets.s_off =
3765                     init_hsp->ungapped_data->s_start + init_hsp->ungapped_data->length/2;
3766             }
3767             status = BLAST_GreedyGappedAlignment(
3768                          query_tmp.sequence, subject->sequence,
3769                          query_tmp.length, subject->length,
3770                          gap_align, score_params,
3771                          init_hsp->offsets.qs_offsets.q_off,
3772                          init_hsp->offsets.qs_offsets.s_off,
3773                          (Boolean) TRUE, FALSE, fence_hit);
3774             /* override the alignment start point with the estimate
3775                by the aligner of the best start point */
3776             init_hsp->offsets.qs_offsets.q_off =
3777                                 gap_align->greedy_query_seed_start;
3778             init_hsp->offsets.qs_offsets.s_off =
3779                                 gap_align->greedy_subject_seed_start;
3780          } else {
3781             /*  Assuming the ungapped alignment is long enough to
3782                 contain an 8-letter seed of exact matches, start
3783                 the gapped alignment inside the first byte of the
3784                 seed that contains all matches. Note that even if
3785                 the ungapped alignment is not long enough,
3786                 s_BlastDynProgNtGappedAlignment will still round
3787                 the start point up to the first 4-base boundary
3788                 on the subject sequence */
3789             if (s_end >= (Int4)init_hsp->offsets.qs_offsets.s_off + 8) {
3790                init_hsp->offsets.qs_offsets.s_off += 3;
3791                init_hsp->offsets.qs_offsets.q_off += 3;
3792             }
3793             status = s_BlastDynProgNtGappedAlignment(&query_tmp, subject,
3794                          gap_align, score_params, init_hsp);
3795          }
3796 
3797          if (status) {
3798              if (rpsblast_pssms) {
3799                  gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3800              }
3801    	    sfree(found_high_score);
3802    	    tree = Blast_IntervalTreeFree(tree);
3803             return status;
3804          }
3805 
3806          if (gap_align->score >= cutoff) {
3807              Int2 query_frame = 0;
3808              /* For mixed-frame search, the query frame is determined
3809                 from the offset, not only from context. */
3810              if (score_params->options->is_ooframe &&
3811                  program_number == eBlastTypeBlastx) {
3812                  query_frame = gap_align->query_start % CODON_LENGTH + 1;
3813                  if ((context % NUM_FRAMES) >= CODON_LENGTH)
3814                      query_frame = -query_frame;
3815              } else {
3816                  query_frame = query_info->contexts[context].frame;
3817              }
3818 
3819              status = Blast_HSPInit(gap_align->query_start,
3820                            gap_align->query_stop, gap_align->subject_start,
3821                            gap_align->subject_stop,
3822                            init_hsp->offsets.qs_offsets.q_off,
3823                            init_hsp->offsets.qs_offsets.s_off, context,
3824                            query_frame, subject->frame, gap_align->score,
3825                            &(gap_align->edit_script), &new_hsp);
3826              if (status)
3827 	     {
3828    		sfree(found_high_score);
3829    		tree = Blast_IntervalTreeFree(tree);
3830                 return status;
3831 	     }
3832              status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
3833              if (status)
3834                  break;
3835              status = BlastIntervalTreeAddHSP(new_hsp, tree, query_info,
3836                                      eQueryAndSubject);
3837              if (status)
3838                  break;
3839          }
3840          else {
3841             /* a greedy alignment may have traceback associated with it;
3842                free that traceback if the alignment will not be used */
3843             if (is_greedy) {
3844                gap_align->edit_script = GapEditScriptDelete(
3845                                              gap_align->edit_script);
3846             }
3847          }
3848       }
3849    }
3850 
3851    sfree(found_high_score);
3852    if (restricted_align_array) {
3853        sfree(restricted_align_array);
3854    }
3855    tree = Blast_IntervalTreeFree(tree);
3856    if (rpsblast_pssms) {
3857        gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3858    }
3859 
3860    *hsp_list_ptr = hsp_list;
3861    return status;
3862 }
3863 
3864 /** Out-of-frame gapped alignment wrapper function.
3865  * @param query Query sequence [in]
3866  * @param subject Subject sequence [in]
3867  * @param q_off Offset in query [in]
3868  * @param s_off Offset in subject [in]
3869  * @param private_q_start Extent of alignment in query [out]
3870  * @param private_s_start Extent of alignment in subject [out]
3871  * @param score_only Return score only, without traceback [in]
3872  * @param edit_block Structure to hold generated traceback [out]
3873  * @param gap_align Gapped alignment information and preallocated
3874  *                  memory [in] [out]
3875  * @param score_params Scoring parameters [in]
3876  * @param psi_offset Starting position in PSI-BLAST matrix [in]
3877  * @param reversed Direction of the extension [in]
3878  * @param switch_seq Sequences need to be switched for blastx,
3879  *                   but not for tblastn [in]
3880  */
3881 static Int4
s_OutOfFrameSemiGappedAlignWrap(const Uint1 * query,const Uint1 * subject,Int4 q_off,Int4 s_off,Int4 * private_q_start,Int4 * private_s_start,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 psi_offset,Boolean reversed,Boolean switch_seq)3882 s_OutOfFrameSemiGappedAlignWrap(const Uint1* query, const Uint1* subject, Int4 q_off,
3883    Int4 s_off, Int4* private_q_start, Int4* private_s_start,
3884    Boolean score_only, GapPrelimEditBlock *edit_block,
3885    BlastGapAlignStruct* gap_align,
3886    const BlastScoringParameters* score_params, Int4 psi_offset,
3887    Boolean reversed, Boolean switch_seq)
3888 {
3889    if (switch_seq) {
3890       return s_OutOfFrameGappedAlign(subject, query, s_off, q_off,
3891                 private_s_start, private_q_start, score_only, edit_block,
3892                 gap_align, score_params, psi_offset, reversed);
3893    } else {
3894       return s_OutOfFrameGappedAlign(query, subject, q_off, s_off,
3895                 private_q_start, private_s_start, score_only, edit_block,
3896                 gap_align, score_params, psi_offset, reversed);
3897    }
3898 }
3899 
3900 /** Maximal subject length after which the offsets are adjusted to a
3901  * subsequence.
3902  */
3903 #define MAX_SUBJECT_OFFSET 90000
3904 /** Approximate upper bound on a number of gaps in an HSP, needed to determine
3905  * the length of the subject subsequence to be retrieved for alignment with
3906  * traceback.
3907  */
3908 #define MAX_TOTAL_GAPS 3000
3909 
3910 void
AdjustSubjectRange(Int4 * subject_offset_ptr,Int4 * subject_length_ptr,Int4 query_offset,Int4 query_length,Int4 * start_shift)3911 AdjustSubjectRange(Int4* subject_offset_ptr, Int4* subject_length_ptr,
3912 		   Int4 query_offset, Int4 query_length, Int4* start_shift)
3913 {
3914    Int4 s_offset;
3915    Int4 subject_length = *subject_length_ptr;
3916    Int4 max_extension_left, max_extension_right;
3917 
3918    /* If subject sequence is not too long, leave everything as is */
3919    if (subject_length < MAX_SUBJECT_OFFSET) {
3920       *start_shift = 0;
3921       return;
3922    }
3923 
3924    s_offset = *subject_offset_ptr;
3925    /* Maximal extension length is the remaining length in the query, plus
3926       an estimate of a maximal total number of gaps. */
3927    max_extension_left = query_offset + MAX_TOTAL_GAPS;
3928    max_extension_right = query_length - query_offset + MAX_TOTAL_GAPS;
3929 
3930    if (s_offset <= max_extension_left) {
3931       *start_shift = 0;
3932    } else {
3933       *start_shift = s_offset - max_extension_left;
3934       *subject_offset_ptr = max_extension_left;
3935    }
3936 
3937    *subject_length_ptr =
3938       MIN(subject_length, s_offset + max_extension_right) - *start_shift;
3939 }
3940 
3941 /** Performs gapped extension for protein sequences, given two
3942  * sequence blocks, scoring and extension options, and an initial HSP
3943  * with information from the previously performed ungapped extension
3944  * @param program BLAST program [in]
3945  * @param query_blk The query sequence block [in]
3946  * @param subject_blk The subject sequence block [in]
3947  * @param gap_align The auxiliary structure for gapped alignment [in]
3948  * @param score_params Parameters related to scoring [in]
3949  * @param init_hsp The initial HSP information [in]
3950  * @param restricted_alignment If true and search is not out-of-frame,
3951  *              use a faster approximate gapped alignment algorithm [in]
3952  * @param fence_hit If not NULL, pointer to a boolean that is set to
3953  *                  TRUE if gapped extension reaches a neighborhood
3954  *                  of the subject sequence that is not initialized [in][out]
3955  */
3956 static Int2
s_BlastProtGappedAlignment(EBlastProgramType program,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp,Boolean restricted_alignment,Boolean * fence_hit)3957 s_BlastProtGappedAlignment(EBlastProgramType program,
3958    BLAST_SequenceBlk* query_blk, BLAST_SequenceBlk* subject_blk,
3959    BlastGapAlignStruct* gap_align,
3960    const BlastScoringParameters* score_params, BlastInitHSP* init_hsp,
3961    Boolean restricted_alignment,
3962    Boolean * fence_hit)
3963 {
3964    Boolean found_start, found_end;
3965    Int4 q_length=0, s_length=0, score_right, score_left;
3966    Int4 private_q_start, private_s_start;
3967    Uint1* query=NULL,* subject=NULL;
3968    Boolean switch_seq = FALSE;
3969    Int4 query_length = query_blk->length;
3970    Int4 subject_length = subject_blk->length;
3971    Int4 subject_shift = 0;
3972    BlastScoringOptions *score_options = score_params->options;
3973 
3974    if (gap_align == NULL)
3975       return -1;
3976 
3977    if (score_options->is_ooframe) {
3978       ASSERT(program == eBlastTypeTblastn || program == eBlastTypeBlastx);
3979 
3980       q_length = init_hsp->offsets.qs_offsets.q_off;
3981       /* For negative subject frames, make subject offset relative to the part
3982          of the mixed-frame sequence corresponding to the reverse strand. */
3983       if (program == eBlastTypeTblastn && subject_blk->frame < 0)
3984           init_hsp->offsets.qs_offsets.s_off -= subject_length + 1;
3985 
3986       s_length = init_hsp->offsets.qs_offsets.s_off;
3987 
3988       if (program == eBlastTypeBlastx) {
3989          subject = subject_blk->sequence + s_length;
3990          query = query_blk->oof_sequence + CODON_LENGTH + q_length;
3991          query_length -= CODON_LENGTH - 1;
3992          switch_seq = TRUE;
3993       } else if (program == eBlastTypeTblastn) {
3994          subject = subject_blk->oof_sequence + CODON_LENGTH + s_length;
3995          query = query_blk->sequence + q_length;
3996          subject_length -= CODON_LENGTH - 1;
3997       }
3998    } else {
3999       q_length = init_hsp->offsets.qs_offsets.q_off + 1;
4000       s_length = init_hsp->offsets.qs_offsets.s_off + 1;
4001       query = query_blk->sequence;
4002       subject = subject_blk->sequence;
4003    }
4004 
4005    AdjustSubjectRange(&s_length, &subject_length, q_length, query_length,
4006                       &subject_shift);
4007 
4008    found_start = FALSE;
4009    found_end = FALSE;
4010 
4011    /* Looking for "left" score */
4012    score_left = 0;
4013    if (q_length != 0 && s_length != 0) {
4014       found_start = TRUE;
4015       if(score_options->is_ooframe) {
4016          score_left =
4017              s_OutOfFrameSemiGappedAlignWrap(query, subject, q_length, s_length,
4018                  &private_q_start, &private_s_start, TRUE, NULL,
4019                  gap_align, score_params, q_length, TRUE, switch_seq);
4020       } else {
4021          if (restricted_alignment) {
4022             score_left = s_RestrictedGappedAlign(query, subject+subject_shift,
4023                                    q_length, s_length,
4024                                    &private_q_start, &private_s_start,
4025                                    gap_align, score_params,
4026                                    init_hsp->offsets.qs_offsets.q_off, TRUE);
4027          }
4028          else {
4029             score_left = Blast_SemiGappedAlign(query, subject+subject_shift,
4030                                    q_length, s_length,
4031                                    &private_q_start, &private_s_start, TRUE,
4032                                    NULL, gap_align, score_params,
4033                                    init_hsp->offsets.qs_offsets.q_off,
4034                                    FALSE, TRUE,
4035                                    fence_hit);
4036          }
4037       }
4038       gap_align->query_start = q_length - private_q_start;
4039       gap_align->subject_start = s_length - private_s_start + subject_shift;
4040    }
4041 
4042    score_right = 0;
4043    if (q_length < query_length && s_length < subject_length) {
4044       found_end = TRUE;
4045       if(score_options->is_ooframe) {
4046          score_right =
4047              s_OutOfFrameSemiGappedAlignWrap(query-1, subject-1,
4048                  query_length-q_length+1, subject_length-s_length+1,
4049                  &(gap_align->query_stop), &(gap_align->subject_stop),
4050                  TRUE, NULL, gap_align, score_params, q_length, FALSE,
4051                  switch_seq);
4052          gap_align->query_stop += q_length;
4053          gap_align->subject_stop += s_length + subject_shift;
4054       } else {
4055          if (restricted_alignment) {
4056             score_right = s_RestrictedGappedAlign(
4057                                    query+init_hsp->offsets.qs_offsets.q_off,
4058                                    subject+init_hsp->offsets.qs_offsets.s_off,
4059                                    query_length-q_length,
4060                                    subject_length-s_length,
4061                                    &(gap_align->query_stop),
4062                                    &(gap_align->subject_stop),
4063                                    gap_align, score_params,
4064                                    init_hsp->offsets.qs_offsets.q_off, FALSE);
4065          }
4066          else {
4067             score_right = Blast_SemiGappedAlign(
4068                                    query+init_hsp->offsets.qs_offsets.q_off,
4069                                    subject+init_hsp->offsets.qs_offsets.s_off,
4070                                    query_length-q_length,
4071                                    subject_length-s_length,
4072                                    &(gap_align->query_stop),
4073                                    &(gap_align->subject_stop),
4074                                    TRUE, NULL, gap_align, score_params,
4075                                    init_hsp->offsets.qs_offsets.q_off, FALSE,
4076                                    FALSE,
4077                                    fence_hit);
4078          }
4079          /* Make end offsets point to the byte after the end of the
4080             alignment */
4081          gap_align->query_stop += init_hsp->offsets.qs_offsets.q_off + 1;
4082          gap_align->subject_stop += init_hsp->offsets.qs_offsets.s_off + 1;
4083       }
4084    }
4085 
4086    if (found_start == FALSE) {  /* impossible for in-frame */
4087       gap_align->query_start = init_hsp->offsets.qs_offsets.q_off;
4088       gap_align->subject_start = init_hsp->offsets.qs_offsets.s_off;
4089    }
4090    if (found_end == FALSE) {    /* impossible for out-of-frame */
4091       gap_align->query_stop = init_hsp->offsets.qs_offsets.q_off;
4092       gap_align->subject_stop = init_hsp->offsets.qs_offsets.s_off;
4093    }
4094 
4095    gap_align->score = score_right+score_left;
4096 
4097    return 0;
4098 }
4099 
4100 /** Converts OOF traceback from a gapped alignment to a GapEditScript.
4101  * This function is for out-of-frame gapping.
4102  * @param rev_prelim_tback Traceback operations for left extension [in]
4103  * @param fwd_prelim_tback Traceback operations for right extension [in]
4104  * @param nucl_align_length Length of alignment on the translated
4105  *                          nucleotide sequence [in]
4106  * @param edit_script_ptr The resulting edit block [out]
4107  * @return Always zero
4108  */
4109 static Int2
s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback,Int4 nucl_align_length,GapEditScript ** edit_script_ptr)4110 s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock *rev_prelim_tback,
4111                                    GapPrelimEditBlock *fwd_prelim_tback,
4112                                    Int4 nucl_align_length,
4113                                    GapEditScript** edit_script_ptr)
4114 {
4115     GapEditScript* e_script;
4116     EGapAlignOpType last_op;
4117     Int4 last_num;
4118     GapPrelimEditBlock *tmp_prelim_tback;
4119     Int4 i, num_nuc;
4120     int extra_needed=0;
4121 
4122     /* prepend a substitution, since the input sequences were
4123        shifted prior to the OOF alignment */
4124 
4125     tmp_prelim_tback = GapPrelimEditBlockNew();
4126     last_op = eGapAlignSub;
4127     last_num = 1;
4128 
4129     /* Propagate the extra substitution through the edit script. */
4130 
4131     for (i = 0; i < rev_prelim_tback->num_ops; i++) {
4132 
4133         EGapAlignOpType next_op = rev_prelim_tback->edit_ops[i].op_type;
4134         Int4 next_num = rev_prelim_tback->edit_ops[i].num;
4135 
4136         if (next_op == last_op) {
4137 
4138             /* merge consecutive operations that are identical */
4139 
4140             last_num += next_num;
4141         }
4142         else if (next_op == eGapAlignIns || next_op == eGapAlignDel) {
4143 
4144             /* Propagating the initial extra substitution requires
4145                that edit operations which are not in-frame gaps must
4146                'flow through' an in-frame gap; the last non-gap
4147                operation before the gap will appear after the gap.
4148                This could mean either one or two edit blocks
4149                getting created */
4150 
4151             if (last_num > 1) {
4152                 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num - 1);
4153             }
4154             GapPrelimEditBlockAdd(tmp_prelim_tback, next_op, next_num);
4155             last_num = 1;          /* last_op unchanged; one 'orphan' now */
4156         }
4157         else {
4158 
4159             /* write out the last edit operation and
4160                move forward in the script */
4161 
4162             GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4163             last_op = next_op;
4164             last_num = next_num;
4165         }
4166     }
4167 
4168     /* Handle the crossing over from the left extension to
4169        the right extension. Begin by writing out the final
4170        group of ops from the left extension, except for the
4171        one op that must be merged */
4172 
4173     last_num--;
4174     if (last_num > 0) {
4175         GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4176     }
4177 
4178     if (last_op != eGapAlignSub) {
4179 
4180         /* the last traceback letter from the left extension must
4181            be folded into the right extension. */
4182 
4183         for (i = fwd_prelim_tback->num_ops - 1; i >= 0; i--) {
4184 
4185             GapPrelimEditScript *new_script = fwd_prelim_tback->edit_ops + i;
4186 
4187             if (new_script->op_type == eGapAlignIns ||
4188                 new_script->op_type == eGapAlignDel) {
4189 
4190                 /* do not merge with in-frame gaps; just write out
4191                    the gaps and 'flow through' the last op */
4192 
4193                 GapPrelimEditBlockAdd(tmp_prelim_tback, new_script->op_type,
4194                                       new_script->num);
4195             }
4196             else {
4197 
4198                 /* merge with the first letter of new_script */
4199 
4200                 last_op += new_script->op_type - eGapAlignSub;
4201                 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, 1);
4202 
4203                 /* borrow one letter from new_script to complete the
4204                    merge, and skip new_script if it only had one letter */
4205                 new_script->num--;
4206                 if (new_script->num == 0)
4207                     i--;
4208                 break;
4209             }
4210         }
4211         fwd_prelim_tback->num_ops = i + 1;
4212     }
4213 
4214     /* form the final edit block */
4215     e_script = Blast_PrelimEditBlockToGapEditScript(tmp_prelim_tback, fwd_prelim_tback);
4216     GapPrelimEditBlockFree(tmp_prelim_tback);
4217 
4218     /* postprocess the edit script */
4219     num_nuc = 0;
4220     for (i=0; i<e_script->size; i++)
4221     {
4222         int total_actions=0;
4223         /* Count the number of nucleotides in the next
4224            traceback operation and delete any traceback operations
4225            that would make the alignment too long. This check is
4226            not needed for in-frame alignment because the
4227            traceback is never changed after it is computed */
4228 
4229         last_op = e_script->op_type[i];
4230 
4231         if (last_op == eGapAlignIns)
4232             last_op = eGapAlignSub;
4233 
4234         total_actions = last_op * e_script->num[i];
4235 
4236         if (num_nuc + total_actions >= nucl_align_length) {
4237             e_script->num[i] = (nucl_align_length - num_nuc +
4238                              last_op - 1) / last_op;
4239             break; /* We delete the rest of the script. */
4240         }
4241         else {
4242             num_nuc += total_actions;;
4243         }
4244     }
4245     e_script->size = MIN(i+1, e_script->size);  /* If we broke out early then we truncate the edit script. */
4246 
4247     extra_needed = 0;
4248     for (i=0; i<e_script->size; i++)
4249     {
4250         if (e_script->op_type[i] % 3 != 0 && e_script->num[i] > 1) {
4251            extra_needed += e_script->num[i] - 1;
4252         }
4253     }
4254 
4255     if (extra_needed)
4256     {
4257         GapEditScript* new_esp = GapEditScriptNew(extra_needed+e_script->size);
4258         int new_esp_i=0;
4259         for (i=0; i<e_script->size; i++)
4260         {
4261            /* Only one frame shift operation at a time is
4262            allowed in a single edit script element. */
4263            new_esp->num[new_esp_i] = e_script->num[i];
4264            new_esp->op_type[new_esp_i] = e_script->op_type[i];
4265            new_esp_i++;
4266            last_op = e_script->op_type[i];
4267            if (last_op % 3 != 0 && e_script->num[i] > 1) {
4268                Int4 num_ops = e_script->num[i];
4269                int esp_index=0;
4270                new_esp->num[new_esp_i-1] = 1;
4271                for (esp_index = 1; esp_index < num_ops; esp_index++) {
4272                    new_esp->num[new_esp_i] = 1;
4273                    new_esp->op_type[new_esp_i] = last_op;
4274                    new_esp_i++;
4275                }
4276            }
4277        }
4278        e_script = GapEditScriptDelete(e_script);
4279        e_script = new_esp;
4280     }
4281     *edit_script_ptr = e_script;
4282 
4283     /* finally, add one to the size of any block of substitutions
4284        that follows a frame shift op */
4285     last_op = e_script->op_type[0];
4286     for (i=1; i<e_script->size; i++)
4287     {
4288         if (e_script->op_type[i] == eGapAlignSub && (last_op % 3) != 0)
4289             (e_script->num[i])++;
4290 
4291         last_op = e_script->op_type[i];
4292     }
4293 
4294     return 0;
4295 }
4296 
BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,const Uint1 * query,const Uint1 * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_start,Int4 s_start,Int4 query_length,Int4 subject_length,Boolean * fence_hit)4297 Int2 BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,
4298         const Uint1* query, const Uint1* subject, BlastGapAlignStruct* gap_align,
4299         const BlastScoringParameters* score_params,
4300         Int4 q_start, Int4 s_start, Int4 query_length, Int4 subject_length,
4301         Boolean * fence_hit)
4302 {
4303     Boolean found_end;
4304     Int4 score_right, score_left, private_q_length, private_s_length;
4305     Int4 q_length, s_length;
4306     Boolean is_ooframe = score_params->options->is_ooframe;
4307     Int2 status = 0;
4308     Boolean switch_seq = FALSE;
4309     GapPrelimEditBlock *fwd_prelim_tback;
4310     GapPrelimEditBlock *rev_prelim_tback;
4311 
4312     if (gap_align == NULL)
4313         return -1;
4314 
4315     fwd_prelim_tback = gap_align->fwd_prelim_tback;
4316     rev_prelim_tback = gap_align->rev_prelim_tback;
4317     GapPrelimEditBlockReset(fwd_prelim_tback);
4318     GapPrelimEditBlockReset(rev_prelim_tback);
4319 
4320     found_end = FALSE;
4321 
4322     q_length = query_length;
4323     s_length = subject_length;
4324     if (is_ooframe) {
4325        /* The mixed frame sequence is shifted to the 3rd position, so its
4326           maximal available length for extension is less by 2 than its
4327           total length. */
4328        switch_seq = (program == eBlastTypeBlastx);
4329        if (switch_seq) {
4330           q_length -= CODON_LENGTH - 1;
4331        } else {
4332           s_length -= CODON_LENGTH - 1;
4333        }
4334     }
4335 
4336     score_left = 0;
4337 
4338     if(is_ooframe) {
4339        /* NB: Left extension does not include starting point corresponding
4340           to the offset pair; the right extension does. */
4341        score_left =
4342           s_OutOfFrameSemiGappedAlignWrap(query+q_start, subject+s_start,
4343              q_start, s_start, &private_q_length, &private_s_length,
4344              FALSE, rev_prelim_tback, gap_align, score_params, q_start,
4345              TRUE, switch_seq);
4346        gap_align->query_start = q_start - private_q_length;
4347        gap_align->subject_start = s_start - private_s_length;
4348     } else {
4349        /* NB: The left extension includes the starting point
4350           [q_start,s_start]; the right extension does not. */
4351        // <AG> score_left =
4352        //    Blast_SemiGappedAlign(query, subject, q_start+1, s_start+1,
4353        //       &private_q_length, &private_s_length, FALSE, rev_prelim_tback,
4354        //       gap_align, score_params, q_start, FALSE, TRUE,
4355        //       fence_hit);
4356 
4357         score_left = ALIGN_EX(query, subject, q_start+1, s_start+1,  &private_q_length, &private_s_length, rev_prelim_tback,
4358                               gap_align,
4359                               score_params, q_start, FALSE /*reversed*/, TRUE /*reverse_sequence*/,
4360              fence_hit);
4361 
4362        gap_align->query_start = q_start - private_q_length + 1;
4363        gap_align->subject_start = s_start - private_s_length + 1;
4364     }
4365 
4366     score_right = 0;
4367 
4368     if ((! (fence_hit && *fence_hit)) &&
4369         (q_start < q_length) &&
4370         (s_start < s_length)) {
4371 
4372        found_end = TRUE;
4373        if(is_ooframe) {
4374           score_right =
4375              s_OutOfFrameSemiGappedAlignWrap(query+q_start-1, subject+s_start-1,
4376                 q_length-q_start, s_length-s_start,
4377                 &private_q_length, &private_s_length, FALSE, fwd_prelim_tback,
4378                 gap_align, score_params, q_start, FALSE, switch_seq);
4379         } else {
4380             // <AG> score_right =
4381             //    Blast_SemiGappedAlign(query+q_start, subject+s_start,
4382             //       q_length-q_start-1, s_length-s_start-1, &private_q_length,
4383             //       &private_s_length, FALSE, fwd_prelim_tback, gap_align,
4384             //       score_params, q_start, FALSE, FALSE,
4385             //       fence_hit);
4386             score_right =
4387                ALIGN_EX(query+q_start, subject+s_start,
4388                   q_length-q_start-1, s_length-s_start-1, &private_q_length,
4389                   &private_s_length, fwd_prelim_tback, gap_align,
4390                   score_params, q_start, FALSE, FALSE,
4391                   fence_hit);
4392         }
4393 
4394         gap_align->query_stop = q_start + private_q_length + 1;
4395         gap_align->subject_stop = s_start + private_s_length + 1;
4396     }
4397 
4398     if (found_end == FALSE) {
4399         gap_align->query_stop = q_start - 1;
4400         gap_align->subject_stop = s_start - 1;
4401     }
4402 
4403     if(is_ooframe) {
4404         Int4 nucl_align_length;
4405         if (program == eBlastTypeBlastx) {
4406             nucl_align_length = gap_align->query_stop -
4407                                 gap_align->query_start;
4408         }
4409         else {
4410             nucl_align_length = gap_align->subject_stop -
4411                                 gap_align->subject_start;
4412         }
4413         status = s_BlastOOFTracebackToGapEditScript(rev_prelim_tback,
4414                        fwd_prelim_tback, nucl_align_length,
4415                        &gap_align->edit_script);
4416     } else {
4417         Int4 i;
4418         GapEditScript *esp = Blast_PrelimEditBlockToGapEditScript(
4419                                                         rev_prelim_tback,
4420                                                         fwd_prelim_tback);
4421 
4422         /* rarely (typically when the scoring system changes between
4423            the score-only and traceback stages, as happens with
4424            composition-based statistics) it is possible to compute an
4425            optimal alignment with a leading or trailing gap. Prune
4426            these unneeded gaps here and update the score and
4427            alignment boundaries */
4428 
4429         gap_align->edit_script = esp;
4430         if (esp) {
4431             while (esp->size && esp->op_type[0] != eGapAlignSub) {
4432                 score_left += score_params->gap_open +
4433                              esp->num[0] * score_params->gap_extend;
4434 
4435                 if (esp->op_type[0] == eGapAlignDel)
4436                     gap_align->subject_start += esp->num[0];
4437                 else
4438                     gap_align->query_start += esp->num[0];
4439 
4440                 for (i = 1; i < esp->size; i++) {
4441                     esp->op_type[i-1] = esp->op_type[i];
4442                     esp->num[i-1] = esp->num[i];
4443                 }
4444                 esp->size--;
4445             }
4446             i = esp->size;
4447             while (esp->size && esp->op_type[i-1] != eGapAlignSub) {
4448                 score_right += score_params->gap_open +
4449                              esp->num[i-1] * score_params->gap_extend;
4450 
4451                 if (esp->op_type[i-1] == eGapAlignDel)
4452                     gap_align->subject_stop -= esp->num[i-1];
4453                 else
4454                     gap_align->query_stop -= esp->num[i-1];
4455 
4456                 esp->size--;
4457                 i--;
4458                 ASSERT(esp->size == i);
4459             }
4460         }
4461     }
4462 
4463     gap_align->score = score_right + score_left;
4464     return status;
4465 }
4466 
BLAST_GetUngappedHSPList(BlastInitHitList * init_hitlist,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,const BlastHitSavingOptions * hit_options,BlastHSPList ** hsp_list_ptr)4467 Int2 BLAST_GetUngappedHSPList(BlastInitHitList* init_hitlist,
4468                               BlastQueryInfo* query_info,
4469                               BLAST_SequenceBlk* subject,
4470                               const BlastHitSavingOptions* hit_options,
4471                               BlastHSPList** hsp_list_ptr)
4472 {
4473    BlastHSPList* hsp_list = NULL;
4474    Int4 index;
4475    BlastInitHSP* init_hsp;
4476    const int kHspNumMax = BlastHspNumMax(FALSE, hit_options);
4477 
4478    /* The BlastHSPList structure can be allocated and passed from outside */
4479    if (*hsp_list_ptr != NULL)
4480       hsp_list = *hsp_list_ptr;
4481 
4482    if (!init_hitlist) {
4483       if (!hsp_list)
4484          *hsp_list_ptr = NULL;
4485       else
4486          hsp_list->hspcnt = 0;
4487       return 0;
4488    }
4489 
4490    for (index = 0; index < init_hitlist->total; ++index) {
4491       BlastHSP* new_hsp;
4492       BlastUngappedData* ungapped_data=NULL;
4493       Int4 context = 0;
4494       init_hsp = &init_hitlist->init_hsp_array[index];
4495       if (!init_hsp->ungapped_data)
4496          continue;
4497 
4498       if (!hsp_list) {
4499          hsp_list = Blast_HSPListNew(kHspNumMax);
4500          *hsp_list_ptr = hsp_list;
4501       }
4502       /* Adjust the initial HSP's coordinates in case of concatenated
4503          multiple queries/strands/frames */
4504       context = s_GetUngappedHSPContext(query_info, init_hsp);
4505       s_AdjustInitialHSPOffsets(init_hsp,
4506                                 query_info->contexts[context].query_offset);
4507       ungapped_data = init_hsp->ungapped_data;
4508       Blast_HSPInit(ungapped_data->q_start,
4509                     ungapped_data->length+ungapped_data->q_start,
4510                     ungapped_data->s_start,
4511                     ungapped_data->length+ungapped_data->s_start,
4512                     init_hsp->offsets.qs_offsets.q_off,
4513                     init_hsp->offsets.qs_offsets.s_off,
4514                     context, query_info->contexts[context].frame,
4515                     subject->frame, ungapped_data->score, NULL, &new_hsp);
4516       Blast_HSPListSaveHSP(hsp_list, new_hsp);
4517    }
4518 
4519    /* Sort the HSP array by score */
4520    Blast_HSPListSortByScore(hsp_list);
4521 
4522    return 0;
4523 }
4524 
4525