1 /* $Id: blast_gapalign.c,v 1.229 2016/10/11 12:54:13 fukanchi Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  *
27  */
28 
29 /** @file blast_gapalign.c
30  * Functions to perform gapped alignment
31  */
32 
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_gapalign.h>
35 #include <algo/blast/core/blast_util.h> /* for NCBI2NA_UNPACK_BASE macros */
36 #include <algo/blast/core/greedy_align.h>
37 #include "blast_gapalign_priv.h"
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41 
42 static Int2 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
43    BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
44    const BlastScoringParameters* score_params, BlastInitHSP* init_hsp);
45 static Int4 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
46    Int4* pej, Int4* pei, BlastGapAlignStruct* gap_align,
47    const BlastScoringParameters* score_params, Boolean reverse_sequence);
48 
49 static Int2 s_BlastProtGappedAlignment(EBlastProgramType program,
50    BLAST_SequenceBlk* query_in, BLAST_SequenceBlk* subject_in,
51    BlastGapAlignStruct* gap_align,
52    const BlastScoringParameters* score_params,
53    BlastInitHSP* init_hsp, Boolean restricted_alignment,
54    Boolean * fence_hit);
55 
56 /** Lower bound for scores. Divide by two to prevent underflows. */
57 #define MININT INT4_MIN/2
58 
59 /** Minimal size of a chunk for state array allocation. */
60 #define	CHUNKSIZE	2097152
61 
62 /** Retrieve the state structure corresponding to a given length
63  * @param head Pointer to the first element of the state structures
64  *        array [in]
65  * @param length The length for which the state structure has to be
66  *        found [in]
67  * @return The found or created state structure
68  */
69 static GapStateArrayStruct*
s_GapGetState(GapStateArrayStruct ** head,Int4 length)70 s_GapGetState(GapStateArrayStruct** head, Int4 length)
71 
72 {
73    GapStateArrayStruct*	retval,* var,* last;
74    Int4	chunksize = MAX(CHUNKSIZE, length + length/3);
75 
76    length += length/3;	/* Add on about 30% so the end will get reused. */
77    retval = NULL;
78    if (*head == NULL) {
79       retval = (GapStateArrayStruct*)
80          malloc(sizeof(GapStateArrayStruct));
81       retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
82       retval->length = chunksize;
83       retval->used = 0;
84       retval->next = NULL;
85       *head = retval;
86    } else {
87       var = *head;
88       last = *head;
89       while (var) {
90          if (length < (var->length - var->used)) {
91             retval = var;
92             break;
93          } else if (var->used == 0) {
94             /* If it's empty and too small, replace. */
95             sfree(var->state_array);
96             var->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
97             var->length = chunksize;
98             retval = var;
99             break;
100          }
101          last = var;
102          var = var->next;
103       }
104 
105       if (var == NULL)
106       {
107          retval = (GapStateArrayStruct*) malloc(sizeof(GapStateArrayStruct));
108          retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
109          retval->length = chunksize;
110          retval->used = 0;
111          retval->next = NULL;
112          last->next = retval;
113       }
114    }
115 
116 #ifdef ERR_POST_EX_DEFINED
117    if (retval->state_array == NULL)
118       ErrPostEx(SEV_ERROR, 0, 0, "state array is NULL");
119 #endif
120 
121    return retval;
122 
123 }
124 
125 /** Remove a state from a state structure */
126 static Boolean
s_GapPurgeState(GapStateArrayStruct * state_struct)127 s_GapPurgeState(GapStateArrayStruct* state_struct)
128 {
129    while (state_struct)
130    {
131       /*
132 	memset(state_struct->state_array, 0, state_struct->used);
133       */
134       state_struct->used = 0;
135       state_struct = state_struct->next;
136    }
137 
138    return TRUE;
139 }
140 
141 /** Deallocate the memory for greedy gapped alignment */
142 static SGreedyAlignMem*
s_BlastGreedyAlignsFree(SGreedyAlignMem * gamp)143 s_BlastGreedyAlignsFree(SGreedyAlignMem* gamp)
144 {
145    if (gamp->last_seq2_off) {
146       sfree(gamp->last_seq2_off[0]);
147       sfree(gamp->last_seq2_off);
148    } else {
149       if (gamp->last_seq2_off_affine) {
150          sfree(gamp->last_seq2_off_affine[0]);
151          sfree(gamp->last_seq2_off_affine);
152       }
153       sfree(gamp->diag_bounds);
154    }
155    sfree(gamp->max_score);
156    if (gamp->space)
157       MBSpaceFree(gamp->space);
158    sfree(gamp);
159    return NULL;
160 }
161 
162 /** Allocate memory for the greedy gapped alignment algorithm
163  * @param score_params Parameters related to scoring [in]
164  * @param ext_params Options and parameters related to the extension [in]
165  * @param max_d The maximum distance to search [in]
166  * @param Xdrop The Xdrop value [in]
167  * @return The allocated SGreedyAlignMem structure
168  */
169 static SGreedyAlignMem*
s_BlastGreedyAlignMemAlloc(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Int4 max_d,Int4 Xdrop)170 s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params,
171 		       const BlastExtensionParameters* ext_params,
172 		       Int4 max_d, Int4 Xdrop)
173 {
174    SGreedyAlignMem* gamp;
175    Int4 max_d_1, d_diff, max_cost, gd, i;
176    Int4 reward, penalty, gap_open, gap_extend;
177    Int4 Mis_cost, GE_cost;
178 
179    if (score_params == NULL || (!ext_params && !Xdrop))
180       return NULL;
181 
182    if (score_params->reward % 2 == 1) {
183       reward = 2*score_params->reward;
184       penalty = -2*score_params->penalty;
185       if (!Xdrop)
186           Xdrop = 2*MAX(ext_params->gap_x_dropoff,
187                     ext_params->gap_x_dropoff_final);
188       gap_open = 2*score_params->gap_open;
189       gap_extend = 2*score_params->gap_extend;
190    } else {
191       reward = score_params->reward;
192       penalty = -score_params->penalty;
193       if (!Xdrop)
194           Xdrop = MAX(ext_params->gap_x_dropoff,
195                   ext_params->gap_x_dropoff_final);
196       gap_open = score_params->gap_open;
197       gap_extend = score_params->gap_extend;
198    }
199 
200    if (gap_open == 0 && gap_extend == 0)
201       gap_extend = reward / 2 + penalty;
202 
203    gamp = (SGreedyAlignMem*) calloc(1, sizeof(SGreedyAlignMem));
204    gamp->max_dist = max_d;
205    gamp->xdrop = Xdrop;
206 
207    if (score_params->gap_open==0 && score_params->gap_extend==0) {
208       d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
209 
210       gamp->last_seq2_off = (Int4**) malloc((max_d + 2) * sizeof(Int4*));
211       if (gamp->last_seq2_off == NULL) {
212          sfree(gamp);
213          return NULL;
214       }
215       gamp->last_seq2_off[0] =
216          (Int4*) malloc((max_d + max_d + 6) * sizeof(Int4) * 2);
217       if (gamp->last_seq2_off[0] == NULL) {
218 #ifdef ERR_POST_EX_DEFINED
219 	 ErrPostEx(SEV_WARNING, 0, 0,
220               "Failed to allocate %ld bytes for greedy alignment",
221               (max_d + max_d + 6) * sizeof(Int4) * 2);
222 #endif
223          s_BlastGreedyAlignsFree(gamp);
224          return NULL;
225       }
226 
227       gamp->last_seq2_off[1] = gamp->last_seq2_off[0] + max_d + max_d + 6;
228       gamp->last_seq2_off_affine = NULL;
229       gamp->diag_bounds = NULL;
230    } else {
231       gamp->last_seq2_off = NULL;
232       Mis_cost = reward + penalty;
233       GE_cost = gap_extend+reward/2;
234       max_d_1 = max_d;
235       max_d *= GE_cost;
236       max_cost = MAX(Mis_cost, gap_open+GE_cost);
237       gd = BLAST_Gdb3(&Mis_cost, &gap_open, &GE_cost);
238       d_diff = (Xdrop+reward/2)/gd+1;
239       gamp->diag_bounds = (Int4*) calloc(2*(max_d+1+max_cost), sizeof(Int4));
240       gamp->last_seq2_off_affine = (SGreedyOffset**)
241 	 malloc((MAX(max_d, max_cost) + 2) * sizeof(SGreedyOffset*));
242       if (!gamp->diag_bounds || !gamp->last_seq2_off_affine) {
243          s_BlastGreedyAlignsFree(gamp);
244          return NULL;
245       }
246       gamp->last_seq2_off_affine[0] = (SGreedyOffset*)
247 	 calloc((2*max_d_1 + 6) , sizeof(SGreedyOffset) * (max_cost+1));
248       for (i = 1; i <= max_cost; i++)
249 	 gamp->last_seq2_off_affine[i] =
250 	    gamp->last_seq2_off_affine[i-1] + 2*max_d_1 + 6;
251       if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0]) {
252          s_BlastGreedyAlignsFree(gamp);
253          return NULL;
254       }
255    }
256    gamp->max_score = (Int4*) malloc(sizeof(Int4) * (max_d + 1 + d_diff));
257 
258    gamp->space = MBSpaceNew(0);
259    if (!gamp->max_score || !gamp->space)
260       /* Failure in one of the memory allocations */
261       s_BlastGreedyAlignsFree(gamp);
262 
263    return gamp;
264 }
265 
266 /* Documented in blast_gapalign.h */
267 BlastGapAlignStruct*
BLAST_GapAlignStructFree(BlastGapAlignStruct * gap_align)268 BLAST_GapAlignStructFree(BlastGapAlignStruct* gap_align)
269 {
270    if (!gap_align)
271       return NULL;
272 
273    GapEditScriptDelete(gap_align->edit_script);
274    GapPrelimEditBlockFree(gap_align->fwd_prelim_tback);
275    GapPrelimEditBlockFree(gap_align->rev_prelim_tback);
276    if (gap_align->greedy_align_mem)
277       s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
278    GapStateFree(gap_align->state_struct);
279    sfree(gap_align->dp_mem);
280    JumperGapAlignFree(gap_align->jumper);
281 
282    sfree(gap_align);
283    return NULL;
284 }
285 
286 /* Documented in blast_gapalign.h */
287 Int2
BLAST_GapAlignStructNew(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Uint4 max_subject_length,BlastScoreBlk * sbp,BlastGapAlignStruct ** gap_align_ptr)288 BLAST_GapAlignStructNew(const BlastScoringParameters* score_params,
289    const BlastExtensionParameters* ext_params,
290    Uint4 max_subject_length,
291    BlastScoreBlk* sbp, BlastGapAlignStruct** gap_align_ptr)
292 {
293    Int2 status = 0;
294    BlastGapAlignStruct* gap_align;
295 
296    if (!gap_align_ptr || !sbp || !score_params || !ext_params)
297       return -1;
298 
299    gap_align = (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
300 
301    *gap_align_ptr = gap_align;
302 
303    gap_align->sbp = sbp;
304 
305    gap_align->gap_x_dropoff = ext_params->gap_x_dropoff;
306    gap_align->max_mismatches = ext_params->options->max_mismatches;
307    gap_align->mismatch_window = ext_params->options->mismatch_window;
308 
309    /* allocate memory either for dynamic programming or jumper */
310    if (ext_params->options->ePrelimGapExt != eJumperWithTraceback) {
311        if (ext_params->options->ePrelimGapExt == eDynProgScoreOnly) {
312 	   /* allocate structures for ordinary dynamic programming */
313 	   gap_align->dp_mem_alloc = 1000;
314 	   gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
315 						    sizeof(BlastGapDP));
316 	   if (!gap_align->dp_mem)
317 	       gap_align = BLAST_GapAlignStructFree(gap_align);
318        }
319        else {
320 	   /* allocate structures for greedy dynamic programming */
321 	   max_subject_length = MIN(max_subject_length, MAX_DBSEQ_LEN);
322 	   max_subject_length = MIN(GREEDY_MAX_COST,
323 			    max_subject_length / GREEDY_MAX_COST_FRACTION + 1);
324 	   gap_align->greedy_align_mem =
325 	       s_BlastGreedyAlignMemAlloc(score_params, ext_params,
326 					  max_subject_length, 0);
327 	   if (!gap_align->greedy_align_mem)
328 	       gap_align = BLAST_GapAlignStructFree(gap_align);
329        }
330    }
331    else {
332        gap_align->jumper = JumperGapAlignNew(200);
333    }
334 
335    if (!gap_align)
336       return -1;
337 
338    gap_align->positionBased = (sbp->psi_matrix != NULL);
339 
340    gap_align->fwd_prelim_tback = GapPrelimEditBlockNew();
341    gap_align->rev_prelim_tback = GapPrelimEditBlockNew();
342 
343    return status;
344 }
345 
346 /** Values for the editing script operations in traceback */
347 enum {
348     SCRIPT_SUB           = eGapAlignSub,     /**< Substitution */
349     SCRIPT_GAP_IN_A      = eGapAlignDel,     /**< Deletion */
350     SCRIPT_GAP_IN_B      = eGapAlignIns,     /**< Insertion */
351     SCRIPT_OP_MASK       = 0x07, /**< Mask for edit script operations */
352 
353     SCRIPT_EXTEND_GAP_A  = 0x10, /**< continue a gap in A */
354     SCRIPT_EXTEND_GAP_B  = 0x40  /**< continue a gap in B */
355 };
356 
357 Int4
ALIGN_EX(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)358 ALIGN_EX(const Uint1* A, const Uint1* B, Int4 M, Int4 N, Int4* a_offset,
359 	Int4* b_offset, GapPrelimEditBlock *edit_block,
360         BlastGapAlignStruct* gap_align,
361         const BlastScoringParameters* score_params, Int4 query_offset,
362         Boolean reversed, Boolean reverse_sequence,
363         Boolean * fence_hit)
364 {
365     /* See Blast_SemiGappedAlign for more general comments on
366        what this code is doing; comments in this function
367        only apply to the traceback computations */
368 
369     Int4 i;
370     Int4 a_index;
371     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
372     const Uint1* b_ptr;
373 
374     BlastGapDP* score_array;
375 
376     Int4 gap_open;
377     Int4 gap_extend;
378     Int4 gap_open_extend;
379     Int4 x_dropoff;
380     Int4 best_score;
381 
382     Int4** matrix = NULL;
383     Int4** pssm = NULL;
384     Int4* matrix_row = NULL;
385 
386     Int4 score;
387     Int4 score_gap_row;
388     Int4 score_gap_col;
389     Int4 next_score;
390 
391     GapStateArrayStruct* state_struct;
392     Uint1* edit_script_row;
393     Uint1** edit_script;
394     Int4 *edit_start_offset;
395     Int4 edit_script_num_rows;
396     Int4 orig_b_index;
397     Uint1 script, next_script, script_row, script_col;
398     Int4 num_extra_cells;
399 
400     matrix = gap_align->sbp->matrix->data;
401     if (gap_align->positionBased) {
402         pssm = gap_align->sbp->psi_matrix->pssm->data;
403     }
404 
405     *a_offset = 0;
406     *b_offset = 0;
407     gap_open = score_params->gap_open;
408     gap_extend = score_params->gap_extend;
409     gap_open_extend = gap_open + gap_extend;
410     x_dropoff = gap_align->gap_x_dropoff;
411 
412     if (x_dropoff < gap_open_extend)
413         x_dropoff = gap_open_extend;
414 
415     if(N <= 0 || M <= 0)
416         return 0;
417 
418     /* Initialize traceback information. edit_script[] is
419        a 2-D array which is filled in row by row as the
420        dynamic programming takes place */
421 
422     s_GapPurgeState(gap_align->state_struct);
423 
424     /* Conceptually, traceback requires a byte array of size
425        MxN. To save on memory, each row of the array is allocated
426        separately. edit_script[i] points to the storage reserved
427        for row i, and edit_start_offset[i] gives the offset into
428        the B sequence corresponding to element 0 of edit_script[i].
429 
430        Also make the number of edit script rows grow dynamically */
431 
432     edit_script_num_rows = 100;
433     edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
434     edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
435 
436     /* allocate storage for the first row of the traceback
437        array. Because row elements correspond to gaps in A,
438        the alignment can only go x_dropoff/gap_extend positions
439        at most before failing the X dropoff criterion */
440 
441     if (gap_extend > 0)
442         num_extra_cells = x_dropoff / gap_extend + 3;
443     else
444         num_extra_cells = N + 3;
445 
446     if (num_extra_cells > gap_align->dp_mem_alloc) {
447         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
448                                       2 * gap_align->dp_mem_alloc);
449         sfree(gap_align->dp_mem);
450         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
451                                                   sizeof(BlastGapDP));
452     }
453 
454     state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
455 
456     edit_script[0] = state_struct->state_array;
457     edit_start_offset[0] = 0;
458     edit_script_row = state_struct->state_array;
459 
460     score = -gap_open_extend;
461     score_array = gap_align->dp_mem;
462     score_array[0].best = 0;
463     score_array[0].best_gap = -gap_open_extend;
464 
465     for (i = 1; i <= N; i++) {
466         if (score < -x_dropoff)
467             break;
468 
469         score_array[i].best = score;
470         score_array[i].best_gap = score - gap_open_extend;
471         score -= gap_extend;
472         edit_script_row[i] = SCRIPT_GAP_IN_A;
473     }
474     state_struct->used = i + 1;
475 
476     b_size = i;
477     best_score = 0;
478     first_b_index = 0;
479     if (reverse_sequence)
480         b_increment = -1;
481     else
482         b_increment = 1;
483 
484     for (a_index = 1; a_index <= M; a_index++) {
485 
486         /* Set up the next row of the edit script; this involves
487            allocating memory for the row, then pointing to it.
488            It is not necessary to allocate space for offsets less
489            than first_b_index (since the inner loop will never
490            look at them);
491 
492            It is unknown at this point how far to the right the
493            current traceback row will extend; all that's known for
494            sure is that the previous row fails the X-dropoff test
495            after b_size cells, and that the current row can go at
496            most num_extra_cells beyond that before failing the test */
497 
498         if (gap_extend > 0)
499             state_struct = s_GapGetState(&gap_align->state_struct,
500                            b_size - first_b_index + num_extra_cells);
501         else
502             state_struct = s_GapGetState(&gap_align->state_struct,
503                                         N + 3 - first_b_index);
504 
505         if (a_index == edit_script_num_rows) {
506             edit_script_num_rows = edit_script_num_rows * 2;
507             edit_script = (Uint1 **)realloc(edit_script,
508                                             edit_script_num_rows *
509                                             sizeof(Uint1 *));
510             edit_start_offset = (Int4 *)realloc(edit_start_offset,
511                                                 edit_script_num_rows *
512                                                 sizeof(Int4));
513         }
514 
515         edit_script[a_index] = state_struct->state_array +
516                                 state_struct->used + 1;
517         edit_start_offset[a_index] = first_b_index;
518 
519         /* the inner loop assumes the current traceback
520            row begins at offset zero of B */
521 
522         edit_script_row = edit_script[a_index] - first_b_index;
523         orig_b_index = first_b_index;
524 
525         if (!(gap_align->positionBased)) {
526             if(reverse_sequence)
527                 matrix_row = matrix[ A[ M - a_index ] ];
528             else
529                 matrix_row = matrix[ A[ a_index ] ];
530         }
531         else {
532             if(reversed || reverse_sequence)
533                 matrix_row = pssm[M - a_index];
534             else
535                 matrix_row = pssm[a_index + query_offset];
536         }
537 
538         if(reverse_sequence)
539             b_ptr = &B[N - first_b_index];
540         else
541             b_ptr = &B[first_b_index];
542 
543         score = MININT;
544         score_gap_row = MININT;
545         last_b_index = first_b_index;
546 
547         for (b_index = first_b_index; b_index < b_size; b_index++) {
548             int matrix_index = 0;
549 
550             b_ptr += b_increment;
551             score_gap_col = score_array[b_index].best_gap;
552 
553             matrix_index = *b_ptr;
554 
555             if (matrix_index == FENCE_SENTRY) {
556                 if (fence_hit) {
557                     *fence_hit = 1;
558                 }
559                 break;
560             }
561 
562             next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
563 
564             /* script, script_row and script_col contain the
565                actions specified by the dynamic programming.
566                when the inner loop has finished, 'script' con-
567                tains all of the actions to perform, and is
568                written to edit_script[a_index][b_index]. Otherwise,
569                this inner loop is exactly the same as the one
570                in Blast_SemiGappedAlign() */
571 
572             script = SCRIPT_SUB;
573             script_col = SCRIPT_EXTEND_GAP_B;
574             script_row = SCRIPT_EXTEND_GAP_A;
575 
576             if (score < score_gap_col) {
577                 script = SCRIPT_GAP_IN_B;
578                 score = score_gap_col;
579             }
580             if (score < score_gap_row) {
581                 script = SCRIPT_GAP_IN_A;
582                 score = score_gap_row;
583             }
584 
585             if (best_score - score > x_dropoff) {
586 
587                 if (first_b_index == b_index)
588                     first_b_index++;
589                 else
590                     score_array[b_index].best = MININT;
591             }
592             else {
593                 last_b_index = b_index;
594                 if (score > best_score) {
595                     best_score = score;
596                     *a_offset = a_index;
597                     *b_offset = b_index;
598                 }
599 
600                 score_gap_row -= gap_extend;
601                 score_gap_col -= gap_extend;
602                 if (score_gap_col < (score - gap_open_extend)) {
603                     score_array[b_index].best_gap = score - gap_open_extend;
604                 }
605                 else {
606                     score_array[b_index].best_gap = score_gap_col;
607                     script += script_col;
608                 }
609 
610                 if (score_gap_row < (score - gap_open_extend))
611                     score_gap_row = score - gap_open_extend;
612                 else
613                     script += script_row;
614 
615                 score_array[b_index].best = score;
616             }
617 
618             score = next_score;
619             edit_script_row[b_index] = script;
620         }
621 
622         if (first_b_index == b_size || (fence_hit && *fence_hit))
623             break;
624 
625         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
626 
627             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
628                                           2 * gap_align->dp_mem_alloc);
629             score_array = (BlastGapDP *)realloc(score_array,
630                                                gap_align->dp_mem_alloc *
631                                                sizeof(BlastGapDP));
632             gap_align->dp_mem = score_array;
633         }
634 
635 
636         if (last_b_index < b_size - 1) {
637             b_size = last_b_index + 1;
638         }
639         else {
640             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
641 
642                 score_array[b_size].best = score_gap_row;
643                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
644                 score_gap_row -= gap_extend;
645                 edit_script_row[b_size] = SCRIPT_GAP_IN_A;
646                 b_size++;
647             }
648         }
649 
650         /* update the memory allocator to reflect the exact number
651            of traceback cells this row needed */
652 
653         state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
654 
655         if (b_size <= N) {
656             score_array[b_size].best = MININT;
657             score_array[b_size].best_gap = MININT;
658             b_size++;
659         }
660     }
661 
662     /* Pick the optimal path through the now complete
663        edit_script[][]. This is equivalent to flattening
664        the 2-D array into a 1-D list of actions. */
665 
666     a_index = *a_offset;
667     b_index = *b_offset;
668     script = SCRIPT_SUB;
669 
670     if (fence_hit && *fence_hit)
671         goto done;
672 
673     while (a_index > 0 || b_index > 0) {
674         /* Retrieve the next action to perform. Rows of
675            the traceback array do not necessarily start
676            at offset zero of B, so a correction is needed
677            to point to the correct position */
678 
679         next_script =
680             edit_script[a_index][b_index - edit_start_offset[a_index]];
681 
682         switch(script) {
683         case SCRIPT_GAP_IN_A:
684             script = next_script & SCRIPT_OP_MASK;
685             if (next_script & SCRIPT_EXTEND_GAP_A)
686                 script = SCRIPT_GAP_IN_A;
687             break;
688 
689         case SCRIPT_GAP_IN_B:
690             script = next_script & SCRIPT_OP_MASK;
691             if (next_script & SCRIPT_EXTEND_GAP_B)
692                 script = SCRIPT_GAP_IN_B;
693             break;
694 
695         default:
696             script = next_script & SCRIPT_OP_MASK;
697             break;
698         }
699 
700         if (script == SCRIPT_GAP_IN_A) {
701             b_index--;
702         }
703         else if (script == SCRIPT_GAP_IN_B) {
704             a_index--;
705         }
706         else {
707             a_index--;
708             b_index--;
709         }
710         GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
711     }
712 
713  done:
714     sfree(edit_start_offset);
715     sfree(edit_script);
716     return best_score;
717 }
718 
719 Int4
Blast_SemiGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)720 Blast_SemiGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
721    Int4* a_offset, Int4* b_offset, Boolean score_only,
722    GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
723    const BlastScoringParameters* score_params,
724    Int4 query_offset, Boolean reversed, Boolean reverse_sequence,
725    Boolean * fence_hit)
726 {
727     Int4 i;                     /* sequence pointers and indices */
728     Int4 a_index;
729     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
730     const Uint1* b_ptr;
731 
732     BlastGapDP* score_array;
733 
734     Int4 gap_open;              /* alignment penalty variables */
735     Int4 gap_extend;
736     Int4 gap_open_extend;
737     Int4 x_dropoff;
738 
739     Int4** matrix = NULL;       /* pointers to the score matrix */
740     Int4** pssm = NULL;
741     Int4* matrix_row = NULL;
742 
743     Int4 score;                 /* score tracking variables */
744     Int4 score_gap_row;
745     Int4 score_gap_col;
746     Int4 next_score;
747     Int4 best_score;
748     Int4 num_extra_cells;
749 
750     if (!score_only) {
751         return ALIGN_EX(A, B, M, N, a_offset, b_offset, edit_block, gap_align,
752                         score_params, query_offset, reversed, reverse_sequence,
753                         fence_hit);
754     }
755 
756     /* do initialization and sanity-checking */
757 
758     matrix = gap_align->sbp->matrix->data;
759     if (gap_align->positionBased) {
760         pssm = gap_align->sbp->psi_matrix->pssm->data;
761     }
762     *a_offset = 0;
763     *b_offset = 0;
764     gap_open = score_params->gap_open;
765     gap_extend = score_params->gap_extend;
766     gap_open_extend = gap_open + gap_extend;
767     x_dropoff = gap_align->gap_x_dropoff;
768 
769     if (x_dropoff < gap_open_extend)
770         x_dropoff = gap_open_extend;
771 
772     if(N <= 0 || M <= 0)
773         return 0;
774 
775     /* Allocate and fill in the auxiliary bookeeping structures.
776        Since A and B could be very large, maintain a window
777        of auxiliary structures only large enough to contain to current
778        set of DP computations. The initial window size is determined
779        by the number of cells needed to fail the x-dropoff test */
780 
781     if (gap_extend > 0)
782         num_extra_cells = x_dropoff / gap_extend + 3;
783     else
784         num_extra_cells = N + 3;
785 
786     if (num_extra_cells > gap_align->dp_mem_alloc) {
787         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
788                                       2 * gap_align->dp_mem_alloc);
789         sfree(gap_align->dp_mem);
790         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
791                                                   sizeof(BlastGapDP));
792     }
793 
794     score_array = gap_align->dp_mem;
795     score = -gap_open_extend;
796     score_array[0].best = 0;
797     score_array[0].best_gap = -gap_open_extend;
798 
799     for (i = 1; i <= N; i++) {
800         if (score < -x_dropoff)
801             break;
802 
803         score_array[i].best = score;
804         score_array[i].best_gap = score - gap_open_extend;
805         score -= gap_extend;
806     }
807 
808     /* The inner loop below examines letters of B from
809        index 'first_b_index' to 'b_size' */
810 
811     b_size = i;
812     best_score = 0;
813     first_b_index = 0;
814     if (reverse_sequence)
815         b_increment = -1;
816     else
817         b_increment = 1;
818 
819     for (a_index = 1; a_index <= M; a_index++) {
820         /* pick out the row of the score matrix
821            appropriate for A[a_index] */
822 
823         if (!(gap_align->positionBased)) {
824             if(reverse_sequence)
825                 matrix_row = matrix[ A[ M - a_index ] ];
826             else
827                 matrix_row = matrix[ A[ a_index ] ];
828         }
829         else {
830             if(reversed || reverse_sequence)
831                 matrix_row = pssm[M - a_index];
832             else
833                 matrix_row = pssm[a_index + query_offset];
834         }
835 
836         if(reverse_sequence)
837             b_ptr = &B[N - first_b_index];
838         else
839             b_ptr = &B[first_b_index];
840 
841         /* initialize running-score variables */
842         score = MININT;
843         score_gap_row = MININT;
844         last_b_index = first_b_index;
845 
846         for (b_index = first_b_index; b_index < b_size; b_index++) {
847 
848             b_ptr += b_increment;
849             score_gap_col = score_array[b_index].best_gap;
850             next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
851 
852             if (score < score_gap_col)
853                 score = score_gap_col;
854 
855             if (score < score_gap_row)
856                 score = score_gap_row;
857 
858             if (best_score - score > x_dropoff) {
859 
860                 /* the current best score failed the X-dropoff
861                    criterion. Note that this does not stop the
862                    inner loop, only forces future iterations to
863                    skip this column of B.
864 
865                    Also, if the very first letter of B that was
866                    tested failed the X dropoff criterion, make
867                    sure future inner loops start one letter to
868                    the right */
869 
870                 if (b_index == first_b_index)
871                     first_b_index++;
872                 else
873                     score_array[b_index].best = MININT;
874             }
875             else {
876                 last_b_index = b_index;
877                 if (score > best_score) {
878                     best_score = score;
879                     *a_offset = a_index;
880                     *b_offset = b_index;
881                 }
882 
883                 /* If starting a gap at this position will improve
884                    the best row, or column, score, update them to
885                    reflect that. */
886 
887                 score_gap_row -= gap_extend;
888                 score_gap_col -= gap_extend;
889                 score_array[b_index].best_gap = MAX(score - gap_open_extend,
890                                                     score_gap_col);
891                 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
892                 score_array[b_index].best = score;
893             }
894 
895             score = next_score;
896         }
897 
898         /* Finish aligning if the best scores for all positions
899            of B will fail the X-dropoff test, i.e. the inner loop
900            bounds have converged to each other */
901 
902         if (first_b_index == b_size)
903             break;
904 
905         /* enlarge the window for score data if necessary */
906 
907         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
908 
909             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
910                                           2 * gap_align->dp_mem_alloc);
911             score_array = (BlastGapDP *)realloc(score_array,
912                                                gap_align->dp_mem_alloc *
913                                                sizeof(BlastGapDP));
914             gap_align->dp_mem = score_array;
915         }
916 
917         if (last_b_index < b_size - 1) {
918             /* This row failed the X-dropoff test earlier than
919                the last row did; just shorten the loop bounds
920                before doing the next row */
921 
922             b_size = last_b_index + 1;
923         }
924         else {
925             /* The inner loop finished without failing the X-dropoff
926                test; initialize extra bookkeeping structures until
927                the X dropoff test fails or we run out of letters in B.
928                The next inner loop will have larger bounds */
929 
930             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
931                 score_array[b_size].best = score_gap_row;
932                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
933                 score_gap_row -= gap_extend;
934                 b_size++;
935             }
936         }
937 
938         if (b_size <= N) {
939             score_array[b_size].best = MININT;
940             score_array[b_size].best_gap = MININT;
941             b_size++;
942         }
943     }
944 
945     return best_score;
946 }
947 
948 /** For restricted gapped alignment, gaps may only start once in
949     this many sequence offsets */
950 #define RESTRICT_SIZE 10
951 
952 /** Low level function to perform score-only gapped extension
953  *  in one direction. Gaps in one sequence or the other are only
954  *  allowed to start at sequence offsets that are a multiple of
955  *  RESTRICT_SIZE from the alignment start point. For more details see
956  * <PRE>
957  * Michael Cameron, Hugh E. Williams, Adam Cannane, "Improved
958  * Gapped Alignment in BLAST". IEEE/ACM Transactions on Computational
959  * Biology and Bioinformatics, vol 1 #3 (2004) pp 116-129
960  * </PRE>
961  * @param A The query sequence [in]
962  * @param B The subject sequence [in]
963  * @param M Maximal extension length in query [in]
964  * @param N Maximal extension length in subject [in]
965  * @param a_offset Resulting starting offset in query [out]
966  * @param b_offset Resulting starting offset in subject [out]
967  * @param gap_align Structure holding various information and allocated
968  *        memory for the gapped alignment [in]
969  * @param score_params Parameters related to scoring [in]
970  * @param query_offset The starting offset in query [in]
971  * @param reverse_sequence Do reverse the sequence [in]
972  * @return The best alignment score found.
973  */
974 static Int4
s_RestrictedGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reverse_sequence)975 s_RestrictedGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
976    Int4* a_offset, Int4* b_offset,
977    BlastGapAlignStruct* gap_align,
978    const BlastScoringParameters* score_params,
979    Int4 query_offset, Boolean reverse_sequence)
980 {
981     /* see Blast_SemiGappedAlign for general details */
982 
983     Int4 i;
984     Int4 a_index;
985     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
986     const Uint1* b_ptr;
987     Int4 b_gap;
988 
989     BlastGapDP* score_array;
990 
991     Int4 gap_open;
992     Int4 gap_extend;
993     Int4 gap_open_extend;
994     Int4 x_dropoff;
995 
996     Int4** matrix = NULL;
997     Int4** pssm = NULL;
998     Int4* matrix_row = NULL;
999 
1000     Int4 score;
1001     Int4 score_gap_row;
1002     Int4 score_gap_col;
1003     Int4 next_score;
1004     Int4 best_score;
1005     Int4 num_extra_cells;
1006 
1007     matrix = gap_align->sbp->matrix->data;
1008     if (gap_align->positionBased) {
1009         pssm = gap_align->sbp->psi_matrix->pssm->data;
1010     }
1011     *a_offset = 0;
1012     *b_offset = 0;
1013     gap_open = score_params->gap_open;
1014     gap_extend = score_params->gap_extend;
1015     gap_open_extend = gap_open + gap_extend;
1016     x_dropoff = gap_align->gap_x_dropoff;
1017 
1018     if (x_dropoff < gap_open_extend)
1019         x_dropoff = gap_open_extend;
1020 
1021     if(N <= 0 || M <= 0)
1022         return 0;
1023 
1024     if (gap_extend > 0)
1025         num_extra_cells = x_dropoff / gap_extend + 3;
1026     else
1027         num_extra_cells = N + 3;
1028 
1029     if (num_extra_cells > gap_align->dp_mem_alloc) {
1030         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1031                                       2 * gap_align->dp_mem_alloc);
1032         sfree(gap_align->dp_mem);
1033         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1034                                                   sizeof(BlastGapDP));
1035     }
1036 
1037     score_array = gap_align->dp_mem;
1038     score = -gap_open_extend;
1039     score_array[0].best = 0;
1040     score_array[0].best_gap = -gap_open_extend;
1041 
1042     for (i = 1; i <= N; i++) {
1043         if (score < -x_dropoff)
1044             break;
1045 
1046         score_array[i].best = score;
1047         score_array[i].best_gap = score - gap_open_extend;
1048         score -= gap_extend;
1049     }
1050 
1051     b_size = i;
1052     best_score = 0;
1053     first_b_index = 0;
1054     b_gap = 0;
1055     if (reverse_sequence)
1056         b_increment = -1;
1057     else
1058         b_increment = 1;
1059 
1060     for (a_index = 1; a_index <= M; a_index++) {
1061 
1062         if (!(gap_align->positionBased)) {
1063             if(reverse_sequence)
1064                 matrix_row = matrix[ A[ M - a_index ] ];
1065             else
1066                 matrix_row = matrix[ A[ a_index ] ];
1067         }
1068         else {
1069             if(reverse_sequence)
1070                 matrix_row = pssm[M - a_index];
1071             else
1072                 matrix_row = pssm[a_index + query_offset];
1073         }
1074 
1075         if(reverse_sequence)
1076             b_ptr = &B[N - first_b_index];
1077         else
1078             b_ptr = &B[first_b_index];
1079 
1080         score = MININT;
1081         score_gap_row = MININT;
1082         last_b_index = first_b_index;
1083 
1084         /* The double loop that computes the alignment is essentially
1085            unchanged from that of Blast_SemiGappedAlign. The only
1086            real difference is that a gap in A or B is not allowed to
1087            start unless the offset of the gap is divisible by
1088            RESTRICT_SIZE. This allows the ordinary dynamic programming
1089            recurrence relations to be simplified */
1090 
1091         if (a_index % RESTRICT_SIZE != 0) {
1092 
1093             /* a gap will never start in A; do not bother checking
1094                or updating score_gap_row */
1095 
1096             for (b_index = first_b_index; b_index < b_size; b_index++) {
1097 
1098                 b_ptr += b_increment;
1099                 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1100 
1101                 if (b_index != b_gap) {
1102 
1103                     /* the majority of cases fall here; a gap
1104                        may not start in either A or B */
1105 
1106                     if (best_score - score > x_dropoff) {
1107                         score_array[b_index].best = MININT;
1108                         if (b_index == first_b_index)
1109                             first_b_index++;
1110                     }
1111                     else {
1112                         last_b_index = b_index;
1113                         if (score > best_score) {
1114                             best_score = score;
1115                             *a_offset = a_index;
1116                             *b_offset = b_index;
1117                         }
1118                         score_array[b_index].best = score;
1119                     }
1120                 }
1121                 else {
1122 
1123                     /* a gap may start in B. Update b_gap, the
1124                        offset when this will next happen, and
1125                        compute the two-term recurrence */
1126 
1127                     b_gap += RESTRICT_SIZE;
1128                     score_gap_col = score_array[b_index].best_gap;
1129 
1130                     if (score < score_gap_col)
1131                         score = score_gap_col;
1132 
1133                     if (best_score - score > x_dropoff) {
1134                         score_array[b_index].best = MININT;
1135                         if (b_index == first_b_index)
1136                             first_b_index++;
1137                     }
1138                     else {
1139                         last_b_index = b_index;
1140                         if (score > best_score) {
1141                             best_score = score;
1142                             *a_offset = a_index;
1143                             *b_offset = b_index;
1144                         }
1145 
1146                         score_gap_col -= gap_extend;
1147                         score_array[b_index].best_gap =
1148                                    MAX(score - gap_open_extend, score_gap_col);
1149                         score_array[b_index].best = score;
1150                     }
1151                 }
1152                 score = next_score;
1153             }
1154 
1155             score_gap_row = score;
1156         }
1157         else {
1158 
1159             /* gap may start in A */
1160 
1161             for (b_index = first_b_index; b_index < b_size; b_index++) {
1162 
1163                 b_ptr += b_increment;
1164                 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1165 
1166                 if (b_index != b_gap) {
1167 
1168                     /* gap may not start in B. Compute
1169                        the resulting two-term recurrence */
1170 
1171                     if (score < score_gap_row)
1172                         score = score_gap_row;
1173 
1174                     if (best_score - score > x_dropoff) {
1175                         score_array[b_index].best = MININT;
1176                         if (b_index == first_b_index)
1177                             first_b_index++;
1178                     }
1179                     else {
1180                         last_b_index = b_index;
1181                         if (score > best_score) {
1182                             best_score = score;
1183                             *a_offset = a_index;
1184                             *b_offset = b_index;
1185                         }
1186 
1187                         score_gap_row -= gap_extend;
1188                         score_gap_row = MAX(score - gap_open_extend,
1189                                             score_gap_row);
1190                         score_array[b_index].best = score;
1191                     }
1192                 }
1193                 else {
1194 
1195                     /* the ordinary case: a gap may start in A
1196                        or B and the full three-term recurrence
1197                        must be computed. This happens once every
1198                        RESTRICT_SIZE*RESTRICT_SIZE cells */
1199 
1200                     b_gap += RESTRICT_SIZE;
1201                     score_gap_col = score_array[b_index].best_gap;
1202 
1203                     if (score < score_gap_col)
1204                         score = score_gap_col;
1205                     if (score < score_gap_row)
1206                         score = score_gap_row;
1207 
1208                     if (best_score - score > x_dropoff) {
1209                         score_array[b_index].best = MININT;
1210                         if (b_index == first_b_index)
1211                             first_b_index++;
1212                     }
1213                     else {
1214                         last_b_index = b_index;
1215                         if (score > best_score) {
1216                             best_score = score;
1217                             *a_offset = a_index;
1218                             *b_offset = b_index;
1219                         }
1220 
1221                         score_gap_row -= gap_extend;
1222                         score_gap_col -= gap_extend;
1223                         score_array[b_index].best_gap =
1224                                  MAX(score - gap_open_extend, score_gap_col);
1225                         score_gap_row = MAX(score - gap_open_extend,
1226                                             score_gap_row);
1227                         score_array[b_index].best = score;
1228                     }
1229                 }
1230                 score = next_score;
1231             }
1232         }
1233 
1234         if (first_b_index == b_size)
1235             break;
1236 
1237         /* compute the first offset of the next row where a
1238            gap in B may start */
1239 
1240         b_index = first_b_index % RESTRICT_SIZE;
1241         b_gap = first_b_index;
1242         if (b_index > 0)
1243             b_gap += RESTRICT_SIZE - b_index;
1244 
1245         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
1246 
1247             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1248                                           2 * gap_align->dp_mem_alloc);
1249             score_array = (BlastGapDP *)realloc(score_array,
1250                                                gap_align->dp_mem_alloc *
1251                                                sizeof(BlastGapDP));
1252             gap_align->dp_mem = score_array;
1253         }
1254 
1255         if (last_b_index < b_size - 1) {
1256             b_size = last_b_index + 1;
1257         }
1258         else {
1259             /* The inner loop finished without failing the X-dropoff
1260                test; initialize extra bookkeeping structures until
1261                the X dropoff test fails or we run out of letters in B.
1262                The next inner loop will have larger bounds.
1263 
1264                Note that if gaps in A are not allowed for this row, then
1265                score_gap_row is -infinity and the loop below will not
1266                extend the region to explore */
1267 
1268             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
1269                 score_array[b_size].best = score_gap_row;
1270                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
1271                 score_gap_row -= gap_extend;
1272                 b_size++;
1273             }
1274         }
1275 
1276         if (b_size <= N) {
1277             score_array[b_size].best = MININT;
1278             score_array[b_size].best_gap = MININT;
1279             b_size++;
1280         }
1281     }
1282 
1283     return best_score;
1284 }
1285 
1286 /** Editing script operations for out-of-frame traceback */
1287 enum {
1288     SCRIPT_AHEAD_ONE_FRAME       = 1, /**< Shift 1 frame in sequence A
1289                                          (gap 2 nucleotides) */
1290     SCRIPT_AHEAD_TWO_FRAMES      = 2, /**< Shift 2 frames in sequence A
1291                                          (gap 1 nucleotide) */
1292     SCRIPT_NEXT_IN_FRAME         = 3, /**< Shift to next base (substitution) */
1293     SCRIPT_NEXT_PLUS_ONE_FRAME   = 4, /**< Shift to next base plus 1 frame
1294                                          (gap 1 nucleotide in sequence B) */
1295     SCRIPT_NEXT_PLUS_TWO_FRAMES  = 5, /**< Shift to next base plus 2 frames
1296                                          (gap 2 nucleotides in sequence B) */
1297     SCRIPT_OOF_OPEN_GAP          = 0x08 /**< Opening a gap */
1298 };
1299 
1300 /** Low level function to perform gapped extension with out-of-frame
1301  * gapping with traceback.
1302  * @param A The query sequence [in]
1303  * @param B The subject sequence [in]
1304  * @param M Maximal extension length in query [in]
1305  * @param N Maximal extension length in subject [in]
1306  * @param a_offset Resulting starting offset in query [out]
1307  * @param b_offset Resulting starting offset in subject [out]
1308  * @param edit_block Structure to hold generated traceback [out]
1309  * @param gap_align Structure holding various information and allocated
1310  *        memory for the gapped alignment [in]
1311  * @param score_params Parameters related to scoring [in]
1312  * @param query_offset The starting offset in query [in]
1313  * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1314  * @return The best alignment score found.
1315  */
1316 static Int4
s_OutOfFrameAlignWithTraceback(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1317 s_OutOfFrameAlignWithTraceback(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1318    Int4* a_offset, Int4* b_offset, GapPrelimEditBlock *edit_block,
1319    BlastGapAlignStruct* gap_align, const BlastScoringParameters* score_params,
1320    Int4 query_offset, Boolean reversed)
1321 {
1322     Int4 i, increment;          /* sequence pointers and indices */
1323     Int4 a_index;
1324     Int4 b_index, b_size, first_b_index, last_b_index;
1325 
1326     BlastGapDP* score_array;
1327 
1328     Int4 gap_open;              /* alignment penalty variables */
1329     Int4 gap_extend;
1330     Int4 gap_open_extend;
1331     Int4 shift_penalty;
1332     Int4 x_dropoff;
1333 
1334     Int4** matrix = NULL;       /* pointers to the score matrix */
1335     Int4** pssm = NULL;
1336     Int4* matrix_row = NULL;
1337 
1338     Int4 score;                 /* score tracking variables */
1339     Int4 score_row1;
1340     Int4 score_row2;
1341     Int4 score_row3;
1342     Int4 score_gap_col;
1343     Int4 score_col1;
1344     Int4 score_col2;
1345     Int4 score_col3;
1346     Int4 score_other_frame1;
1347     Int4 score_other_frame2;
1348     Int4 best_score;
1349 
1350     GapStateArrayStruct* state_struct;
1351     Uint1** edit_script;
1352     Uint1* edit_script_row;
1353     Int4 *edit_start_offset;
1354     Int4 edit_script_num_rows;
1355     Int4 orig_b_index;
1356     Int1 script, next_script;
1357     Int4 num_extra_cells;
1358 
1359     /* do initialization and sanity-checking */
1360 
1361     matrix = gap_align->sbp->matrix->data;
1362     if (gap_align->positionBased) {
1363         pssm = gap_align->sbp->psi_matrix->pssm->data;
1364     }
1365     *a_offset = 0;
1366     *b_offset = -2;
1367     gap_open = score_params->gap_open;
1368     gap_extend = score_params->gap_extend;
1369     gap_open_extend = gap_open + gap_extend;
1370     shift_penalty = score_params->shift_pen;
1371     x_dropoff = gap_align->gap_x_dropoff;
1372 
1373     if (x_dropoff < gap_open_extend)
1374         x_dropoff = gap_open_extend;
1375 
1376     if(N <= 0 || M <= 0)
1377         return 0;
1378 
1379     /* Initialize traceback information. edit_script[] is
1380        a 2-D array which is filled in row by row as the
1381        dynamic programming takes place */
1382 
1383     s_GapPurgeState(gap_align->state_struct);
1384 
1385     /* Conceptually, traceback requires a byte array of size
1386        MxN. To save on memory, each row of the array is allocated
1387        separately. edit_script[i] points to the storage reserved
1388        for row i, and edit_start_offset[i] gives the offset into
1389        the B sequence corresponding to element 0 of edit_script[i]
1390 
1391        Also make the number of edit script rows grow dynamically */
1392 
1393     edit_script_num_rows = 100;
1394     edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
1395     edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
1396 
1397     /* allocate storage for the first row of the traceback
1398        array. Because row elements correspond to gaps in A,
1399        the alignment can only go at most x_dropoff/gap_extend
1400        positions, in all three frames, before failing the
1401        X dropoff criterion */
1402 
1403     if (gap_extend > 0)
1404         num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
1405     else
1406         num_extra_cells = N + 5;
1407 
1408     if (num_extra_cells > gap_align->dp_mem_alloc) {
1409         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1410                                       2 * gap_align->dp_mem_alloc);
1411         sfree(gap_align->dp_mem);
1412         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1413                                                   sizeof(BlastGapDP));
1414     }
1415 
1416     state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
1417 
1418     edit_script[0] = state_struct->state_array;
1419     edit_start_offset[0] = 0;
1420     edit_script_row = state_struct->state_array;
1421 
1422     score_array = gap_align->dp_mem;
1423     score = -gap_open_extend;
1424     score_array[0].best = 0;
1425     score_array[0].best_gap = -gap_open_extend;
1426 
1427     for (i = 3; i <= N + 2; i += 3) {
1428         score_array[i].best = score;
1429         score_array[i].best_gap = score - gap_open_extend;
1430         edit_script_row[i] = SCRIPT_GAP_IN_B;
1431 
1432         score_array[i-1].best = MININT;
1433         score_array[i-1].best_gap = MININT;
1434         edit_script_row[i-1] = SCRIPT_GAP_IN_B;
1435 
1436         score_array[i-2].best = MININT;
1437         score_array[i-2].best_gap = MININT;
1438         edit_script_row[i-2] = SCRIPT_GAP_IN_B;
1439 
1440         if (score < -x_dropoff)
1441             break;
1442         score -= gap_extend;
1443     }
1444 
1445     /* The inner loop below examines letters of B from
1446        index 'first_b_index' to 'b_size' */
1447 
1448     score_array[i].best = MININT;
1449     score_array[i].best_gap = MININT;
1450     b_size = i - 2;
1451     state_struct->used = b_size + 1;
1452 
1453     best_score = 0;
1454     first_b_index = 0;
1455     if (reversed) {
1456         increment = -1;
1457     }
1458     else {
1459         /* Allow for a backwards frame shift */
1460         B -= 2;
1461         increment = 1;
1462     }
1463 
1464     for (a_index = 1; a_index <= M; a_index++) {
1465 
1466         /* Set up the next row of the edit script; this involves
1467            allocating memory for the row, then pointing to it.
1468            It is not necessary to allocate space for offsets less
1469            than first_b_index (since the inner loop will never
1470            look at them).
1471 
1472            It is unknown at this point how far to the right the
1473            current traceback row will extend; all that's known for
1474            sure is that the previous row fails the X-dropoff test
1475            after b_size cells, and that the current row can go at
1476            most num_extra_cells beyond that before failing the test */
1477 
1478         if (gap_extend > 0)
1479             state_struct = s_GapGetState(&gap_align->state_struct,
1480                            b_size - first_b_index + num_extra_cells);
1481         else
1482             state_struct = s_GapGetState(&gap_align->state_struct,
1483                                         N + 5 - first_b_index);
1484 
1485         if (a_index == edit_script_num_rows) {
1486             edit_script_num_rows = edit_script_num_rows * 2;
1487             edit_script = (Uint1 **)realloc(edit_script,
1488                                             edit_script_num_rows *
1489                                             sizeof(Uint1 *));
1490             edit_start_offset = (Int4 *)realloc(edit_start_offset,
1491                                                 edit_script_num_rows *
1492                                                 sizeof(Int4));
1493         }
1494 
1495         edit_script[a_index] = state_struct->state_array +
1496                                 state_struct->used + 1;
1497         edit_start_offset[a_index] = first_b_index;
1498 
1499         /* the inner loop assumes the current traceback
1500            row begins at offset zero of B */
1501 
1502         edit_script_row = edit_script[a_index] - first_b_index;
1503         orig_b_index = first_b_index;
1504 
1505         if (!(gap_align->positionBased)) {
1506             matrix_row = matrix[ A[ a_index * increment ] ];
1507         }
1508         else {
1509             if(reversed)
1510                 matrix_row = pssm[M - a_index];
1511             else
1512                 matrix_row = pssm[a_index + query_offset];
1513         }
1514 
1515         score = MININT;
1516         score_row1 = MININT;
1517         score_row2 = MININT;
1518         score_row3 = MININT;
1519         score_gap_col = MININT;
1520         score_col1 = MININT;
1521         score_col2 = MININT;
1522         score_col3 = MININT;
1523         score_other_frame1 = MININT;
1524         score_other_frame2 = MININT;
1525         last_b_index = first_b_index;
1526         b_index = first_b_index;
1527 
1528         /* The inner loop is identical to that of OOF_SEMI_G_ALIGN,
1529            except that traceback operations are sprinkled throughout. */
1530 
1531         while (b_index < b_size) {
1532 
1533             /* FRAME 0 */
1534 
1535             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1536             score = MAX(score, score_col1);
1537             if (score == score_col1) {
1538                 script = SCRIPT_NEXT_IN_FRAME;
1539             }
1540             else if (score + shift_penalty == score_other_frame1) {
1541                 if (score_other_frame1 == score_col2)
1542                     script = SCRIPT_AHEAD_TWO_FRAMES;
1543                 else
1544                     script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1545             }
1546             else {
1547                 if (score_other_frame2 == score_col3)
1548                     script = SCRIPT_AHEAD_ONE_FRAME;
1549                 else
1550                     script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1551             }
1552             score += matrix_row[ B[ b_index * increment ] ];
1553 
1554             score_other_frame1 = MAX(score_col1, score_array[b_index].best);
1555             score_col1 = score_array[b_index].best;
1556             score_gap_col = score_array[b_index].best_gap;
1557 
1558             if (score < MAX(score_gap_col, score_row1)) {
1559                 if (score_gap_col > score_row1) {
1560                     score = score_gap_col;
1561                     script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1562                 }
1563                 else {
1564                     score = score_row1;
1565                     script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1566                 }
1567 
1568                 if (best_score - score > x_dropoff) {
1569                     if (first_b_index == b_index)
1570                         first_b_index = b_index + 1;
1571                     else
1572                         score_array[b_index].best = MININT;
1573                 }
1574                 else {
1575                     last_b_index = b_index;
1576                     score_array[b_index].best = score;
1577                     score_array[b_index].best_gap = score_gap_col - gap_extend;
1578                     score_row1 -= gap_extend;
1579                 }
1580             }
1581             else {
1582                 if (best_score - score > x_dropoff) {
1583                     if (first_b_index == b_index)
1584                         first_b_index = b_index + 1;
1585                     else
1586                         score_array[b_index].best = MININT;
1587                 }
1588                 else {
1589                     last_b_index = b_index;
1590                     score_array[b_index].best = score;
1591                     if (score > best_score) {
1592                         best_score = score;
1593                         *a_offset = a_index;
1594                         *b_offset = b_index;
1595                     }
1596 
1597                     score -= gap_open_extend;
1598                     score_row1 -= gap_extend;
1599                     if (score > score_row1)
1600                         score_row1 = score;
1601                     else
1602                         script |= SCRIPT_EXTEND_GAP_B;
1603 
1604                     score_gap_col -= gap_extend;
1605                     if (score < score_gap_col) {
1606                         score_array[b_index].best_gap = score_gap_col;
1607                         script |= SCRIPT_EXTEND_GAP_A;
1608                     }
1609                     else {
1610                         score_array[b_index].best_gap = score;
1611                     }
1612                 }
1613             }
1614 
1615             edit_script_row[b_index++] = script;
1616             if (b_index >= b_size) {
1617                 score = score_row1;
1618                 score_row1 = score_row2;
1619                 score_row2 = score_row3;
1620                 score_row3 = score;
1621                 break;
1622             }
1623 
1624             /* FRAME 1 */
1625 
1626             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1627             score = MAX(score, score_col2);
1628             if (score == score_col2) {
1629                 script = SCRIPT_NEXT_IN_FRAME;
1630             }
1631             else if (score + shift_penalty == score_other_frame1) {
1632                 if (score_other_frame1 == score_col1)
1633                     script = SCRIPT_AHEAD_ONE_FRAME;
1634                 else
1635                     script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1636             }
1637             else {
1638                 if (score_other_frame2 == score_col3)
1639                     script = SCRIPT_AHEAD_TWO_FRAMES;
1640                 else
1641                     script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1642             }
1643             score += matrix_row[ B[ b_index * increment ] ];
1644             score_other_frame2 = MAX(score_col2, score_array[b_index].best);
1645             score_col2 = score_array[b_index].best;
1646             score_gap_col = score_array[b_index].best_gap;
1647 
1648             if (score < MAX(score_gap_col, score_row2)) {
1649                 score = MAX(score_gap_col, score_row2);
1650                 if (best_score - score > x_dropoff) {
1651                     if (first_b_index == b_index)
1652                         first_b_index = b_index + 1;
1653                     else
1654                         score_array[b_index].best = MININT;
1655                 }
1656                 else {
1657                     if (score == score_gap_col)
1658                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1659                     else
1660                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1661 
1662                     last_b_index = b_index;
1663                     score_array[b_index].best = score;
1664                     score_array[b_index].best_gap = score_gap_col - gap_extend;
1665                     score_row2 -= gap_extend;
1666                 }
1667             }
1668             else {
1669                 if (best_score - score > x_dropoff) {
1670                     if (first_b_index == b_index)
1671                         first_b_index = b_index + 1;
1672                     else
1673                         score_array[b_index].best = MININT;
1674                 }
1675                 else {
1676                     last_b_index = b_index;
1677                     score_array[b_index].best = score;
1678                     if (score > best_score) {
1679                         best_score = score;
1680                         *a_offset = a_index;
1681                         *b_offset = b_index;
1682                     }
1683                     score -= gap_open_extend;
1684                     score_row2 -= gap_extend;
1685                     if (score > score_row2)
1686                         score_row2 = score;
1687                     else
1688                         script |= SCRIPT_EXTEND_GAP_B;
1689 
1690                     score_gap_col -= gap_extend;
1691                     if (score < score_gap_col) {
1692                         score_array[b_index].best_gap = score_gap_col;
1693                         script |= SCRIPT_EXTEND_GAP_A;
1694                     }
1695                     else {
1696                         score_array[b_index].best_gap = score;
1697                     }
1698                 }
1699             }
1700 
1701             edit_script_row[b_index++] = script;
1702             if (b_index >= b_size) {
1703                 score = score_row2;
1704                 score_row2 = score_row1;
1705                 score_row1 = score_row3;
1706                 score_row3 = score;
1707                 break;
1708             }
1709 
1710             /* FRAME 2 */
1711 
1712             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1713             score = MAX(score, score_col3);
1714             if (score == score_col3) {
1715                 script = SCRIPT_NEXT_IN_FRAME;
1716             }
1717             else if (score + shift_penalty == score_other_frame1) {
1718                 if (score_other_frame1 == score_col1)
1719                     script = SCRIPT_AHEAD_TWO_FRAMES;
1720                 else
1721                     script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1722             }
1723             else {
1724                 if (score_other_frame2 == score_col2)
1725                     script = SCRIPT_AHEAD_ONE_FRAME;
1726                 else
1727                     script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1728             }
1729             score += matrix_row[ B[ b_index * increment ] ];
1730             score_other_frame1 = score_other_frame2;
1731             score_other_frame2 = MAX(score_col3, score_array[b_index].best);
1732             score_col3 = score_array[b_index].best;
1733             score_gap_col = score_array[b_index].best_gap;
1734 
1735             if (score < MAX(score_gap_col, score_row3)) {
1736                 score = MAX(score_gap_col, score_row3);
1737                 if (best_score - score > x_dropoff) {
1738                     if (first_b_index == b_index)
1739                         first_b_index = b_index + 1;
1740                     else
1741                         score_array[b_index].best = MININT;
1742                 }
1743                 else {
1744                     if (score == score_gap_col)
1745                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1746                     else
1747                         script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1748 
1749                     last_b_index = b_index;
1750                     score_array[b_index].best = score;
1751                     score_array[b_index].best_gap = score_gap_col - gap_extend;
1752                     score_row3 -= gap_extend;
1753                 }
1754             }
1755             else {
1756                 if (best_score - score > x_dropoff) {
1757                     if (first_b_index == b_index)
1758                         first_b_index = b_index + 1;
1759                     else
1760                         score_array[b_index].best = MININT;
1761                 }
1762                 else {
1763                     last_b_index = b_index;
1764                     score_array[b_index].best = score;
1765                     if (score > best_score) {
1766                         best_score = score;
1767                         *a_offset = a_index;
1768                         *b_offset = b_index;
1769                     }
1770                     score -= gap_open_extend;
1771                     score_row3 -= gap_extend;
1772                     if (score > score_row3)
1773                         score_row3 = score;
1774                     else
1775                         script |= SCRIPT_EXTEND_GAP_B;
1776 
1777                     score_gap_col -= gap_extend;
1778                     if (score < score_gap_col) {
1779                         score_array[b_index].best_gap = score_gap_col;
1780                         script |= SCRIPT_EXTEND_GAP_A;
1781                     }
1782                     else {
1783                         score_array[b_index].best_gap = score;
1784                     }
1785                 }
1786             }
1787             edit_script_row[b_index++] = script;
1788         }
1789 
1790         /* Finish aligning if the best scores for all positions
1791            of B will fail the X-dropoff test, i.e. the inner loop
1792            bounds have converged to each other */
1793 
1794         if (first_b_index == b_size)
1795             break;
1796 
1797         /* Enlarge the window for score data if necessary */
1798 
1799         if (last_b_index + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
1800 
1801             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1802                                           2 * gap_align->dp_mem_alloc);
1803             score_array = (BlastGapDP *)realloc(score_array,
1804                                                gap_align->dp_mem_alloc *
1805                                                sizeof(BlastGapDP));
1806             gap_align->dp_mem = score_array;
1807         }
1808 
1809         if (last_b_index < b_size - 1) {
1810             /* This row failed the X-dropoff test earlier than
1811                the last row did; just shorten the loop bounds
1812                before doing the next row */
1813 
1814             b_size = last_b_index + 1;
1815         }
1816         else {
1817             /* The inner loop finished without failing the X-dropoff
1818                test; initialize extra bookkeeping structures until
1819                the X dropoff test fails or we run out of letters in B.
1820                The next inner loop will have larger bounds.
1821 
1822                Keep initializing extra structures until the X dropoff
1823                test fails in all frames for this row */
1824 
1825             score = MAX(score_row1, score_row2);
1826             score = MAX(score, score_row3);
1827             while (score >= (best_score - x_dropoff) && b_size < N + 1) {
1828 
1829                 score_array[b_size].best = score_row1;
1830                 score_array[b_size].best_gap = score_row1 - gap_open_extend;
1831                 score_row1 -= gap_extend;
1832                 edit_script_row[b_size] = SCRIPT_OOF_OPEN_GAP |
1833                                           SCRIPT_GAP_IN_B;
1834 
1835                 score_array[b_size+1].best = score_row2;
1836                 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
1837                 score_row2 -= gap_extend;
1838                 edit_script_row[b_size+1] = SCRIPT_OOF_OPEN_GAP |
1839                                             SCRIPT_GAP_IN_B;
1840 
1841                 score_array[b_size+2].best = score_row3;
1842                 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
1843                 score_row3 -= gap_extend;
1844                 edit_script_row[b_size+2] = SCRIPT_OOF_OPEN_GAP |
1845                                             SCRIPT_GAP_IN_B;
1846                 b_size += 3;
1847                 score -= gap_extend;
1848             }
1849         }
1850 
1851         /* update the memory allocator to reflect the exact number
1852            of traceback cells this row needed */
1853 
1854         b_size = MIN(b_size, N + 1);
1855         state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
1856 
1857         /* chop off the best score in each frame */
1858 
1859         last_b_index = MIN(b_size + 4, N + 3);
1860         while (b_size < last_b_index) {
1861             score_array[b_size].best = MININT;
1862             score_array[b_size].best_gap = MININT;
1863             b_size++;
1864         }
1865     }
1866 
1867     a_index = *a_offset;
1868     b_index = *b_offset;
1869     script = SCRIPT_AHEAD_ONE_FRAME;
1870 
1871     while (a_index > 0 || b_index > 0) {
1872         /* Retrieve the next action to perform. Rows of
1873            the traceback array do not necessarily start
1874            at offset zero of B, so a correction is needed
1875            to point to the correct position */
1876 
1877         next_script =
1878             edit_script[a_index][b_index - edit_start_offset[a_index]];
1879 
1880         switch (script) {
1881         case SCRIPT_GAP_IN_A:
1882             script = next_script & SCRIPT_OP_MASK;
1883             if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_A))
1884                 script = SCRIPT_GAP_IN_A;
1885             break;
1886         case SCRIPT_GAP_IN_B:
1887             script = next_script & SCRIPT_OP_MASK;
1888             if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_B))
1889                 script = SCRIPT_GAP_IN_B;
1890             break;
1891         default:
1892             script = next_script & SCRIPT_OP_MASK;
1893             break;
1894         }
1895 
1896         if (script == SCRIPT_GAP_IN_B) {
1897             b_index -= 3;
1898         }
1899         else {
1900             b_index -= script;
1901             a_index--;
1902         }
1903 
1904         GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
1905     }
1906 
1907     sfree(edit_start_offset);
1908     sfree(edit_script);
1909 
1910     if (!reversed)
1911         *b_offset -= 2;
1912     return best_score;
1913 }
1914 
1915 /** Low level function to perform gapped extension with out-of-frame
1916  * gapping with or without traceback.
1917  * @param A The query sequence [in]
1918  * @param B The subject sequence [in]
1919  * @param M Maximal extension length in query [in]
1920  * @param N Maximal extension length in subject [in]
1921  * @param a_offset Resulting starting offset in query [out]
1922  * @param b_offset Resulting starting offset in subject [out]
1923  * @param score_only Only find the score, without saving traceback [in]
1924  * @param edit_block Structure to hold generated traceback [out]
1925  * @param gap_align Structure holding various information and allocated
1926  *        memory for the gapped alignment [in]
1927  * @param score_params Parameters related to scoring [in]
1928  * @param query_offset The starting offset in query [in]
1929  * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1930  * @return The best alignment score found.
1931  */
1932 static Int4
s_OutOfFrameGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1933 s_OutOfFrameGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1934    Int4* a_offset, Int4* b_offset, Boolean score_only,
1935    GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
1936    const BlastScoringParameters* score_params,
1937    Int4 query_offset, Boolean reversed)
1938 {
1939     Int4 i, increment;          /* sequence pointers and indices */
1940     Int4 a_index;
1941     Int4 b_index, b_size, first_b_index, last_b_index;
1942 
1943     Int4 gap_open;              /* alignment penalty variables */
1944     Int4 gap_extend;
1945     Int4 gap_open_extend;
1946     Int4 shift_penalty;
1947     Int4 x_dropoff;
1948 
1949     BlastGapDP* score_array;
1950     Int4 num_extra_cells;
1951 
1952     Int4** matrix = NULL;       /* pointers to the score matrix */
1953     Int4** pssm = NULL;
1954     Int4* matrix_row = NULL;
1955 
1956     Int4 score;                 /* score tracking variables */
1957     Int4 score_row1;
1958     Int4 score_row2;
1959     Int4 score_row3;
1960     Int4 score_gap_col;
1961     Int4 score_col1;
1962     Int4 score_col2;
1963     Int4 score_col3;
1964     Int4 score_other_frame1;
1965     Int4 score_other_frame2;
1966     Int4 best_score;
1967 
1968     if (!score_only) {
1969         return s_OutOfFrameAlignWithTraceback(A, B, M, N, a_offset, b_offset,
1970                                               edit_block, gap_align,
1971                                               score_params, query_offset,
1972                                               reversed);
1973     }
1974 
1975     /* do initialization and sanity-checking */
1976 
1977     matrix = gap_align->sbp->matrix->data;
1978     if (gap_align->positionBased) {
1979         pssm = gap_align->sbp->psi_matrix->pssm->data;
1980     }
1981     *a_offset = 0;
1982     *b_offset = -2;
1983     gap_open = score_params->gap_open;
1984     gap_extend = score_params->gap_extend;
1985     gap_open_extend = gap_open + gap_extend;
1986     shift_penalty = score_params->shift_pen;
1987     x_dropoff = gap_align->gap_x_dropoff;
1988 
1989     if (x_dropoff < gap_open_extend)
1990         x_dropoff = gap_open_extend;
1991 
1992     if(N <= 0 || M <= 0)
1993         return 0;
1994 
1995     /* Allocate and fill in the auxiliary bookeeping structures.
1996        Since A and B could be very large, maintain a window
1997        of auxiliary structures only large enough to contain to current
1998        set of DP computations. The initial window size is determined
1999        by the number of cells needed to fail the x-dropoff test */
2000 
2001     if (gap_extend > 0)
2002         num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
2003     else
2004         num_extra_cells = N + 5;
2005 
2006     if (num_extra_cells > gap_align->dp_mem_alloc) {
2007         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
2008                                       2 * gap_align->dp_mem_alloc);
2009         sfree(gap_align->dp_mem);
2010         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
2011                                                   sizeof(BlastGapDP));
2012     }
2013 
2014     score_array = gap_align->dp_mem;
2015     score = -gap_open_extend;
2016     score_array[0].best = 0;
2017     score_array[0].best_gap = -gap_open_extend;
2018 
2019     for (i = 3; i <= N + 2; i += 3) {
2020         score_array[i].best = score;
2021         score_array[i].best_gap = score - gap_open_extend;
2022         score_array[i-1].best = MININT;
2023         score_array[i-1].best_gap = MININT;
2024         score_array[i-2].best = MININT;
2025         score_array[i-2].best_gap = MININT;
2026         if (score < -x_dropoff)
2027             break;
2028         score -= gap_extend;
2029     }
2030 
2031     /* The inner loop below examines letters of B from
2032        index 'first_b_index' to 'b_size' */
2033 
2034     b_size = i - 2;
2035     score_array[i].best = MININT;
2036     score_array[i].best_gap = MININT;
2037 
2038     best_score = 0;
2039     first_b_index = 0;
2040     if (reversed) {
2041         increment = -1;
2042     }
2043     else {
2044         /* Allow for a backwards frame shift */
2045         B -= 2;
2046         increment = 1;
2047     }
2048 
2049     for (a_index = 1; a_index <= M; a_index++) {
2050 
2051         /* pick out the row of the score matrix
2052            appropriate for A[a_index] */
2053 
2054         if (!(gap_align->positionBased)) {
2055             matrix_row = matrix[ A[ a_index * increment ] ];
2056         }
2057         else {
2058             if(reversed)
2059                 matrix_row = pssm[M - a_index];
2060             else
2061                 matrix_row = pssm[a_index + query_offset];
2062         }
2063 
2064         /* initialize running-score variables */
2065         score = MININT;
2066         score_row1 = MININT;
2067         score_row2 = MININT;
2068         score_row3 = MININT;
2069         score_gap_col = MININT;
2070         score_col1 = MININT;
2071         score_col2 = MININT;
2072         score_col3 = MININT;
2073         score_other_frame1 = MININT;
2074         score_other_frame2 = MININT;
2075         last_b_index = first_b_index;
2076         b_index = first_b_index;
2077 
2078         while (b_index < b_size) {
2079 
2080             /* FRAME 0 */
2081 
2082             /* Pick the best score among all frames */
2083             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2084             score = MAX(score, score_col1) +
2085                                 matrix_row[ B[ b_index * increment ] ];
2086             score_other_frame1 = MAX(score_col1, score_array[b_index].best);
2087             score_col1 = score_array[b_index].best;
2088             score_gap_col = score_array[b_index].best_gap;
2089 
2090             /* Use the row and column scores if they improve
2091                the score overall */
2092 
2093             if (score < MAX(score_gap_col, score_row1)) {
2094                 score = MAX(score_gap_col, score_row1);
2095                 if (best_score - score > x_dropoff) {
2096 
2097                    /* the current best score failed the X-dropoff
2098                       criterion. Note that this does not stop the
2099                       inner loop, only forces future iterations to
2100                       skip this column of B.
2101 
2102                       Also, if the very first letter of B that was
2103                       tested failed the X dropoff criterion, make
2104                       sure future inner loops start one letter to
2105                       the right */
2106 
2107                     if (first_b_index == b_index)
2108                         first_b_index = b_index + 1;
2109                     else
2110                         score_array[b_index].best = MININT;
2111                 }
2112                 else {
2113                     /* update the row and column running scores */
2114                     last_b_index = b_index;
2115                     score_array[b_index].best = score;
2116                     score_array[b_index].best_gap = score_gap_col - gap_extend;
2117                     score_row1 -= gap_extend;
2118                 }
2119             }
2120             else {
2121                 if (best_score - score > x_dropoff) {
2122 
2123                    /* the current best score failed the X-dropoff
2124                       criterion. */
2125 
2126                     if (first_b_index == b_index)
2127                         first_b_index = b_index + 1;
2128                     else
2129                         score_array[b_index].best = MININT;
2130                 }
2131                 else {
2132                     /* The current best score exceeds the
2133                        row and column scores, and thus may
2134                        improve on the current optimal score */
2135 
2136                     last_b_index = b_index;
2137                     score_array[b_index].best = score;
2138                     if (score > best_score) {
2139                         best_score = score;
2140                         *a_offset = a_index;
2141                         *b_offset = b_index;
2142                     }
2143 
2144                     /* compute the best scores that include gaps
2145                        or gap extensions */
2146 
2147                     score -= gap_open_extend;
2148                     score_row1 -= gap_extend;
2149                     score_row1 = MAX(score, score_row1);
2150                     score_array[b_index].best_gap = MAX(score,
2151                                                   score_gap_col - gap_extend);
2152                 }
2153             }
2154 
2155             /* If this was the last letter of B checked, rotate
2156                the row scores so that code beyond the inner loop
2157                works correctly */
2158 
2159             if (++b_index >= b_size) {
2160                 score = score_row1;
2161                 score_row1 = score_row2;
2162                 score_row2 = score_row3;
2163                 score_row3 = score;
2164                 break;
2165             }
2166 
2167             /* FRAME 1 */
2168 
2169             /* This code, and that for Frame 2, are essentially the
2170                same as the preceeding code. The only real difference
2171                is the updating of the other_frame best scores */
2172 
2173             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2174             score = MAX(score, score_col2) +
2175                                 matrix_row[ B[ b_index * increment ] ];
2176             score_other_frame2 = MAX(score_col2, score_array[b_index].best);
2177             score_col2 = score_array[b_index].best;
2178             score_gap_col = score_array[b_index].best_gap;
2179 
2180             if (score < MAX(score_gap_col, score_row2)) {
2181                 score = MAX(score_gap_col, score_row2);
2182                 if (best_score - score > x_dropoff) {
2183                     if (first_b_index == b_index)
2184                         first_b_index = b_index + 1;
2185                     else
2186                         score_array[b_index].best = MININT;
2187                 }
2188                 else {
2189                     last_b_index = b_index;
2190                     score_array[b_index].best = score;
2191                     score_array[b_index].best_gap = score_gap_col - gap_extend;
2192                     score_row2 -= gap_extend;
2193                 }
2194             }
2195             else {
2196                 if (best_score - score > x_dropoff) {
2197                     if (first_b_index == b_index)
2198                         first_b_index = b_index + 1;
2199                     else
2200                         score_array[b_index].best = MININT;
2201                 }
2202                 else {
2203                     last_b_index = b_index;
2204                     score_array[b_index].best = score;
2205                     if (score > best_score) {
2206                         best_score = score;
2207                         *a_offset = a_index;
2208                         *b_offset = b_index;
2209                     }
2210                     score -= gap_open_extend;
2211                     score_row2 -= gap_extend;
2212                     score_row2 = MAX(score, score_row2);
2213                     score_array[b_index].best_gap = MAX(score,
2214                                                   score_gap_col - gap_extend);
2215                 }
2216             }
2217 
2218             if (++b_index >= b_size) {
2219                 score = score_row2;
2220                 score_row2 = score_row1;
2221                 score_row1 = score_row3;
2222                 score_row3 = score;
2223                 break;
2224             }
2225 
2226             /* FRAME 2 */
2227 
2228             score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2229             score = MAX(score, score_col3) +
2230                                 matrix_row[ B[ b_index * increment ] ];
2231             score_other_frame1 = score_other_frame2;
2232             score_other_frame2 = MAX(score_col3, score_array[b_index].best);
2233             score_col3 = score_array[b_index].best;
2234             score_gap_col = score_array[b_index].best_gap;
2235 
2236             if (score < MAX(score_gap_col, score_row3)) {
2237                 score = MAX(score_gap_col, score_row3);
2238                 if (best_score - score > x_dropoff) {
2239                     if (first_b_index == b_index)
2240                         first_b_index = b_index + 1;
2241                     else
2242                         score_array[b_index].best = MININT;
2243                 }
2244                 else {
2245                     last_b_index = b_index;
2246                     score_array[b_index].best = score;
2247                     score_array[b_index].best_gap = score_gap_col - gap_extend;
2248                     score_row3 -= gap_extend;
2249                 }
2250             }
2251             else {
2252                 if (best_score - score > x_dropoff) {
2253                     if (first_b_index == b_index)
2254                         first_b_index = b_index + 1;
2255                     else
2256                         score_array[b_index].best = MININT;
2257                 }
2258                 else {
2259                     last_b_index = b_index;
2260                     score_array[b_index].best = score;
2261                     if (score > best_score) {
2262                         best_score = score;
2263                         *a_offset = a_index;
2264                         *b_offset = b_index;
2265                     }
2266                     score -= gap_open_extend;
2267                     score_row3 -= gap_extend;
2268                     score_row3 = MAX(score, score_row3);
2269                     score_array[b_index].best_gap = MAX(score,
2270                                                   score_gap_col - gap_extend);
2271                 }
2272             }
2273             b_index++;
2274         }
2275 
2276         /* Finish aligning if the best scores for all positions
2277            of B will fail the X-dropoff test, i.e. the inner loop
2278            bounds have converged to each other */
2279 
2280         if (first_b_index == b_size)
2281             break;
2282 
2283         /* Enlarge the window for score data, if necessary */
2284 
2285         if (b_size + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
2286 
2287             gap_align->dp_mem_alloc = MAX(b_size + num_extra_cells + 100,
2288                                           2 * gap_align->dp_mem_alloc);
2289             score_array = (BlastGapDP *)realloc(score_array,
2290                                                gap_align->dp_mem_alloc *
2291                                                sizeof(BlastGapDP));
2292             gap_align->dp_mem = score_array;
2293         }
2294 
2295         if (last_b_index < b_size - 1) {
2296             /* This row failed the X-dropoff test earlier than
2297                the last row did; just shorten the loop bounds
2298                before doing the next row */
2299 
2300             b_size = last_b_index + 1;
2301         }
2302         else {
2303             /* The inner loop finished without failing the X-dropoff
2304                test; initialize extra bookkeeping structures until
2305                the X dropoff test fails or we run out of letters in B.
2306                The next inner loop will have larger bounds.
2307 
2308                Keep initializing extra structures until the X dropoff
2309                test fails in all frames for this row */
2310 
2311             score = MAX(score_row1, score_row2);
2312             score = MAX(score, score_row3);
2313             while (score >= (best_score - x_dropoff) && b_size < N + 1) {
2314 
2315                 score_array[b_size].best = score_row1;
2316                 score_array[b_size].best_gap = score_row1 - gap_open_extend;
2317                 score_row1 -= gap_extend;
2318 
2319                 score_array[b_size+1].best = score_row2;
2320                 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
2321                 score_row2 -= gap_extend;
2322 
2323                 score_array[b_size+2].best = score_row3;
2324                 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
2325                 score_row3 -= gap_extend;
2326 
2327                 b_size += 3;
2328                 score -= gap_extend;
2329             }
2330         }
2331 
2332         /* chop off the best score in each frame */
2333         b_size = MIN(b_size, N + 1);
2334         last_b_index = MIN(b_size + 4, N + 3);
2335         while (b_size < last_b_index) {
2336             score_array[b_size].best = MININT;
2337             score_array[b_size].best_gap = MININT;
2338             b_size++;
2339         }
2340     }
2341 
2342     if (!reversed) {
2343         /* The sequence was shifted, so length should be adjusted as well */
2344         *b_offset -= 2;
2345     }
2346     return best_score;
2347 }
2348 
2349 /** Simple wrapper around binary search function for BlastQueryInfo structure
2350  * to obtain the context in which this initial ungapped HSP is found
2351  * @param query_info Query information structure [in]
2352  * @param init_hsp Initial HSP [in]
2353  */
2354 static NCBI_INLINE Int4
s_GetUngappedHSPContext(const BlastQueryInfo * query_info,const BlastInitHSP * init_hsp)2355 s_GetUngappedHSPContext(const BlastQueryInfo* query_info,
2356                         const BlastInitHSP* init_hsp)
2357 {
2358     return BSearchContextInfo(init_hsp->offsets.qs_offsets.q_off, query_info);
2359 }
2360 
2361 /** Adjust the HSP offsets in the initial ungapped HSP structure given the
2362  * query start
2363  * @param init_hsp Initial HSP [in|out]
2364  * @param query_start Starting offset of the query sequence in the query info
2365  * structure [in]
2366  */
2367 static NCBI_INLINE void
s_AdjustInitialHSPOffsets(BlastInitHSP * init_hsp,Int4 query_start)2368 s_AdjustInitialHSPOffsets(BlastInitHSP* init_hsp, Int4 query_start)
2369 {
2370     init_hsp->offsets.qs_offsets.q_off -= query_start;
2371     if (init_hsp->ungapped_data) {
2372         init_hsp->ungapped_data->q_start -= query_start;
2373     }
2374     ASSERT(init_hsp->ungapped_data == NULL ||
2375            init_hsp->ungapped_data->q_start >= 0);
2376 }
2377 
2378 /** Set up a BLAST_SequenceBlk structure for a single query sequence
2379  * @param concatenated_query Query sequence block for concatenated query [in]
2380  * @param query_info Query information structure for concatenated query [in]
2381  * @param context Context in which the HSP ocurred [in]
2382  * @param single_query query Output sequence block which will contain adjusted
2383  * sequence pointer and sequence length [out]
2384  * @param query_start Starting offset of the single query sequence in the
2385  * concatenated query [out]
2386   */
2387 static void
s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk * concatenated_query,const BlastQueryInfo * query_info,Int4 context,BLAST_SequenceBlk * single_query,Int4 * query_start)2388 s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk* concatenated_query,
2389                              const BlastQueryInfo* query_info, Int4 context,
2390                              BLAST_SequenceBlk* single_query,
2391                              Int4* query_start)
2392 {
2393     Int4 query_length = 0;
2394     if (concatenated_query->oof_sequence) {
2395         /* Out-of-frame blastx case: all frames of the same parity are mixed
2396            together in a special sequence. */
2397         Int4 query_end_pt = 0;
2398         Int4 mixed_frame_context = context - context % CODON_LENGTH;
2399 
2400         *query_start = query_info->contexts[mixed_frame_context].query_offset;
2401         query_end_pt =
2402             query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_offset +
2403             query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_length;
2404         query_length = query_end_pt - *query_start;
2405         single_query->sequence = NULL;
2406         single_query->oof_sequence = concatenated_query->oof_sequence + *query_start;
2407     } else {
2408         *query_start  = query_info->contexts[context].query_offset;
2409         query_length = query_info->contexts[context].query_length;
2410         single_query->sequence = concatenated_query->sequence + *query_start;
2411         single_query->oof_sequence = NULL;
2412     }
2413     single_query->length = query_length;
2414 }
2415 
2416 
2417 /** Find the HSP offsets relative to the individual query sequence instead of
2418  * the concatenated sequence and retrieve relevant portions of the query
2419  * sequence data.
2420  * @param query Query sequence block [in]
2421  * @param query_info Query information structure, including context offsets
2422  *                   array [in]
2423  * @param init_hsp Initial HSP, this will be modified to hold the adjusted
2424  *                 offsets [in] [out]
2425  * @param query_out query sequence block with modified sequence
2426  *                  pointer and sequence length [out]
2427  * @param context Which context this HSP belongs to? [out]
2428  */
2429 static void
s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk * query,const BlastQueryInfo * query_info,BlastInitHSP * init_hsp,BLAST_SequenceBlk * query_out,Int4 * context)2430 s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk* query,
2431                                   const BlastQueryInfo* query_info,
2432                                   BlastInitHSP* init_hsp,
2433                                   BLAST_SequenceBlk* query_out,
2434                                   Int4* context)
2435 {
2436     Int4 query_start = 0;
2437 
2438     ASSERT(query);
2439     ASSERT(query_info);
2440     ASSERT(query_out);
2441     ASSERT(init_hsp);
2442     ASSERT(context);
2443 
2444     ASSERT(init_hsp->ungapped_data == NULL ||
2445            init_hsp->ungapped_data->q_start >= 0);
2446     *context = s_GetUngappedHSPContext(query_info, init_hsp);
2447     s_SetUpLocalBlastSequenceBlk(query, query_info, *context,
2448                                  query_out, &query_start);
2449     ASSERT(init_hsp->ungapped_data == NULL ||
2450            (init_hsp->ungapped_data->q_start-query_start) >= 0);
2451     s_AdjustInitialHSPOffsets(init_hsp, query_start);
2452 }
2453 
2454 
2455 /** Convert the initial list of traceback actions from a non-OOF
2456  *  gapped alignment into a blast edit script. Note that this routine
2457  *  assumes the input edit blocks have not been reversed or rearranged
2458  *  by calling code
2459  *  @param rev_prelim_tback Traceback from extension to the left [in]
2460  *  @param fwd_prelim_tback Traceback from extension to the right [in]
2461  *  @return Pointer to the resulting edit script, or NULL if there
2462  *          are no traceback actions specified
2463  */
2464 GapEditScript*
Blast_PrelimEditBlockToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback)2465 Blast_PrelimEditBlockToGapEditScript (GapPrelimEditBlock* rev_prelim_tback,
2466                                       GapPrelimEditBlock* fwd_prelim_tback)
2467 {
2468    Boolean merge_ops = FALSE;
2469    GapEditScript* esp;
2470    GapPrelimEditScript *op;
2471    Int4 i;
2472    Int4 index=0;
2473    Int4 size = 0;
2474 
2475    if (rev_prelim_tback == NULL || fwd_prelim_tback == NULL)
2476       return NULL;
2477 
2478    /* The fwd_prelim_tback script will get reversed here as the traceback started from the highest scoring point
2479      and worked backwards. The rev_prelim_tback script does NOT get reversed.  Since it was reversed when the
2480      traceback was produced it's already "forward" */
2481 
2482    if (fwd_prelim_tback->num_ops > 0 && rev_prelim_tback->num_ops > 0 &&
2483        fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].op_type ==
2484          rev_prelim_tback->edit_ops[(rev_prelim_tback->num_ops)-1].op_type)
2485      merge_ops = TRUE;
2486 
2487    size = fwd_prelim_tback->num_ops+rev_prelim_tback->num_ops;
2488    if (merge_ops)
2489      size--;
2490 
2491    esp = GapEditScriptNew(size);
2492 
2493    index = 0;
2494    for (i=0; i < rev_prelim_tback->num_ops; i++) {
2495       op = rev_prelim_tback->edit_ops + i;
2496       esp->op_type[index] = op->op_type;
2497       esp->num[index] = op->num;
2498       index++;
2499    }
2500 
2501    if (fwd_prelim_tback->num_ops == 0)
2502       return esp;
2503 
2504    if (merge_ops)
2505        esp->num[index-1] += fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].num;
2506 
2507    /* If we merge, then we skip the first one. */
2508    if (merge_ops)
2509       i = fwd_prelim_tback->num_ops - 2;
2510    else
2511       i = fwd_prelim_tback->num_ops - 1;
2512 
2513    for (; i >= 0; i--) {
2514       op = fwd_prelim_tback->edit_ops + i;
2515       esp->op_type[index] = op->op_type;
2516       esp->num[index] = op->num;
2517       index++;
2518    }
2519 
2520    return esp;
2521 }
2522 
2523 /** Fills the BlastGapAlignStruct structure with the results
2524  *  of a greedy gapped extension.
2525  * @param gap_align the initialized gapped alignment structure [in] [out]
2526  * @param q_start The starting offset in query [in]
2527  * @param s_start The starting offset in subject [in]
2528  * @param q_end The ending offset in query [in]
2529  * @param s_end The ending offset in subject [in]
2530  * @param q_seed_start The query offset of the alignment start point [in]
2531  * @param s_seed_start The subject offset of the alignment start point [in]
2532  * @param score The alignment score [in]
2533  * @param esp The edit script containing the traceback information [in]
2534  */
2535 static Int2
s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct * gap_align,Int4 q_start,Int4 s_start,Int4 q_end,Int4 s_end,Int4 q_seed_start,Int4 s_seed_start,Int4 score,GapEditScript * esp)2536 s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct* gap_align,
2537                                 Int4 q_start, Int4 s_start,
2538                                 Int4 q_end, Int4 s_end,
2539                                 Int4 q_seed_start, Int4 s_seed_start,
2540                                 Int4 score, GapEditScript* esp)
2541 {
2542    gap_align->query_start = q_start;
2543    gap_align->query_stop = q_end;
2544    gap_align->subject_start = s_start;
2545    gap_align->subject_stop = s_end;
2546    gap_align->greedy_query_seed_start = q_seed_start;
2547    gap_align->greedy_subject_seed_start = s_seed_start;
2548    gap_align->score = score;
2549 
2550    if (esp)
2551       gap_align->edit_script = esp;
2552 
2553    return 0;
2554 }
2555 
s_UpdateEditScript(GapEditScript * esp,int pos,int bf,int af)2556 static void s_UpdateEditScript(GapEditScript* esp, int pos, int bf, int af) {
2557    int op, qd, sd;
2558 
2559    if (bf > 0) {
2560       op = pos;
2561       qd = sd = bf;
2562       do {
2563           if (--op < 0) return;
2564           switch(esp->op_type[op]) {
2565           case eGapAlignSub:
2566               qd -= esp->num[op];
2567               sd -= esp->num[op];
2568               break;
2569           case eGapAlignIns:
2570               qd -= esp->num[op];
2571               break;
2572           case eGapAlignDel:
2573               sd -= esp->num[op];
2574           default:
2575               break;
2576           }
2577       } while (qd > 0 || sd > 0);
2578 
2579       esp->num[op] = -MAX(qd, sd);
2580       esp->op_type[op++] = eGapAlignSub;
2581       for (; op < pos-1; op++) esp->num[op] = 0;
2582       esp->num[pos] += bf;
2583       qd -= sd;
2584       esp->op_type[pos-1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2585       esp->num[pos-1] = (qd>0) ? qd : -qd;
2586    }
2587 
2588    if (af > 0) {
2589       op = pos;
2590       qd = sd = af;
2591       do {
2592           if (++op >= esp->size) return;
2593           switch(esp->op_type[op]) {
2594           case eGapAlignSub:
2595               qd -= esp->num[op];
2596               sd -= esp->num[op];
2597               break;
2598           case eGapAlignIns:
2599               qd -= esp->num[op];
2600               break;
2601           case eGapAlignDel:
2602               sd -= esp->num[op];
2603           default:
2604               break;
2605           }
2606       } while (qd > 0 || sd > 0);
2607 
2608       esp->num[op] = -MAX(qd, sd);
2609       esp->op_type[op--] = eGapAlignSub;
2610       for (; op > pos+1; op--) esp->num[op] = 0;
2611       esp->num[pos] += af;
2612       qd -= sd;
2613       esp->op_type[pos+1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2614       esp->num[pos+1] = (qd>0) ? qd : -qd;
2615    }
2616 }
2617 
s_RebuildEditScript(GapEditScript * esp)2618 static void s_RebuildEditScript(GapEditScript* esp) {
2619    int i, j;
2620    for (i=0, j=-1; i<esp->size; i++) {
2621        if (esp->num[i] == 0) continue;
2622        if (j>=0 && esp->op_type[i] == esp->op_type[j]) {
2623            esp->num[j] += esp->num[i];
2624        } else if (j==-1 || esp->op_type[i] == eGapAlignSub
2625            || esp->op_type[j] == eGapAlignSub) {
2626            esp->op_type[++j] = esp->op_type[i];
2627            esp->num[j] = esp->num[i];
2628        } else {
2629            int d = esp->num[j] - esp->num[i];
2630            if (d > 0) {
2631               esp->num[j-1] += esp->num[i];
2632               esp->num[j] = d;
2633            } else if (d < 0) {
2634               esp->num[j-1] += esp->num[j];
2635               esp->num[j] = -d;
2636               esp->op_type[j] = esp->op_type[i];
2637            } else {
2638               esp->num[j-1] += esp->num[j];
2639               --j;
2640            }
2641        }
2642    }
2643    esp->size = ++j;
2644 }
2645 
s_ReduceGaps(GapEditScript * esp,const Uint1 * q,const Uint1 * s,const Uint1 * qf,const Uint1 * sf)2646 static void s_ReduceGaps(GapEditScript* esp, const Uint1 *q, const Uint1 *s,
2647                                              const Uint1 *qf,const Uint1 *sf){
2648    int i, j, nm1, nm2, d;
2649    const Uint1 *q1, *s1;
2650 
2651    for (q1=q, s1=s, i=0; i<esp->size; i++) {
2652        if (esp->num[i] == 0) continue;
2653        if (esp->op_type[i] == eGapAlignSub) {
2654            if(esp->num[i] >= 12) {
2655                nm1 = 1;
2656                if (i > 0) {
2657                    while (q1-nm1>=q && (*(q1-nm1) == *(s1-nm1))) ++nm1;
2658                }
2659                q1 += esp->num[i];
2660                s1 += esp->num[i];
2661                nm2 = 0;
2662                if (i < esp->size -1) {
2663                    while ((q1+1<qf) && (s1+1<sf) && (*(q1++) == *(s1++))) ++nm2;
2664                }
2665                if (nm1>1 || nm2>0) s_UpdateEditScript(esp, i, nm1-1, nm2);
2666                q1--; s1--;
2667            } else {
2668                q1 += esp->num[i];
2669                s1 += esp->num[i];
2670            }
2671        } else if (esp->op_type[i] == eGapAlignIns) {
2672            q1 += esp->num[i];
2673        } else {
2674            s1 += esp->num[i];
2675        }
2676    }
2677    s_RebuildEditScript(esp);
2678 
2679    for (i=0; i<esp->size; i++) {
2680        if (esp->op_type[i] == eGapAlignSub) {
2681            q += esp->num[i];
2682            s += esp->num[i];
2683            continue;
2684        }
2685        if (i>1 && esp->op_type[i] != esp->op_type[i-2]
2686                && esp->num[i-2] > 0) {
2687            d = esp->num[i] + esp->num[i-1] + esp->num[i-2];
2688            if (d == 3) {
2689                /* special case, no need to do further testing */
2690                (esp->num[i-2]) = 0;
2691                (esp->num[i-1]) = 2;
2692                (esp->num[i]) = 0;
2693                if (esp->op_type[i] == eGapAlignIns) {
2694                    ++q;
2695                } else {
2696                    ++s;
2697                }
2698            } else if (d < 12) {
2699                /* Try reducing this sub... */
2700                nm1 = 0;
2701                nm2 = 0;
2702                d = MIN(esp->num[i], esp->num[i-2]);
2703                q -= esp->num[i-1];
2704                s -= esp->num[i-1];
2705                q1 = q;
2706                s1 = s;
2707                if (esp->op_type[i] == eGapAlignIns) {
2708                    s -= d;
2709                } else {
2710                    q -= d;
2711                }
2712                for (j=0; j<esp->num[i-1]; ++j, ++q1, ++s1, ++q, ++s) {
2713                    if (*q1 == *s1) nm1++;
2714                    if (*q == *s) nm2++;
2715                }
2716                for (j=0; j<d; ++j, ++q, ++s) {
2717                    if (*q == *s) nm2++;
2718                }
2719                if (nm2 >= nm1 - d) {
2720                    (esp->num[i-2]) -= d;
2721                    (esp->num[i-1]) += d;
2722                    (esp->num[i]) -= d;
2723                } else {
2724                    q = q1;
2725                    s = s1;
2726                }
2727            }
2728        }
2729        if (esp->op_type[i] == eGapAlignIns) {
2730            q += esp->num[i];
2731        } else {
2732            s += esp->num[i];
2733        }
2734    }
2735    s_RebuildEditScript(esp);
2736 }
2737 
2738 Int2
BLAST_GreedyGappedAlignment(const Uint1 * query,const Uint1 * subject,Int4 query_length,Int4 subject_length,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_off,Int4 s_off,Boolean compressed_subject,Boolean do_traceback,Boolean * fence_hit)2739 BLAST_GreedyGappedAlignment(const Uint1* query, const Uint1* subject,
2740    Int4 query_length, Int4 subject_length, BlastGapAlignStruct* gap_align,
2741    const BlastScoringParameters* score_params,
2742    Int4 q_off, Int4 s_off, Boolean compressed_subject, Boolean do_traceback,
2743    Boolean * fence_hit)
2744 {
2745    const Uint1* q;
2746    const Uint1* s;
2747    Int4 score;
2748    Int4 X;
2749    Int4 q_avail, s_avail;
2750    Int4 q_ext_l, q_ext_r, s_ext_l, s_ext_r;
2751    GapPrelimEditBlock *fwd_prelim_tback = NULL;
2752    GapPrelimEditBlock *rev_prelim_tback = NULL;
2753    SGreedySeed fwd_start_point;
2754    SGreedySeed rev_start_point;
2755    Uint1 rem;
2756    GapEditScript* esp = NULL;
2757    Int4 q_seed_start = q_off;
2758    Int4 s_seed_start = s_off;
2759 
2760    q_avail = query_length - q_off;
2761    s_avail = subject_length - s_off;
2762 
2763    q = query + q_off;
2764    if (!compressed_subject) {
2765       s = subject + s_off;
2766       rem = 4; /* Special value to indicate that sequence is already
2767                   uncompressed */
2768    } else {
2769       s = subject + s_off/4;
2770       rem = s_off % 4;
2771    }
2772 
2773    X = gap_align->gap_x_dropoff;
2774 
2775    if (do_traceback) {
2776       fwd_prelim_tback = gap_align->fwd_prelim_tback;
2777       rev_prelim_tback = gap_align->rev_prelim_tback;
2778       GapPrelimEditBlockReset(fwd_prelim_tback);
2779       GapPrelimEditBlockReset(rev_prelim_tback);
2780    }
2781 
2782    /* extend to the right */
2783    while (TRUE) {
2784        Int4 new_dist, xdrop;
2785        score = BLAST_AffineGreedyAlign(q, q_avail, s, s_avail, FALSE, X,
2786               score_params->reward, -score_params->penalty,
2787               score_params->gap_open, score_params->gap_extend,
2788               &q_ext_r, &s_ext_r, gap_align->greedy_align_mem,
2789               fwd_prelim_tback, rem, fence_hit, &fwd_start_point);
2790        if (score >=0) break;
2791 
2792        /* double the max distance */
2793        new_dist = gap_align->greedy_align_mem->max_dist * 2;
2794        xdrop    = gap_align->greedy_align_mem->xdrop;
2795        s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2796        gap_align->greedy_align_mem =
2797           s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2798 
2799        if (!gap_align->greedy_align_mem) {
2800           gap_align = BLAST_GapAlignStructFree(gap_align);
2801           return -1;
2802        }
2803    }
2804 
2805    if (compressed_subject)
2806       rem = 0;
2807 
2808    /* extend to the left */
2809    while (TRUE) {
2810        Int4 new_dist, xdrop, score1;
2811        score1 = BLAST_AffineGreedyAlign(query, q_off,
2812                subject, s_off, TRUE, X,
2813                score_params->reward, -score_params->penalty,
2814                score_params->gap_open, score_params->gap_extend,
2815                &q_ext_l, &s_ext_l, gap_align->greedy_align_mem,
2816                rev_prelim_tback, rem, fence_hit, &rev_start_point);
2817        if (score1 >=0) {
2818            score += score1;
2819            break;
2820        }
2821 
2822        /* double the max distance */
2823        new_dist = gap_align->greedy_align_mem->max_dist * 2;
2824        xdrop    = gap_align->greedy_align_mem->xdrop;
2825        s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2826        gap_align->greedy_align_mem =
2827           s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2828 
2829        if (!gap_align->greedy_align_mem) {
2830           gap_align = BLAST_GapAlignStructFree(gap_align);
2831           return -1;
2832        }
2833    }
2834 
2835    /* In basic case the greedy algorithm returns number of
2836       differences, hence we need to convert it to score */
2837    if (score_params->gap_open==0 && score_params->gap_extend==0) {
2838       score =
2839          (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*score_params->reward/2 -
2840          score*(score_params->reward - score_params->penalty);
2841    } else if (score_params->reward % 2 == 1) {
2842       score /= 2;
2843    }
2844 
2845    if (do_traceback) {
2846       esp = Blast_PrelimEditBlockToGapEditScript(rev_prelim_tback,
2847                                              fwd_prelim_tback);
2848       /* check for possible gap elimination */
2849       ASSERT(!compressed_subject);
2850       if (esp) s_ReduceGaps(esp, query+q_off-q_ext_l, subject+s_off-s_ext_l,
2851                                  query+q_off+q_ext_r, subject+s_off+s_ext_r);
2852    }
2853    else {
2854        /* estimate the best alignment start point. This is the middle
2855           of the longest run of exact matches found by the aligner,
2856           subject to the constraint that the start point lies in the
2857           box formed by the optimal alignment */
2858 
2859        Int4 q_box_l = q_off - q_ext_l;
2860        Int4 s_box_l = s_off - s_ext_l;
2861        Int4 q_box_r = q_off + q_ext_r;
2862        Int4 s_box_r = s_off + s_ext_r;
2863        Int4 q_seed_start_l = q_off - rev_start_point.start_q;
2864        Int4 s_seed_start_l = s_off - rev_start_point.start_s;
2865        Int4 q_seed_start_r = q_off + fwd_start_point.start_q;
2866        Int4 s_seed_start_r = s_off + fwd_start_point.start_s;
2867        Int4 valid_seed_len_l = 0;
2868        Int4 valid_seed_len_r = 0;
2869 
2870        if (q_seed_start_r < q_box_r && s_seed_start_r < s_box_r) {
2871            valid_seed_len_r = MIN(q_box_r - q_seed_start_r,
2872                                   s_box_r - s_seed_start_r);
2873            valid_seed_len_r = MIN(valid_seed_len_r,
2874                                   fwd_start_point.match_length) / 2;
2875        } else {
2876            q_seed_start_r = q_off;
2877            s_seed_start_r = s_off;
2878        }
2879 
2880        if (q_seed_start_l > q_box_l && s_seed_start_l > s_box_l) {
2881            valid_seed_len_l = MIN(q_seed_start_l - q_box_l,
2882                                   s_seed_start_l - s_box_l);
2883            valid_seed_len_l = MIN(valid_seed_len_l,
2884                                   rev_start_point.match_length) / 2;
2885        } else {
2886            q_seed_start_l = q_off;
2887            s_seed_start_l = s_off;
2888        }
2889 
2890        if (valid_seed_len_r > valid_seed_len_l) {
2891            q_seed_start = q_seed_start_r + valid_seed_len_r;
2892            s_seed_start = s_seed_start_r + valid_seed_len_r;
2893        }
2894        else {
2895            q_seed_start = q_seed_start_l - valid_seed_len_l;
2896            s_seed_start = s_seed_start_l - valid_seed_len_l;
2897        }
2898    }
2899 
2900    s_BlastGreedyGapAlignStructFill(gap_align,
2901                                    q_off-q_ext_l, s_off-s_ext_l,
2902                                    q_off+q_ext_r, s_off+s_ext_r,
2903                                    q_seed_start, s_seed_start,
2904                                    score, esp);
2905    return 0;
2906 }
2907 
2908 /** Performs dynamic programming style gapped extension, given an initial
2909  * HSP (offset pair), two sequence blocks and scoring and extension options.
2910  * Note: traceback is not done in this function.
2911  * @param query_blk The query sequence [in]
2912  * @param subject_blk The subject sequence [in]
2913  * @param gap_align The auxiliary structure for gapped alignment [in]
2914  * @param score_params Parameters related to scoring [in]
2915  * @param init_hsp The initial HSP that needs to be extended [in]
2916 */
2917 static Int2
s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp)2918 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
2919    BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
2920    const BlastScoringParameters* score_params, BlastInitHSP* init_hsp)
2921 {
2922    Int4 q_length, s_length;
2923    Int4 private_q_start, private_s_start;
2924    Int4 score_right = 0, score_left = 0;
2925    Uint1 offset_adjustment;
2926    Uint1* query = query_blk->sequence;
2927    Uint1* subject = subject_blk->sequence;
2928 
2929    /* If subject offset is not at the start of a full byte,
2930       s_BlastAlignPackedNucl won't work, so shift the alignment start
2931       to the next multiple of 4 subject letters. Note that the
2932       shift amount is always nonzero, so a left extension always happens
2933       (and has a few presumed exact matches to start with). In the case
2934       of the smallest ungapped alignment (4 nucleotides), this can
2935       conceivably put the start point at the end of one sequence */
2936 
2937    offset_adjustment = COMPRESSION_RATIO -
2938             (init_hsp->offsets.qs_offsets.s_off % COMPRESSION_RATIO);
2939    q_length = init_hsp->offsets.qs_offsets.q_off + offset_adjustment;
2940    s_length = init_hsp->offsets.qs_offsets.s_off + offset_adjustment;
2941 
2942    /* This prevents the starting point from being at the end of a sequence
2943       as mentioned in the comment above. */
2944    if (q_length > query_blk->length || s_length > subject_blk->length)
2945    {
2946        q_length -= COMPRESSION_RATIO;
2947        s_length -= COMPRESSION_RATIO;
2948    }
2949 
2950    /* perform extension to left */
2951    score_left = s_BlastAlignPackedNucl(query, subject, q_length, s_length,
2952                       &private_q_start, &private_s_start, gap_align,
2953                       score_params, TRUE);
2954    if (score_left < 0)
2955       return -1;
2956    gap_align->query_start = q_length - private_q_start;
2957    gap_align->subject_start = s_length - private_s_start;
2958 
2959    /* perform extension to right */
2960    if (q_length < query_blk->length &&
2961        s_length < subject_blk->length)
2962    {
2963       score_right = s_BlastAlignPackedNucl(query+q_length-1,
2964          subject+(s_length+3)/COMPRESSION_RATIO - 1,
2965          query_blk->length-q_length,
2966          subject_blk->length-s_length, &(gap_align->query_stop),
2967          &(gap_align->subject_stop), gap_align, score_params, FALSE);
2968       if (score_right < 0)
2969          return -1;
2970       gap_align->query_stop += q_length;
2971       gap_align->subject_stop += s_length;
2972    }
2973    else {
2974       gap_align->query_stop = q_length;
2975       gap_align->subject_stop = s_length;
2976    }
2977 
2978    gap_align->score = score_right+score_left;
2979 
2980    return 0;
2981 }
2982 
2983 /** Aligns two nucleotide sequences, one (A) should be packed in the
2984  * same way as the BLAST databases, the other (B) should contain one
2985  * basepair/byte. Traceback is not done in this function.
2986  * @param B The query sequence [in]
2987  * @param A The subject sequence [in]
2988  * @param N Maximal extension length in query [in]
2989  * @param M Maximal extension length in subject [in]
2990  * @param b_offset Resulting starting offset in query [out]
2991  * @param a_offset Resulting starting offset in subject [out]
2992  * @param gap_align The auxiliary structure for gapped alignment [in]
2993  * @param score_params Parameters related to scoring [in]
2994  * @param reverse_sequence Reverse the sequence.
2995  * @return The best alignment score found.
2996 */
2997 static Int4
s_BlastAlignPackedNucl(Uint1 * B,Uint1 * A,Int4 N,Int4 M,Int4 * b_offset,Int4 * a_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Boolean reverse_sequence)2998 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
2999 	Int4* b_offset, Int4* a_offset,
3000         BlastGapAlignStruct* gap_align,
3001         const BlastScoringParameters* score_params,
3002         Boolean reverse_sequence)
3003 {
3004     Int4 i;                     /* sequence pointers and indices */
3005     Int4 a_index, a_base_pair;
3006     Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
3007     Uint1* b_ptr;
3008 
3009     BlastGapDP* score_array;
3010     Int4 num_extra_cells;
3011 
3012     Int4 gap_open;              /* alignment penalty variables */
3013     Int4 gap_extend;
3014     Int4 gap_open_extend;
3015     Int4 x_dropoff;
3016 
3017     Int4* *matrix;              /* pointers to the score matrix */
3018     Int4* matrix_row;
3019 
3020     Int4 score;                 /* score tracking variables */
3021     Int4 score_gap_row;
3022     Int4 score_gap_col;
3023     Int4 next_score;
3024     Int4 best_score;
3025 
3026     /* do initialization and sanity-checking */
3027 
3028     matrix = gap_align->sbp->matrix->data;
3029     *a_offset = 0;
3030     *b_offset = 0;
3031     gap_open = score_params->gap_open;
3032     gap_extend = score_params->gap_extend;
3033     gap_open_extend = gap_open + gap_extend;
3034     x_dropoff = gap_align->gap_x_dropoff;
3035 
3036     if (x_dropoff < gap_open_extend)
3037         x_dropoff = gap_open_extend;
3038 
3039     if(N <= 0 || M <= 0)
3040         return 0;
3041 
3042     /* Allocate and fill in the auxiliary bookeeping structures.
3043        Since A and B could be very large, maintain a window
3044        of auxiliary structures only large enough to contain the current
3045        set of DP computations. The initial window size is determined
3046        by the number of cells needed to fail the x-dropoff test */
3047 
3048     if (gap_extend > 0)
3049         num_extra_cells = x_dropoff / gap_extend + 3;
3050     else
3051         num_extra_cells = N + 3;
3052 
3053     if (num_extra_cells > gap_align->dp_mem_alloc) {
3054         gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
3055                                       2 * gap_align->dp_mem_alloc);
3056         sfree(gap_align->dp_mem);
3057         gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
3058                                                   sizeof(BlastGapDP));
3059     }
3060 
3061     score_array = gap_align->dp_mem;
3062     score = -gap_open_extend;
3063     score_array[0].best = 0;
3064     score_array[0].best_gap = -gap_open_extend;
3065 
3066     for (i = 1; i <= N; i++) {
3067         if (score < -x_dropoff)
3068             break;
3069 
3070         score_array[i].best = score;
3071         score_array[i].best_gap = score - gap_open_extend;
3072         score -= gap_extend;
3073     }
3074 
3075     /* The inner loop below examines letters of B from
3076        index 'first_b_index' to 'b_size' */
3077 
3078     b_size = i;
3079     best_score = 0;
3080     first_b_index = 0;
3081     if (reverse_sequence)
3082         b_increment = -1;
3083     else
3084         b_increment = 1;
3085 
3086     for (a_index = 1; a_index <= M; a_index++) {
3087 
3088         /* pick out the row of the score matrix
3089            appropriate for A[a_index] */
3090 
3091         if(reverse_sequence) {
3092             a_base_pair = NCBI2NA_UNPACK_BASE(A[(M-a_index)/4],
3093                                                ((a_index-1)%4));
3094             matrix_row = matrix[a_base_pair];
3095         }
3096         else {
3097             a_base_pair = NCBI2NA_UNPACK_BASE(A[1+((a_index-1)/4)],
3098                                                (3-((a_index-1)%4)));
3099             matrix_row = matrix[a_base_pair];
3100         }
3101 
3102         if(reverse_sequence)
3103             b_ptr = &B[N - first_b_index];
3104         else
3105             b_ptr = &B[first_b_index];
3106 
3107         /* initialize running-score variables */
3108         score = MININT;
3109         score_gap_row = MININT;
3110         last_b_index = first_b_index;
3111 
3112         for (b_index = first_b_index; b_index < b_size; b_index++) {
3113 
3114             b_ptr += b_increment;
3115             score_gap_col = score_array[b_index].best_gap;
3116             next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
3117 
3118             if (score < score_gap_col)
3119                 score = score_gap_col;
3120 
3121             if (score < score_gap_row)
3122                 score = score_gap_row;
3123 
3124             if (best_score - score > x_dropoff) {
3125 
3126                 /* the current best score failed the X-dropoff
3127                    criterion. Note that this does not stop the
3128                    inner loop, only forces future iterations to
3129                    skip this column of B.
3130 
3131                    Also, if the very first letter of B that was
3132                    tested failed the X dropoff criterion, make
3133                    sure future inner loops start one letter to
3134                    the right */
3135 
3136                 if (b_index == first_b_index)
3137                     first_b_index++;
3138                 else
3139                     score_array[b_index].best = MININT;
3140             }
3141             else {
3142                 last_b_index = b_index;
3143                 if (score > best_score) {
3144                     best_score = score;
3145                     *a_offset = a_index;
3146                     *b_offset = b_index;
3147                 }
3148 
3149                 /* If starting a gap at this position will improve
3150                    the best row or column score, update them to
3151                    reflect that. */
3152 
3153                 score_gap_row -= gap_extend;
3154                 score_gap_col -= gap_extend;
3155                 score_array[b_index].best_gap = MAX(score - gap_open_extend,
3156                                                     score_gap_col);
3157                 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
3158 
3159                 score_array[b_index].best = score;
3160             }
3161 
3162             score = next_score;
3163         }
3164 
3165         /* Finish aligning if the best scores for all positions
3166            of B will fail the X-dropoff test, i.e. the inner loop
3167            bounds have converged to each other */
3168 
3169         if (first_b_index == b_size)
3170             break;
3171 
3172         if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
3173 
3174             gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
3175                                           2 * gap_align->dp_mem_alloc);
3176             score_array = (BlastGapDP *)realloc(score_array,
3177                                                gap_align->dp_mem_alloc *
3178                                                sizeof(BlastGapDP));
3179             gap_align->dp_mem = score_array;
3180         }
3181 
3182         if (last_b_index < b_size - 1) {
3183             /* This row failed the X-dropoff test earlier than
3184                the last row did; just shorten the loop bounds
3185                before doing the next row */
3186 
3187             b_size = last_b_index + 1;
3188         }
3189         else {
3190             /* The inner loop finished without failing the X-dropoff
3191                test; initialize extra bookkeeping structures until
3192                the X dropoff test fails or we run out of letters in B.
3193                The next inner loop will have larger bounds */
3194 
3195             while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
3196                 score_array[b_size].best = score_gap_row;
3197                 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
3198                 score_gap_row -= gap_extend;
3199                 b_size++;
3200             }
3201         }
3202 
3203         if (b_size <= N) {
3204             score_array[b_size].best = MININT;
3205             score_array[b_size].best_gap = MININT;
3206             b_size++;
3207         }
3208     }
3209 
3210     return best_score;
3211 }
3212 
3213 Boolean
BlastGetOffsetsForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,BlastHSP * hsp,Int4 * q_retval,Int4 * s_retval)3214 BlastGetOffsetsForGappedAlignment (const Uint1* query, const Uint1* subject,
3215    const BlastScoreBlk* sbp, BlastHSP* hsp, Int4* q_retval, Int4* s_retval)
3216 {
3217     Int4 index1, max_offset, score, max_score, hsp_end;
3218     const Uint1* query_var,* subject_var;
3219     Boolean positionBased = (sbp->psi_matrix != NULL);
3220     Int4 q_length = hsp->query.end - hsp->query.offset;
3221     Int4 s_length = hsp->subject.end - hsp->subject.offset;
3222     int q_start = hsp->query.offset;
3223     int s_start = hsp->subject.offset;
3224 
3225     if (q_length <= HSP_MAX_WINDOW) {
3226         *q_retval = q_start + q_length/2;
3227         *s_retval = s_start + q_length/2;
3228         return TRUE;
3229     }
3230 
3231     hsp_end = q_start + HSP_MAX_WINDOW;
3232     query_var = query + q_start;
3233     subject_var = subject + s_start;
3234     score=0;
3235     for (index1=q_start; index1<hsp_end; index1++) {
3236         if (!(positionBased))
3237             score += sbp->matrix->data[*query_var][*subject_var];
3238         else
3239             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3240         query_var++; subject_var++;
3241     }
3242     max_score = score;
3243     max_offset = hsp_end - 1;
3244     hsp_end = q_start + MIN(q_length, s_length);
3245     for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3246         if (!(positionBased)) {
3247             score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3248             score += sbp->matrix->data[*query_var][*subject_var];
3249         } else {
3250             score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3251             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3252         }
3253         if (score > max_score) {
3254             max_score = score;
3255             max_offset = index1;
3256         }
3257         query_var++; subject_var++;
3258     }
3259 
3260     if (max_score > 0)
3261     {
3262         *q_retval = max_offset;
3263         *s_retval = (max_offset - q_start) + s_start;
3264         return TRUE;
3265     }
3266     else  /* Test the window around the ends of the HSP. */
3267     {
3268         score=0;
3269         query_var = query + q_start + q_length - HSP_MAX_WINDOW;
3270         subject_var = subject + s_start + s_length - HSP_MAX_WINDOW;
3271         for (index1=hsp->query.end-HSP_MAX_WINDOW; index1<hsp->query.end; index1++) {
3272             if (!(positionBased))
3273                 score += sbp->matrix->data[*query_var][*subject_var];
3274             else
3275                 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3276             query_var++; subject_var++;
3277         }
3278         if (score > 0)
3279         {
3280             *q_retval = hsp->query.end - HSP_MAX_WINDOW/2;
3281             *s_retval = hsp->subject.end - HSP_MAX_WINDOW/2;
3282             return TRUE;
3283         }
3284     }
3285     return FALSE;
3286 }
3287 
3288 void
BlastGetStartForGappedAlignmentNucl(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp)3289 BlastGetStartForGappedAlignmentNucl (const Uint1* query, const Uint1* subject,
3290    BlastHSP* hsp)
3291 {
3292     /* We will stop when the identity count reaches to this number */
3293     int hspMaxIdentRun = 10;
3294     const Uint1 *q, *s;
3295     Int4 index, max_offset, score, max_score, q_start, s_start, q_len;
3296     Boolean match, prev_match;
3297     Int4 offset = MIN(hsp->subject.gapped_start - hsp->subject.offset,
3298                       hsp->query.gapped_start - hsp->query.offset);
3299     /* first check if the old value is ok */
3300     q_start = hsp->query.gapped_start;
3301     s_start = hsp->subject.gapped_start;
3302     score = -1;
3303     q = query + q_start;
3304     s = subject + s_start;
3305     q_len = hsp->query.end;
3306     while ((q-query < q_len) && (*q++ == *s++)) {
3307         score++;
3308         if (score > hspMaxIdentRun) return;
3309     }
3310     q = query + q_start;
3311     s = subject + s_start;
3312     while ((q-query >= 0) && (*q-- == *s--)) {
3313         score++;
3314         if (score > hspMaxIdentRun) return;
3315     }
3316     hspMaxIdentRun *= 1.5;  /* Demand larger value if we move */
3317     /* if the old value is not ok, try to find a better point */
3318     q_start = hsp->query.gapped_start - offset;
3319     s_start = hsp->subject.gapped_start - offset;
3320     q_len = MIN(hsp->subject.end - s_start, hsp->query.end - q_start);
3321     q = query + q_start;
3322     s = subject + s_start;
3323     max_score = 0;
3324     max_offset = q_start;
3325     score = 0;
3326     match = FALSE;
3327     prev_match = FALSE;
3328     for (index = q_start; index < q_start + q_len; index++) {
3329         match = (*q++ == *s++);
3330         if (match != prev_match) {
3331             prev_match = match;
3332             if (match) {
3333                 score = 1;
3334             } else if (score > max_score) {
3335                 max_score = score;
3336                 max_offset = index - score/2;
3337             }
3338         } else if (match) {
3339             ++score;
3340             if (score > hspMaxIdentRun) {
3341                 max_offset = index - hspMaxIdentRun/2;
3342                 hsp->query.gapped_start = max_offset;
3343                 hsp->subject.gapped_start = max_offset + s_start - q_start;
3344                 return;
3345             }
3346         }
3347     }
3348     if (match && score > max_score) {
3349         max_score = score;
3350         max_offset = index - score/2;
3351     }
3352     if (max_score > 0) {
3353         hsp->query.gapped_start = max_offset;
3354         hsp->subject.gapped_start = max_offset + s_start - q_start;
3355     }
3356 }
3357 
3358 Int4
BlastGetStartForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,Uint4 q_start,Uint4 q_length,Uint4 s_start,Uint4 s_length)3359 BlastGetStartForGappedAlignment (const Uint1* query, const Uint1* subject,
3360    const BlastScoreBlk* sbp, Uint4 q_start, Uint4 q_length,
3361    Uint4 s_start, Uint4 s_length)
3362 {
3363     Int4 index1, max_offset, score, max_score, hsp_end;
3364     const Uint1* query_var,* subject_var;
3365     Boolean positionBased = (sbp->psi_matrix != NULL);
3366 
3367     if (q_length <= HSP_MAX_WINDOW) {
3368         max_offset = q_start + q_length/2;
3369         return max_offset;
3370     }
3371 
3372     hsp_end = q_start + HSP_MAX_WINDOW;
3373     query_var = query + q_start;
3374     subject_var = subject + s_start;
3375     score=0;
3376     for (index1=q_start; index1<hsp_end; index1++) {
3377         if (!(positionBased))
3378             score += sbp->matrix->data[*query_var][*subject_var];
3379         else
3380             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3381         query_var++; subject_var++;
3382     }
3383     max_score = score;
3384     max_offset = hsp_end - 1;
3385     hsp_end = q_start + MIN(q_length, s_length);
3386     for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3387         if (!(positionBased)) {
3388             score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3389             score += sbp->matrix->data[*query_var][*subject_var];
3390         } else {
3391             score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3392             score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3393         }
3394         if (score > max_score) {
3395             max_score = score;
3396             max_offset = index1;
3397         }
3398         query_var++; subject_var++;
3399     }
3400     if (max_score > 0)
3401        max_offset -= HSP_MAX_WINDOW/2;
3402     else
3403        max_offset = q_start;
3404 
3405     return max_offset;
3406 }
3407 
BLAST_GetGappedScore(EBlastProgramType program_number,BLAST_SequenceBlk * query,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,const BlastHitSavingParameters * hit_params,BlastInitHitList * init_hitlist,BlastHSPList ** hsp_list_ptr,BlastGappedStats * gapped_stats,Boolean * fence_hit)3408 Int2 BLAST_GetGappedScore (EBlastProgramType program_number,
3409         BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
3410         BLAST_SequenceBlk* subject,
3411         BlastGapAlignStruct* gap_align,
3412         const BlastScoringParameters* score_params,
3413         const BlastExtensionParameters* ext_params,
3414         const BlastHitSavingParameters* hit_params,
3415         BlastInitHitList* init_hitlist,
3416         BlastHSPList** hsp_list_ptr, BlastGappedStats* gapped_stats,
3417         Boolean * fence_hit)
3418 {
3419    Int4 index;
3420    BlastInitHSP* init_hsp = NULL;
3421    BlastInitHSP* init_hsp_array;
3422    Int4 q_start, s_start, q_end, s_end;
3423    Boolean is_prot;
3424    Boolean* restricted_align_array = NULL; /* if allocated, restricted
3425                                               alignment option per query */
3426    Boolean is_greedy;
3427    Boolean is_rpsblast = Blast_ProgramIsRpsBlast(program_number);
3428    Int4 max_offset;
3429    Int4 rps_cutoff_index = 0;
3430    Int2 status = 0;
3431    BlastHSPList* hsp_list = NULL;
3432    const BlastHitSavingOptions* hit_options = hit_params->options;
3433    BLAST_SequenceBlk query_tmp;
3434    Int4 context;
3435    BlastIntervalTree *tree;
3436    Int4 score;
3437    Int4* found_high_score = NULL;
3438    Int4 **rpsblast_pssms = NULL;   /* Pointer to concatenated PSSMs in
3439                                        RPS-BLAST database */
3440    const int kHspNumMax = BlastHspNumMax(TRUE, hit_options);
3441    const double kRestrictedMult = 0.68;/* fraction of the ordinary cutoff score
3442                                         used for approximate gapped alignment */
3443    Int4 redo_index = -1; /* these are used for recomputing alignmnets if the */
3444    Int4 redo_query = -1; /* approximate alignment score is inconclusive */
3445 
3446    if (!query || !subject || !gap_align || !score_params || !ext_params ||
3447        !hit_params || !init_hitlist || !hsp_list_ptr)
3448       return 1;
3449 
3450    if (init_hitlist->total == 0)
3451       return 0;
3452 
3453    is_prot = (program_number != eBlastTypeBlastn &&
3454               program_number != eBlastTypePhiBlastn &&
3455               program_number != eBlastTypeMapping);
3456    is_greedy = (ext_params->options->ePrelimGapExt == eGreedyScoreOnly);
3457 
3458    /* turn on approximate gapped alignment if 1) the search
3459       specifies it, and 2) the first, highest-scoring ungapped
3460       alignment in init_hitlist scores below a reduced cutoff.
3461       The second condition is used to avoid computing approximate
3462       alignments that experiment shows are likely to be thrown
3463       away and recomputed optimally */
3464 
3465    if (hit_params->restricted_align && !score_params->options->is_ooframe) {
3466        int i;
3467        Boolean* found = calloc(query_info->last_context + 1, sizeof(Boolean));
3468 
3469        restricted_align_array =
3470            (Boolean*)calloc(Blast_GetQueryIndexFromContext(
3471                                                query_info->last_context,
3472                                                program_number) + 1,
3473                             sizeof(Boolean));
3474 
3475        /* find the first ungapped alignment for each query and determine
3476           whetherto use approximate gapped alignment for the query */
3477        for (i = 0;i < init_hitlist->total;i++) {
3478            int contxt; /* context */
3479            int query_index = -1;
3480            Int4 q_off;
3481 
3482            init_hsp = &init_hitlist->init_hsp_array[i];
3483            /* find query index */
3484            q_off = init_hsp->offsets.qs_offsets.q_off;
3485            for (contxt = query_info->first_context;
3486                 contxt <= query_info->last_context; contxt++) {
3487 
3488                if (q_off >= query_info->contexts[contxt].query_offset &&
3489                    q_off < query_info->contexts[contxt].query_offset +
3490                    query_info->contexts[contxt].query_length) {
3491 
3492                    query_index = Blast_GetQueryIndexFromContext(contxt,
3493                                                                 program_number);
3494                    break;
3495                }
3496            }
3497            ASSERT(query_index >= 0);
3498 
3499            /* if this is not the first initial hit for the query, disregard it */
3500            if (found[query_index]) {
3501                continue;
3502            }
3503 
3504            found[query_index] = TRUE;
3505 
3506            /* use approximate gapped alignment if the highest scoring ungapped
3507               alignment scores below the reduced cutoff */
3508            if (init_hsp->ungapped_data && init_hsp->ungapped_data->score <
3509                (Int4)(kRestrictedMult *
3510                       hit_params->cutoffs[contxt].cutoff_score)) {
3511 
3512                restricted_align_array[query_index] = TRUE;
3513            }
3514            else {
3515                restricted_align_array[query_index] = FALSE;
3516            }
3517        }
3518        if (found) {
3519            free(found);
3520        }
3521    }
3522 
3523    if (is_rpsblast) {
3524       rpsblast_pssms = gap_align->sbp->psi_matrix->pssm->data;
3525       if (program_number == eBlastTypeRpsBlast)
3526          rps_cutoff_index = subject->oid;
3527       else
3528          rps_cutoff_index = subject->oid * NUM_FRAMES +
3529                             BLAST_FrameToContext(subject->frame,
3530                                                  program_number);
3531    }
3532 
3533    ASSERT(Blast_InitHitListIsSortedByScore(init_hitlist));
3534 
3535    if (*hsp_list_ptr == NULL)
3536       *hsp_list_ptr = hsp_list = Blast_HSPListNew(kHspNumMax);
3537    else
3538       hsp_list = *hsp_list_ptr;
3539 
3540    /* Initialize the interval tree with the maximum possible
3541       query and subject offsets. For query sequences this is always
3542       query->length, and for subject sequences it is subject->length
3543       except for out-of-frame tblastn. In that case, subject->length
3544       is the length of one strand of the subject sequence, but the
3545       subject offsets of each ungapped alignment are wrt all six subject
3546       frames concatenated together. Finally, add 1 to the maximum,
3547       to account for HSPs that include the end of a sequence */
3548 
3549    if (Blast_SubjectIsTranslated(program_number) &&
3550        score_params->options->is_ooframe) {
3551       /* OOF search only supported for tblastn and blastx. (blastx
3552 	 does not have a translated subject, so can't get here.) */
3553       ASSERT(program_number == eBlastTypeTblastn);
3554 
3555       tree = Blast_IntervalTreeInit(0, query->length+1,
3556                                     0, 2*(subject->length + CODON_LENGTH)+1);
3557    }
3558    else {
3559       tree = Blast_IntervalTreeInit(0, query->length+1,
3560                                     0, subject->length+1);
3561    }
3562    if (!tree)
3563      return BLASTERR_MEMORY;
3564 
3565 
3566    init_hsp_array = init_hitlist->init_hsp_array;
3567    found_high_score = (Int4*) calloc(query_info->num_queries, sizeof(Int4));
3568    if (hit_params->low_score)
3569    {
3570    	for (index=0; index<init_hitlist->total; index++)
3571    	{
3572       	    int query_index =
3573        		  Blast_GetQueryIndexFromQueryOffset(init_hsp_array[index].offsets.qs_offsets.q_off, program_number, query_info);
3574       	    if (init_hsp_array[index].ungapped_data->score > hit_params->low_score[query_index])
3575         	found_high_score[query_index] = 1;
3576         }
3577    }
3578 
3579    for (index=0; index<init_hitlist->total; index++)
3580    {
3581       BlastHSP tmp_hsp;
3582       BlastInitHSP tmp_init_hsp;
3583       BlastUngappedData tmp_ungapped_data;
3584       int query_index=0;
3585       Boolean restricted_alignment = FALSE;
3586 
3587       /* make a local copy of the initial HSP */
3588 
3589       tmp_init_hsp = init_hsp_array[index];
3590       if (tmp_init_hsp.ungapped_data) {
3591           tmp_ungapped_data = *(init_hsp_array[index].ungapped_data);
3592           tmp_init_hsp.ungapped_data = &tmp_ungapped_data;
3593       }
3594       init_hsp = &tmp_init_hsp;
3595 
3596       s_AdjustHspOffsetsAndGetQueryData(query, query_info, init_hsp,
3597                                         &query_tmp, &context);
3598 
3599       query_index = Blast_GetQueryIndexFromContext(context, program_number);
3600 
3601       /* If redo_index > -1 and redo_query > -1, the main loop is recomputing
3602          gappaed alignments for a single query, becuase the approximate
3603          alignment score was inconclusive. This recomputing was triggered when
3604          index was equal to redo_index. Until index reaches redo_index again,
3605          skip all concatenated queries with index different from redo_query */
3606 
3607       if (index < redo_index && query_index != redo_query) {
3608           continue;
3609       }
3610 
3611       if (restricted_align_array && restricted_align_array[query_index]) {
3612           restricted_alignment = TRUE;
3613       }
3614 
3615       if (hit_params->low_score)
3616       {
3617       	if (!found_high_score[query_index])
3618          continue;
3619       }
3620 
3621       if (rpsblast_pssms)
3622          gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms +
3623              query_info->contexts[context].query_offset;
3624 
3625       if (!init_hsp->ungapped_data) {
3626          q_start = q_end = init_hsp->offsets.qs_offsets.q_off;
3627          s_start = s_end = init_hsp->offsets.qs_offsets.s_off;
3628          score = INT4_MIN;
3629       } else {
3630          q_start = init_hsp->ungapped_data->q_start;
3631          q_end = q_start + init_hsp->ungapped_data->length;
3632          s_start = init_hsp->ungapped_data->s_start;
3633          s_end = s_start + init_hsp->ungapped_data->length;
3634          score = init_hsp->ungapped_data->score;
3635       }
3636 
3637       tmp_hsp.score = score;
3638       tmp_hsp.context = context;
3639       tmp_hsp.query.offset = q_start;
3640       tmp_hsp.query.end = q_end;
3641       tmp_hsp.query.frame = query_info->contexts[context].frame;
3642       tmp_hsp.subject.offset = s_start;
3643       tmp_hsp.subject.end = s_end;
3644       tmp_hsp.subject.frame = subject->frame;
3645 
3646       /* use priate interval tree when recomputing alignments */
3647       if (!BlastIntervalTreeContainsHSP(tree, &tmp_hsp, query_info,
3648                                         hit_options->min_diag_separation))
3649       {
3650          BlastHSP* new_hsp;
3651          Int4 cutoff, restricted_cutoff = 0;
3652 
3653          if (is_rpsblast)
3654             cutoff = hit_params->cutoffs[rps_cutoff_index].cutoff_score;
3655          else
3656             cutoff = hit_params->cutoffs[context].cutoff_score;
3657 
3658          if (restricted_alignment)
3659              restricted_cutoff = (Int4)(kRestrictedMult * cutoff);
3660 
3661          if (gapped_stats)
3662             ++gapped_stats->extensions;
3663 
3664          if(is_prot && !score_params->options->is_ooframe) {
3665             max_offset =
3666                BlastGetStartForGappedAlignment(query_tmp.sequence,
3667                   subject->sequence, gap_align->sbp,
3668                   init_hsp->ungapped_data->q_start,
3669                   init_hsp->ungapped_data->length,
3670                   init_hsp->ungapped_data->s_start,
3671                   init_hsp->ungapped_data->length);
3672             init_hsp->offsets.qs_offsets.s_off +=
3673                 max_offset - init_hsp->offsets.qs_offsets.q_off;
3674             init_hsp->offsets.qs_offsets.q_off = max_offset;
3675          }
3676 
3677          if (is_prot) {
3678             status = s_BlastProtGappedAlignment(program_number, &query_tmp,
3679                                                 subject, gap_align,
3680                                                 score_params, init_hsp,
3681                                                 restricted_alignment,
3682                                                 fence_hit);
3683 
3684             if (restricted_alignment &&
3685                 gap_align->score < cutoff &&
3686                 gap_align->score >= restricted_cutoff) {
3687 
3688                 /* the alignment score is inconclusive; it may really be
3689                    low-scoring, or the approximate gapped alignment
3690                    could be highly suboptimal. Recomputing the alignment
3691                    would resolve the ambiguity, but when the final list of
3692                    HSPs contains a mix of approximate and exact gapped
3693                    alignments it's possible for a highly suboptimal
3694                    alignment to make it to the traceback phase, where
3695                    its start point is reused. We sidestep this problem
3696                    by throwing away all the previously computed gapped
3697                    alignments and recomputing them exactly. */
3698 
3699                 Int4 index2;
3700                 BlastHSPList* new_hsp_list = Blast_HSPListNew(kHspNumMax);
3701                 ASSERT(restricted_align_array);
3702                 restricted_align_array[query_index] = FALSE;
3703 
3704                 Blast_IntervalTreeReset(tree);
3705 
3706                 /* remove all HSPs computed for the current query */
3707                 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
3708                     Int4 context2 = hsp_list->hsp_array[index2]->context;
3709                     Int4 q_index2 = Blast_GetQueryIndexFromContext(context2,
3710                                                                program_number);
3711 
3712                     if (q_index2 != query_index) {
3713 
3714                         Blast_HSPListSaveHSP(new_hsp_list,
3715                                              hsp_list->hsp_array[index2]);
3716 
3717                         status = BlastIntervalTreeAddHSP(
3718                                                 hsp_list->hsp_array[index2],
3719                                                 tree,
3720                                                 query_info,
3721                                                 eQueryAndSubject);
3722 
3723                         hsp_list->hsp_array[index2] = NULL;
3724 
3725                     }
3726                     else {
3727                         hsp_list->hsp_array[index2] =
3728                             Blast_HSPFree(hsp_list->hsp_array[index2]);
3729                     }
3730                 }
3731                 Blast_HSPListFree(hsp_list);
3732                 hsp_list = new_hsp_list;
3733 
3734                 /* restart the loop over initial hits to recompute
3735                    alignments for the current query */
3736                 redo_index = index;
3737                 redo_query = query_index;
3738                 index = -1;
3739                 continue;
3740             }
3741          } else if (is_greedy) {
3742             if (init_hsp->ungapped_data) {
3743                 init_hsp->offsets.qs_offsets.q_off =
3744                     init_hsp->ungapped_data->q_start + init_hsp->ungapped_data->length/2;
3745                 init_hsp->offsets.qs_offsets.s_off =
3746                     init_hsp->ungapped_data->s_start + init_hsp->ungapped_data->length/2;
3747             }
3748             status = BLAST_GreedyGappedAlignment(
3749                          query_tmp.sequence, subject->sequence,
3750                          query_tmp.length, subject->length,
3751                          gap_align, score_params,
3752                          init_hsp->offsets.qs_offsets.q_off,
3753                          init_hsp->offsets.qs_offsets.s_off,
3754                          (Boolean) TRUE, FALSE, fence_hit);
3755             /* override the alignment start point with the estimate
3756                by the aligner of the best start point */
3757             init_hsp->offsets.qs_offsets.q_off =
3758                                 gap_align->greedy_query_seed_start;
3759             init_hsp->offsets.qs_offsets.s_off =
3760                                 gap_align->greedy_subject_seed_start;
3761          } else {
3762             /*  Assuming the ungapped alignment is long enough to
3763                 contain an 8-letter seed of exact matches, start
3764                 the gapped alignment inside the first byte of the
3765                 seed that contains all matches. Note that even if
3766                 the ungapped alignment is not long enough,
3767                 s_BlastDynProgNtGappedAlignment will still round
3768                 the start point up to the first 4-base boundary
3769                 on the subject sequence */
3770             if (s_end >= (Int4)init_hsp->offsets.qs_offsets.s_off + 8) {
3771                init_hsp->offsets.qs_offsets.s_off += 3;
3772                init_hsp->offsets.qs_offsets.q_off += 3;
3773             }
3774             status = s_BlastDynProgNtGappedAlignment(&query_tmp, subject,
3775                          gap_align, score_params, init_hsp);
3776          }
3777 
3778          if (status) {
3779              if (rpsblast_pssms) {
3780                  gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3781              }
3782    	    sfree(found_high_score);
3783    	    tree = Blast_IntervalTreeFree(tree);
3784             return status;
3785          }
3786 
3787          if (gap_align->score >= cutoff) {
3788              Int2 query_frame = 0;
3789              /* For mixed-frame search, the query frame is determined
3790                 from the offset, not only from context. */
3791              if (score_params->options->is_ooframe &&
3792                  program_number == eBlastTypeBlastx) {
3793                  query_frame = gap_align->query_start % CODON_LENGTH + 1;
3794                  if ((context % NUM_FRAMES) >= CODON_LENGTH)
3795                      query_frame = -query_frame;
3796              } else {
3797                  query_frame = query_info->contexts[context].frame;
3798              }
3799 
3800              status = Blast_HSPInit(gap_align->query_start,
3801                            gap_align->query_stop, gap_align->subject_start,
3802                            gap_align->subject_stop,
3803                            init_hsp->offsets.qs_offsets.q_off,
3804                            init_hsp->offsets.qs_offsets.s_off, context,
3805                            query_frame, subject->frame, gap_align->score,
3806                            &(gap_align->edit_script), &new_hsp);
3807              if (status)
3808 	     {
3809    		sfree(found_high_score);
3810    		tree = Blast_IntervalTreeFree(tree);
3811                 return status;
3812 	     }
3813              status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
3814              if (status)
3815                  break;
3816              status = BlastIntervalTreeAddHSP(new_hsp, tree, query_info,
3817                                      eQueryAndSubject);
3818              if (status)
3819                  break;
3820          }
3821          else {
3822             /* a greedy alignment may have traceback associated with it;
3823                free that traceback if the alignment will not be used */
3824             if (is_greedy) {
3825                gap_align->edit_script = GapEditScriptDelete(
3826                                              gap_align->edit_script);
3827             }
3828          }
3829       }
3830    }
3831 
3832    sfree(found_high_score);
3833    if (restricted_align_array) {
3834        sfree(restricted_align_array);
3835    }
3836    tree = Blast_IntervalTreeFree(tree);
3837    if (rpsblast_pssms) {
3838        gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3839    }
3840 
3841    *hsp_list_ptr = hsp_list;
3842    return status;
3843 }
3844 
3845 /** Out-of-frame gapped alignment wrapper function.
3846  * @param query Query sequence [in]
3847  * @param subject Subject sequence [in]
3848  * @param q_off Offset in query [in]
3849  * @param s_off Offset in subject [in]
3850  * @param private_q_start Extent of alignment in query [out]
3851  * @param private_s_start Extent of alignment in subject [out]
3852  * @param score_only Return score only, without traceback [in]
3853  * @param edit_block Structure to hold generated traceback [out]
3854  * @param gap_align Gapped alignment information and preallocated
3855  *                  memory [in] [out]
3856  * @param score_params Scoring parameters [in]
3857  * @param psi_offset Starting position in PSI-BLAST matrix [in]
3858  * @param reversed Direction of the extension [in]
3859  * @param switch_seq Sequences need to be switched for blastx,
3860  *                   but not for tblastn [in]
3861  */
3862 static Int4
s_OutOfFrameSemiGappedAlignWrap(const Uint1 * query,const Uint1 * subject,Int4 q_off,Int4 s_off,Int4 * private_q_start,Int4 * private_s_start,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 psi_offset,Boolean reversed,Boolean switch_seq)3863 s_OutOfFrameSemiGappedAlignWrap(const Uint1* query, const Uint1* subject, Int4 q_off,
3864    Int4 s_off, Int4* private_q_start, Int4* private_s_start,
3865    Boolean score_only, GapPrelimEditBlock *edit_block,
3866    BlastGapAlignStruct* gap_align,
3867    const BlastScoringParameters* score_params, Int4 psi_offset,
3868    Boolean reversed, Boolean switch_seq)
3869 {
3870    if (switch_seq) {
3871       return s_OutOfFrameGappedAlign(subject, query, s_off, q_off,
3872                 private_s_start, private_q_start, score_only, edit_block,
3873                 gap_align, score_params, psi_offset, reversed);
3874    } else {
3875       return s_OutOfFrameGappedAlign(query, subject, q_off, s_off,
3876                 private_q_start, private_s_start, score_only, edit_block,
3877                 gap_align, score_params, psi_offset, reversed);
3878    }
3879 }
3880 
3881 /** Maximal subject length after which the offsets are adjusted to a
3882  * subsequence.
3883  */
3884 #define MAX_SUBJECT_OFFSET 90000
3885 /** Approximate upper bound on a number of gaps in an HSP, needed to determine
3886  * the length of the subject subsequence to be retrieved for alignment with
3887  * traceback.
3888  */
3889 #define MAX_TOTAL_GAPS 3000
3890 
3891 void
AdjustSubjectRange(Int4 * subject_offset_ptr,Int4 * subject_length_ptr,Int4 query_offset,Int4 query_length,Int4 * start_shift)3892 AdjustSubjectRange(Int4* subject_offset_ptr, Int4* subject_length_ptr,
3893 		   Int4 query_offset, Int4 query_length, Int4* start_shift)
3894 {
3895    Int4 s_offset;
3896    Int4 subject_length = *subject_length_ptr;
3897    Int4 max_extension_left, max_extension_right;
3898 
3899    /* If subject sequence is not too long, leave everything as is */
3900    if (subject_length < MAX_SUBJECT_OFFSET) {
3901       *start_shift = 0;
3902       return;
3903    }
3904 
3905    s_offset = *subject_offset_ptr;
3906    /* Maximal extension length is the remaining length in the query, plus
3907       an estimate of a maximal total number of gaps. */
3908    max_extension_left = query_offset + MAX_TOTAL_GAPS;
3909    max_extension_right = query_length - query_offset + MAX_TOTAL_GAPS;
3910 
3911    if (s_offset <= max_extension_left) {
3912       *start_shift = 0;
3913    } else {
3914       *start_shift = s_offset - max_extension_left;
3915       *subject_offset_ptr = max_extension_left;
3916    }
3917 
3918    *subject_length_ptr =
3919       MIN(subject_length, s_offset + max_extension_right) - *start_shift;
3920 }
3921 
3922 /** Performs gapped extension for protein sequences, given two
3923  * sequence blocks, scoring and extension options, and an initial HSP
3924  * with information from the previously performed ungapped extension
3925  * @param program BLAST program [in]
3926  * @param query_blk The query sequence block [in]
3927  * @param subject_blk The subject sequence block [in]
3928  * @param gap_align The auxiliary structure for gapped alignment [in]
3929  * @param score_params Parameters related to scoring [in]
3930  * @param init_hsp The initial HSP information [in]
3931  * @param restricted_alignment If true and search is not out-of-frame,
3932  *              use a faster approximate gapped alignment algorithm [in]
3933  * @param fence_hit If not NULL, pointer to a boolean that is set to
3934  *                  TRUE if gapped extension reaches a neighborhood
3935  *                  of the subject sequence that is not initialized [in][out]
3936  */
3937 static Int2
s_BlastProtGappedAlignment(EBlastProgramType program,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp,Boolean restricted_alignment,Boolean * fence_hit)3938 s_BlastProtGappedAlignment(EBlastProgramType program,
3939    BLAST_SequenceBlk* query_blk, BLAST_SequenceBlk* subject_blk,
3940    BlastGapAlignStruct* gap_align,
3941    const BlastScoringParameters* score_params, BlastInitHSP* init_hsp,
3942    Boolean restricted_alignment,
3943    Boolean * fence_hit)
3944 {
3945    Boolean found_start, found_end;
3946    Int4 q_length=0, s_length=0, score_right, score_left;
3947    Int4 private_q_start, private_s_start;
3948    Uint1* query=NULL,* subject=NULL;
3949    Boolean switch_seq = FALSE;
3950    Int4 query_length = query_blk->length;
3951    Int4 subject_length = subject_blk->length;
3952    Int4 subject_shift = 0;
3953    BlastScoringOptions *score_options = score_params->options;
3954 
3955    if (gap_align == NULL)
3956       return -1;
3957 
3958    if (score_options->is_ooframe) {
3959       ASSERT(program == eBlastTypeTblastn || program == eBlastTypeBlastx);
3960 
3961       q_length = init_hsp->offsets.qs_offsets.q_off;
3962       /* For negative subject frames, make subject offset relative to the part
3963          of the mixed-frame sequence corresponding to the reverse strand. */
3964       if (program == eBlastTypeTblastn && subject_blk->frame < 0)
3965           init_hsp->offsets.qs_offsets.s_off -= subject_length + 1;
3966 
3967       s_length = init_hsp->offsets.qs_offsets.s_off;
3968 
3969       if (program == eBlastTypeBlastx) {
3970          subject = subject_blk->sequence + s_length;
3971          query = query_blk->oof_sequence + CODON_LENGTH + q_length;
3972          query_length -= CODON_LENGTH - 1;
3973          switch_seq = TRUE;
3974       } else if (program == eBlastTypeTblastn) {
3975          subject = subject_blk->oof_sequence + CODON_LENGTH + s_length;
3976          query = query_blk->sequence + q_length;
3977          subject_length -= CODON_LENGTH - 1;
3978       }
3979    } else {
3980       q_length = init_hsp->offsets.qs_offsets.q_off + 1;
3981       s_length = init_hsp->offsets.qs_offsets.s_off + 1;
3982       query = query_blk->sequence;
3983       subject = subject_blk->sequence;
3984    }
3985 
3986    AdjustSubjectRange(&s_length, &subject_length, q_length, query_length,
3987                       &subject_shift);
3988 
3989    found_start = FALSE;
3990    found_end = FALSE;
3991 
3992    /* Looking for "left" score */
3993    score_left = 0;
3994    if (q_length != 0 && s_length != 0) {
3995       found_start = TRUE;
3996       if(score_options->is_ooframe) {
3997          score_left =
3998              s_OutOfFrameSemiGappedAlignWrap(query, subject, q_length, s_length,
3999                  &private_q_start, &private_s_start, TRUE, NULL,
4000                  gap_align, score_params, q_length, TRUE, switch_seq);
4001       } else {
4002          if (restricted_alignment) {
4003             score_left = s_RestrictedGappedAlign(query, subject+subject_shift,
4004                                    q_length, s_length,
4005                                    &private_q_start, &private_s_start,
4006                                    gap_align, score_params,
4007                                    init_hsp->offsets.qs_offsets.q_off, TRUE);
4008          }
4009          else {
4010             score_left = Blast_SemiGappedAlign(query, subject+subject_shift,
4011                                    q_length, s_length,
4012                                    &private_q_start, &private_s_start, TRUE,
4013                                    NULL, gap_align, score_params,
4014                                    init_hsp->offsets.qs_offsets.q_off,
4015                                    FALSE, TRUE,
4016                                    fence_hit);
4017          }
4018       }
4019       gap_align->query_start = q_length - private_q_start;
4020       gap_align->subject_start = s_length - private_s_start + subject_shift;
4021    }
4022 
4023    score_right = 0;
4024    if (q_length < query_length && s_length < subject_length) {
4025       found_end = TRUE;
4026       if(score_options->is_ooframe) {
4027          score_right =
4028              s_OutOfFrameSemiGappedAlignWrap(query-1, subject-1,
4029                  query_length-q_length+1, subject_length-s_length+1,
4030                  &(gap_align->query_stop), &(gap_align->subject_stop),
4031                  TRUE, NULL, gap_align, score_params, q_length, FALSE,
4032                  switch_seq);
4033          gap_align->query_stop += q_length;
4034          gap_align->subject_stop += s_length + subject_shift;
4035       } else {
4036          if (restricted_alignment) {
4037             score_right = s_RestrictedGappedAlign(
4038                                    query+init_hsp->offsets.qs_offsets.q_off,
4039                                    subject+init_hsp->offsets.qs_offsets.s_off,
4040                                    query_length-q_length,
4041                                    subject_length-s_length,
4042                                    &(gap_align->query_stop),
4043                                    &(gap_align->subject_stop),
4044                                    gap_align, score_params,
4045                                    init_hsp->offsets.qs_offsets.q_off, FALSE);
4046          }
4047          else {
4048             score_right = Blast_SemiGappedAlign(
4049                                    query+init_hsp->offsets.qs_offsets.q_off,
4050                                    subject+init_hsp->offsets.qs_offsets.s_off,
4051                                    query_length-q_length,
4052                                    subject_length-s_length,
4053                                    &(gap_align->query_stop),
4054                                    &(gap_align->subject_stop),
4055                                    TRUE, NULL, gap_align, score_params,
4056                                    init_hsp->offsets.qs_offsets.q_off, FALSE,
4057                                    FALSE,
4058                                    fence_hit);
4059          }
4060          /* Make end offsets point to the byte after the end of the
4061             alignment */
4062          gap_align->query_stop += init_hsp->offsets.qs_offsets.q_off + 1;
4063          gap_align->subject_stop += init_hsp->offsets.qs_offsets.s_off + 1;
4064       }
4065    }
4066 
4067    if (found_start == FALSE) {  /* impossible for in-frame */
4068       gap_align->query_start = init_hsp->offsets.qs_offsets.q_off;
4069       gap_align->subject_start = init_hsp->offsets.qs_offsets.s_off;
4070    }
4071    if (found_end == FALSE) {    /* impossible for out-of-frame */
4072       gap_align->query_stop = init_hsp->offsets.qs_offsets.q_off;
4073       gap_align->subject_stop = init_hsp->offsets.qs_offsets.s_off;
4074    }
4075 
4076    gap_align->score = score_right+score_left;
4077 
4078    return 0;
4079 }
4080 
4081 /** Converts OOF traceback from a gapped alignment to a GapEditScript.
4082  * This function is for out-of-frame gapping.
4083  * @param rev_prelim_tback Traceback operations for left extension [in]
4084  * @param fwd_prelim_tback Traceback operations for right extension [in]
4085  * @param nucl_align_length Length of alignment on the translated
4086  *                          nucleotide sequence [in]
4087  * @param edit_script_ptr The resulting edit block [out]
4088  * @return Always zero
4089  */
4090 static Int2
s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback,Int4 nucl_align_length,GapEditScript ** edit_script_ptr)4091 s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock *rev_prelim_tback,
4092                                    GapPrelimEditBlock *fwd_prelim_tback,
4093                                    Int4 nucl_align_length,
4094                                    GapEditScript** edit_script_ptr)
4095 {
4096     GapEditScript* e_script;
4097     EGapAlignOpType last_op;
4098     Int4 last_num;
4099     GapPrelimEditBlock *tmp_prelim_tback;
4100     Int4 i, num_nuc;
4101     int extra_needed=0;
4102 
4103     /* prepend a substitution, since the input sequences were
4104        shifted prior to the OOF alignment */
4105 
4106     tmp_prelim_tback = GapPrelimEditBlockNew();
4107     last_op = eGapAlignSub;
4108     last_num = 1;
4109 
4110     /* Propagate the extra substitution through the edit script. */
4111 
4112     for (i = 0; i < rev_prelim_tback->num_ops; i++) {
4113 
4114         EGapAlignOpType next_op = rev_prelim_tback->edit_ops[i].op_type;
4115         Int4 next_num = rev_prelim_tback->edit_ops[i].num;
4116 
4117         if (next_op == last_op) {
4118 
4119             /* merge consecutive operations that are identical */
4120 
4121             last_num += next_num;
4122         }
4123         else if (next_op == eGapAlignIns || next_op == eGapAlignDel) {
4124 
4125             /* Propagating the initial extra substitution requires
4126                that edit operations which are not in-frame gaps must
4127                'flow through' an in-frame gap; the last non-gap
4128                operation before the gap will appear after the gap.
4129                This could mean either one or two edit blocks
4130                getting created */
4131 
4132             if (last_num > 1) {
4133                 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num - 1);
4134             }
4135             GapPrelimEditBlockAdd(tmp_prelim_tback, next_op, next_num);
4136             last_num = 1;          /* last_op unchanged; one 'orphan' now */
4137         }
4138         else {
4139 
4140             /* write out the last edit operation and
4141                move forward in the script */
4142 
4143             GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4144             last_op = next_op;
4145             last_num = next_num;
4146         }
4147     }
4148 
4149     /* Handle the crossing over from the left extension to
4150        the right extension. Begin by writing out the final
4151        group of ops from the left extension, except for the
4152        one op that must be merged */
4153 
4154     last_num--;
4155     if (last_num > 0) {
4156         GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4157     }
4158 
4159     if (last_op != eGapAlignSub) {
4160 
4161         /* the last traceback letter from the left extension must
4162            be folded into the right extension. */
4163 
4164         for (i = fwd_prelim_tback->num_ops - 1; i >= 0; i--) {
4165 
4166             GapPrelimEditScript *new_script = fwd_prelim_tback->edit_ops + i;
4167 
4168             if (new_script->op_type == eGapAlignIns ||
4169                 new_script->op_type == eGapAlignDel) {
4170 
4171                 /* do not merge with in-frame gaps; just write out
4172                    the gaps and 'flow through' the last op */
4173 
4174                 GapPrelimEditBlockAdd(tmp_prelim_tback, new_script->op_type,
4175                                       new_script->num);
4176             }
4177             else {
4178 
4179                 /* merge with the first letter of new_script */
4180 
4181                 last_op += new_script->op_type - eGapAlignSub;
4182                 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, 1);
4183 
4184                 /* borrow one letter from new_script to complete the
4185                    merge, and skip new_script if it only had one letter */
4186                 new_script->num--;
4187                 if (new_script->num == 0)
4188                     i--;
4189                 break;
4190             }
4191         }
4192         fwd_prelim_tback->num_ops = i + 1;
4193     }
4194 
4195     /* form the final edit block */
4196     e_script = Blast_PrelimEditBlockToGapEditScript(tmp_prelim_tback, fwd_prelim_tback);
4197     GapPrelimEditBlockFree(tmp_prelim_tback);
4198 
4199     /* postprocess the edit script */
4200     num_nuc = 0;
4201     for (i=0; i<e_script->size; i++)
4202     {
4203         int total_actions=0;
4204         /* Count the number of nucleotides in the next
4205            traceback operation and delete any traceback operations
4206            that would make the alignment too long. This check is
4207            not needed for in-frame alignment because the
4208            traceback is never changed after it is computed */
4209 
4210         last_op = e_script->op_type[i];
4211 
4212         if (last_op == eGapAlignIns)
4213             last_op = eGapAlignSub;
4214 
4215         total_actions = last_op * e_script->num[i];
4216 
4217         if (num_nuc + total_actions >= nucl_align_length) {
4218             e_script->num[i] = (nucl_align_length - num_nuc +
4219                              last_op - 1) / last_op;
4220             break; /* We delete the rest of the script. */
4221         }
4222         else {
4223             num_nuc += total_actions;;
4224         }
4225     }
4226     e_script->size = MIN(i+1, e_script->size);  /* If we broke out early then we truncate the edit script. */
4227 
4228     extra_needed = 0;
4229     for (i=0; i<e_script->size; i++)
4230     {
4231         if (e_script->op_type[i] % 3 != 0 && e_script->num[i] > 1) {
4232            extra_needed += e_script->num[i] - 1;
4233         }
4234     }
4235 
4236     if (extra_needed)
4237     {
4238         GapEditScript* new_esp = GapEditScriptNew(extra_needed+e_script->size);
4239         int new_esp_i=0;
4240         for (i=0; i<e_script->size; i++)
4241         {
4242            /* Only one frame shift operation at a time is
4243            allowed in a single edit script element. */
4244            new_esp->num[new_esp_i] = e_script->num[i];
4245            new_esp->op_type[new_esp_i] = e_script->op_type[i];
4246            new_esp_i++;
4247            last_op = e_script->op_type[i];
4248            if (last_op % 3 != 0 && e_script->num[i] > 1) {
4249                Int4 num_ops = e_script->num[i];
4250                int esp_index=0;
4251                new_esp->num[new_esp_i-1] = 1;
4252                for (esp_index = 1; esp_index < num_ops; esp_index++) {
4253                    new_esp->num[new_esp_i] = 1;
4254                    new_esp->op_type[new_esp_i] = last_op;
4255                    new_esp_i++;
4256                }
4257            }
4258        }
4259        e_script = GapEditScriptDelete(e_script);
4260        e_script = new_esp;
4261     }
4262     *edit_script_ptr = e_script;
4263 
4264     /* finally, add one to the size of any block of substitutions
4265        that follows a frame shift op */
4266     last_op = e_script->op_type[0];
4267     for (i=1; i<e_script->size; i++)
4268     {
4269         if (e_script->op_type[i] == eGapAlignSub && (last_op % 3) != 0)
4270             (e_script->num[i])++;
4271 
4272         last_op = e_script->op_type[i];
4273     }
4274 
4275     return 0;
4276 }
4277 
BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,const Uint1 * query,const Uint1 * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_start,Int4 s_start,Int4 query_length,Int4 subject_length,Boolean * fence_hit)4278 Int2 BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,
4279         const Uint1* query, const Uint1* subject, BlastGapAlignStruct* gap_align,
4280         const BlastScoringParameters* score_params,
4281         Int4 q_start, Int4 s_start, Int4 query_length, Int4 subject_length,
4282         Boolean * fence_hit)
4283 {
4284     Boolean found_end;
4285     Int4 score_right, score_left, private_q_length, private_s_length;
4286     Int4 q_length, s_length;
4287     Boolean is_ooframe = score_params->options->is_ooframe;
4288     Int2 status = 0;
4289     Boolean switch_seq = FALSE;
4290     GapPrelimEditBlock *fwd_prelim_tback;
4291     GapPrelimEditBlock *rev_prelim_tback;
4292 
4293     if (gap_align == NULL)
4294         return -1;
4295 
4296     fwd_prelim_tback = gap_align->fwd_prelim_tback;
4297     rev_prelim_tback = gap_align->rev_prelim_tback;
4298     GapPrelimEditBlockReset(fwd_prelim_tback);
4299     GapPrelimEditBlockReset(rev_prelim_tback);
4300 
4301     found_end = FALSE;
4302 
4303     q_length = query_length;
4304     s_length = subject_length;
4305     if (is_ooframe) {
4306        /* The mixed frame sequence is shifted to the 3rd position, so its
4307           maximal available length for extension is less by 2 than its
4308           total length. */
4309        switch_seq = (program == eBlastTypeBlastx);
4310        if (switch_seq) {
4311           q_length -= CODON_LENGTH - 1;
4312        } else {
4313           s_length -= CODON_LENGTH - 1;
4314        }
4315     }
4316 
4317     score_left = 0;
4318 
4319     if(is_ooframe) {
4320        /* NB: Left extension does not include starting point corresponding
4321           to the offset pair; the right extension does. */
4322        score_left =
4323           s_OutOfFrameSemiGappedAlignWrap(query+q_start, subject+s_start,
4324              q_start, s_start, &private_q_length, &private_s_length,
4325              FALSE, rev_prelim_tback, gap_align, score_params, q_start,
4326              TRUE, switch_seq);
4327        gap_align->query_start = q_start - private_q_length;
4328        gap_align->subject_start = s_start - private_s_length;
4329     } else {
4330        /* NB: The left extension includes the starting point
4331           [q_start,s_start]; the right extension does not. */
4332        // <AG> score_left =
4333        //    Blast_SemiGappedAlign(query, subject, q_start+1, s_start+1,
4334        //       &private_q_length, &private_s_length, FALSE, rev_prelim_tback,
4335        //       gap_align, score_params, q_start, FALSE, TRUE,
4336        //       fence_hit);
4337 
4338         score_left = ALIGN_EX(query, subject, q_start+1, s_start+1,  &private_q_length, &private_s_length, rev_prelim_tback,
4339                               gap_align,
4340                               score_params, q_start, FALSE /*reversed*/, TRUE /*reverse_sequence*/,
4341              fence_hit);
4342 
4343        gap_align->query_start = q_start - private_q_length + 1;
4344        gap_align->subject_start = s_start - private_s_length + 1;
4345     }
4346 
4347     score_right = 0;
4348 
4349     if ((! (fence_hit && *fence_hit)) &&
4350         (q_start < q_length) &&
4351         (s_start < s_length)) {
4352 
4353        found_end = TRUE;
4354        if(is_ooframe) {
4355           score_right =
4356              s_OutOfFrameSemiGappedAlignWrap(query+q_start-1, subject+s_start-1,
4357                 q_length-q_start, s_length-s_start,
4358                 &private_q_length, &private_s_length, FALSE, fwd_prelim_tback,
4359                 gap_align, score_params, q_start, FALSE, switch_seq);
4360         } else {
4361             // <AG> score_right =
4362             //    Blast_SemiGappedAlign(query+q_start, subject+s_start,
4363             //       q_length-q_start-1, s_length-s_start-1, &private_q_length,
4364             //       &private_s_length, FALSE, fwd_prelim_tback, gap_align,
4365             //       score_params, q_start, FALSE, FALSE,
4366             //       fence_hit);
4367             score_right =
4368                ALIGN_EX(query+q_start, subject+s_start,
4369                   q_length-q_start-1, s_length-s_start-1, &private_q_length,
4370                   &private_s_length, fwd_prelim_tback, gap_align,
4371                   score_params, q_start, FALSE, FALSE,
4372                   fence_hit);
4373         }
4374 
4375         gap_align->query_stop = q_start + private_q_length + 1;
4376         gap_align->subject_stop = s_start + private_s_length + 1;
4377     }
4378 
4379     if (found_end == FALSE) {
4380         gap_align->query_stop = q_start - 1;
4381         gap_align->subject_stop = s_start - 1;
4382     }
4383 
4384     if(is_ooframe) {
4385         Int4 nucl_align_length;
4386         if (program == eBlastTypeBlastx) {
4387             nucl_align_length = gap_align->query_stop -
4388                                 gap_align->query_start;
4389         }
4390         else {
4391             nucl_align_length = gap_align->subject_stop -
4392                                 gap_align->subject_start;
4393         }
4394         status = s_BlastOOFTracebackToGapEditScript(rev_prelim_tback,
4395                        fwd_prelim_tback, nucl_align_length,
4396                        &gap_align->edit_script);
4397     } else {
4398         Int4 i;
4399         GapEditScript *esp = Blast_PrelimEditBlockToGapEditScript(
4400                                                         rev_prelim_tback,
4401                                                         fwd_prelim_tback);
4402 
4403         /* rarely (typically when the scoring system changes between
4404            the score-only and traceback stages, as happens with
4405            composition-based statistics) it is possible to compute an
4406            optimal alignment with a leading or trailing gap. Prune
4407            these unneeded gaps here and update the score and
4408            alignment boundaries */
4409 
4410         gap_align->edit_script = esp;
4411         if (esp) {
4412             while (esp->size && esp->op_type[0] != eGapAlignSub) {
4413                 score_left += score_params->gap_open +
4414                              esp->num[0] * score_params->gap_extend;
4415 
4416                 if (esp->op_type[0] == eGapAlignDel)
4417                     gap_align->subject_start += esp->num[0];
4418                 else
4419                     gap_align->query_start += esp->num[0];
4420 
4421                 for (i = 1; i < esp->size; i++) {
4422                     esp->op_type[i-1] = esp->op_type[i];
4423                     esp->num[i-1] = esp->num[i];
4424                 }
4425                 esp->size--;
4426             }
4427             i = esp->size;
4428             while (esp->size && esp->op_type[i-1] != eGapAlignSub) {
4429                 score_right += score_params->gap_open +
4430                              esp->num[i-1] * score_params->gap_extend;
4431 
4432                 if (esp->op_type[i-1] == eGapAlignDel)
4433                     gap_align->subject_stop -= esp->num[i-1];
4434                 else
4435                     gap_align->query_stop -= esp->num[i-1];
4436 
4437                 esp->size--;
4438                 i--;
4439                 ASSERT(esp->size == i);
4440             }
4441         }
4442     }
4443 
4444     gap_align->score = score_right + score_left;
4445     return status;
4446 }
4447 
BLAST_GetUngappedHSPList(BlastInitHitList * init_hitlist,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,const BlastHitSavingOptions * hit_options,BlastHSPList ** hsp_list_ptr)4448 Int2 BLAST_GetUngappedHSPList(BlastInitHitList* init_hitlist,
4449                               BlastQueryInfo* query_info,
4450                               BLAST_SequenceBlk* subject,
4451                               const BlastHitSavingOptions* hit_options,
4452                               BlastHSPList** hsp_list_ptr)
4453 {
4454    BlastHSPList* hsp_list = NULL;
4455    Int4 index;
4456    BlastInitHSP* init_hsp;
4457    const int kHspNumMax = BlastHspNumMax(FALSE, hit_options);
4458 
4459    /* The BlastHSPList structure can be allocated and passed from outside */
4460    if (*hsp_list_ptr != NULL)
4461       hsp_list = *hsp_list_ptr;
4462 
4463    if (!init_hitlist) {
4464       if (!hsp_list)
4465          *hsp_list_ptr = NULL;
4466       else
4467          hsp_list->hspcnt = 0;
4468       return 0;
4469    }
4470 
4471    for (index = 0; index < init_hitlist->total; ++index) {
4472       BlastHSP* new_hsp;
4473       BlastUngappedData* ungapped_data=NULL;
4474       Int4 context = 0;
4475       init_hsp = &init_hitlist->init_hsp_array[index];
4476       if (!init_hsp->ungapped_data)
4477          continue;
4478 
4479       if (!hsp_list) {
4480          hsp_list = Blast_HSPListNew(kHspNumMax);
4481          *hsp_list_ptr = hsp_list;
4482       }
4483       /* Adjust the initial HSP's coordinates in case of concatenated
4484          multiple queries/strands/frames */
4485       context = s_GetUngappedHSPContext(query_info, init_hsp);
4486       s_AdjustInitialHSPOffsets(init_hsp,
4487                                 query_info->contexts[context].query_offset);
4488       ungapped_data = init_hsp->ungapped_data;
4489       Blast_HSPInit(ungapped_data->q_start,
4490                     ungapped_data->length+ungapped_data->q_start,
4491                     ungapped_data->s_start,
4492                     ungapped_data->length+ungapped_data->s_start,
4493                     init_hsp->offsets.qs_offsets.q_off,
4494                     init_hsp->offsets.qs_offsets.s_off,
4495                     context, query_info->contexts[context].frame,
4496                     subject->frame, ungapped_data->score, NULL, &new_hsp);
4497       Blast_HSPListSaveHSP(hsp_list, new_hsp);
4498    }
4499 
4500    /* Sort the HSP array by score */
4501    Blast_HSPListSortByScore(hsp_list);
4502 
4503    return 0;
4504 }
4505 
4506