1 /* $Id: blast_gapalign.c 581681 2019-03-04 21:01:49Z boratyng $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 *
27 */
28
29 /** @file blast_gapalign.c
30 * Functions to perform gapped alignment
31 */
32
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_gapalign.h>
35 #include <algo/blast/core/blast_util.h> /* for NCBI2NA_UNPACK_BASE macros */
36 #include <algo/blast/core/greedy_align.h>
37 #include "blast_gapalign_priv.h"
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41
42 static Int2 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
43 BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
44 const BlastScoringParameters* score_params, BlastInitHSP* init_hsp);
45 static Int4 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
46 Int4* pej, Int4* pei, BlastGapAlignStruct* gap_align,
47 const BlastScoringParameters* score_params, Boolean reverse_sequence,
48 Int4 x_dropoff);
49
50 static Int2 s_BlastProtGappedAlignment(EBlastProgramType program,
51 BLAST_SequenceBlk* query_in, BLAST_SequenceBlk* subject_in,
52 BlastGapAlignStruct* gap_align,
53 const BlastScoringParameters* score_params,
54 BlastInitHSP* init_hsp, Boolean restricted_alignment,
55 Boolean * fence_hit);
56
57 /** Lower bound for scores. Divide by two to prevent underflows. */
58 #define MININT INT4_MIN/2
59
60 /** Minimal size of a chunk for state array allocation. */
61 #define CHUNKSIZE 2097152
62
63 /** Retrieve the state structure corresponding to a given length
64 * @param head Pointer to the first element of the state structures
65 * array [in]
66 * @param length The length for which the state structure has to be
67 * found [in]
68 * @return The found or created state structure
69 */
70 static GapStateArrayStruct*
s_GapGetState(GapStateArrayStruct ** head,Int4 length)71 s_GapGetState(GapStateArrayStruct** head, Int4 length)
72
73 {
74 GapStateArrayStruct* retval,* var,* last;
75 Int4 chunksize = MAX(CHUNKSIZE, length + length/3);
76
77 length += length/3; /* Add on about 30% so the end will get reused. */
78 retval = NULL;
79 if (*head == NULL) {
80 retval = (GapStateArrayStruct*)
81 malloc(sizeof(GapStateArrayStruct));
82 retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
83 retval->length = chunksize;
84 retval->used = 0;
85 retval->next = NULL;
86 *head = retval;
87 } else {
88 var = *head;
89 last = *head;
90 while (var) {
91 if (length < (var->length - var->used)) {
92 retval = var;
93 break;
94 } else if (var->used == 0) {
95 /* If it's empty and too small, replace. */
96 sfree(var->state_array);
97 var->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
98 var->length = chunksize;
99 retval = var;
100 break;
101 }
102 last = var;
103 var = var->next;
104 }
105
106 if (var == NULL)
107 {
108 retval = (GapStateArrayStruct*) malloc(sizeof(GapStateArrayStruct));
109 retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
110 retval->length = chunksize;
111 retval->used = 0;
112 retval->next = NULL;
113 last->next = retval;
114 }
115 }
116
117 #ifdef ERR_POST_EX_DEFINED
118 if (retval->state_array == NULL)
119 ErrPostEx(SEV_ERROR, 0, 0, "state array is NULL");
120 #endif
121
122 return retval;
123
124 }
125
126 /** Remove a state from a state structure */
127 static Boolean
s_GapPurgeState(GapStateArrayStruct * state_struct)128 s_GapPurgeState(GapStateArrayStruct* state_struct)
129 {
130 while (state_struct)
131 {
132 /*
133 memset(state_struct->state_array, 0, state_struct->used);
134 */
135 state_struct->used = 0;
136 state_struct = state_struct->next;
137 }
138
139 return TRUE;
140 }
141
142 /** Deallocate the memory for greedy gapped alignment */
143 static SGreedyAlignMem*
s_BlastGreedyAlignsFree(SGreedyAlignMem * gamp)144 s_BlastGreedyAlignsFree(SGreedyAlignMem* gamp)
145 {
146 if (gamp->last_seq2_off) {
147 sfree(gamp->last_seq2_off[0]);
148 sfree(gamp->last_seq2_off);
149 } else {
150 if (gamp->last_seq2_off_affine) {
151 sfree(gamp->last_seq2_off_affine[0]);
152 sfree(gamp->last_seq2_off_affine);
153 }
154 sfree(gamp->diag_bounds);
155 }
156 sfree(gamp->max_score);
157 if (gamp->space)
158 MBSpaceFree(gamp->space);
159 sfree(gamp);
160 return NULL;
161 }
162
163 /** Allocate memory for the greedy gapped alignment algorithm
164 * @param score_params Parameters related to scoring [in]
165 * @param ext_params Options and parameters related to the extension [in]
166 * @param max_d The maximum distance to search [in]
167 * @param Xdrop The Xdrop value [in]
168 * @return The allocated SGreedyAlignMem structure
169 */
170 static SGreedyAlignMem*
s_BlastGreedyAlignMemAlloc(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Int4 max_d,Int4 Xdrop)171 s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params,
172 const BlastExtensionParameters* ext_params,
173 Int4 max_d, Int4 Xdrop)
174 {
175 SGreedyAlignMem* gamp;
176 Int4 max_d_1, d_diff, max_cost, gd, i;
177 Int4 reward, penalty, gap_open, gap_extend;
178 Int4 Mis_cost, GE_cost;
179
180 if (score_params == NULL || (!ext_params && !Xdrop))
181 return NULL;
182
183 if (score_params->reward % 2 == 1) {
184 reward = 2*score_params->reward;
185 penalty = -2*score_params->penalty;
186 if (!Xdrop)
187 Xdrop = 2*MAX(ext_params->gap_x_dropoff,
188 ext_params->gap_x_dropoff_final);
189 gap_open = 2*score_params->gap_open;
190 gap_extend = 2*score_params->gap_extend;
191 } else {
192 reward = score_params->reward;
193 penalty = -score_params->penalty;
194 if (!Xdrop)
195 Xdrop = MAX(ext_params->gap_x_dropoff,
196 ext_params->gap_x_dropoff_final);
197 gap_open = score_params->gap_open;
198 gap_extend = score_params->gap_extend;
199 }
200
201 if (gap_open == 0 && gap_extend == 0)
202 gap_extend = reward / 2 + penalty;
203
204 gamp = (SGreedyAlignMem*) calloc(1, sizeof(SGreedyAlignMem));
205 gamp->max_dist = max_d;
206 gamp->xdrop = Xdrop;
207
208 if (score_params->gap_open==0 && score_params->gap_extend==0) {
209 d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
210
211 gamp->last_seq2_off = (Int4**) malloc((max_d + 2) * sizeof(Int4*));
212 if (gamp->last_seq2_off == NULL) {
213 sfree(gamp);
214 return NULL;
215 }
216 gamp->last_seq2_off[0] =
217 (Int4*) malloc((max_d + max_d + 6) * sizeof(Int4) * 2);
218 if (gamp->last_seq2_off[0] == NULL) {
219 #ifdef ERR_POST_EX_DEFINED
220 ErrPostEx(SEV_WARNING, 0, 0,
221 "Failed to allocate %ld bytes for greedy alignment",
222 (max_d + max_d + 6) * sizeof(Int4) * 2);
223 #endif
224 s_BlastGreedyAlignsFree(gamp);
225 return NULL;
226 }
227
228 gamp->last_seq2_off[1] = gamp->last_seq2_off[0] + max_d + max_d + 6;
229 gamp->last_seq2_off_affine = NULL;
230 gamp->diag_bounds = NULL;
231 } else {
232 gamp->last_seq2_off = NULL;
233 Mis_cost = reward + penalty;
234 GE_cost = gap_extend+reward/2;
235 max_d_1 = max_d;
236 max_d *= GE_cost;
237 max_cost = MAX(Mis_cost, gap_open+GE_cost);
238 gd = BLAST_Gdb3(&Mis_cost, &gap_open, &GE_cost);
239 d_diff = (Xdrop+reward/2)/gd+1;
240 gamp->diag_bounds = (Int4*) calloc(2*(max_d+1+max_cost), sizeof(Int4));
241 gamp->last_seq2_off_affine = (SGreedyOffset**)
242 malloc((MAX(max_d, max_cost) + 2) * sizeof(SGreedyOffset*));
243 if (!gamp->diag_bounds || !gamp->last_seq2_off_affine) {
244 s_BlastGreedyAlignsFree(gamp);
245 return NULL;
246 }
247 gamp->last_seq2_off_affine[0] = (SGreedyOffset*)
248 calloc((2*max_d_1 + 6) , sizeof(SGreedyOffset) * (max_cost+1));
249 for (i = 1; i <= max_cost; i++)
250 gamp->last_seq2_off_affine[i] =
251 gamp->last_seq2_off_affine[i-1] + 2*max_d_1 + 6;
252 if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0]) {
253 s_BlastGreedyAlignsFree(gamp);
254 return NULL;
255 }
256 }
257 gamp->max_score = (Int4*) malloc(sizeof(Int4) * (max_d + 1 + d_diff));
258
259 gamp->space = MBSpaceNew(0);
260 if (!gamp->max_score || !gamp->space) {
261 /* Failure in one of the memory allocations */
262 s_BlastGreedyAlignsFree(gamp);
263 return NULL;
264 }
265
266 return gamp;
267 }
268
269 /* Documented in blast_gapalign.h */
270 BlastGapAlignStruct*
BLAST_GapAlignStructFree(BlastGapAlignStruct * gap_align)271 BLAST_GapAlignStructFree(BlastGapAlignStruct* gap_align)
272 {
273 if (!gap_align)
274 return NULL;
275
276 GapEditScriptDelete(gap_align->edit_script);
277 GapPrelimEditBlockFree(gap_align->fwd_prelim_tback);
278 GapPrelimEditBlockFree(gap_align->rev_prelim_tback);
279 if (gap_align->greedy_align_mem)
280 s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
281 GapStateFree(gap_align->state_struct);
282 sfree(gap_align->dp_mem);
283 JumperGapAlignFree(gap_align->jumper);
284
285 sfree(gap_align);
286 return NULL;
287 }
288
289 /* Documented in blast_gapalign.h */
290 Int2
BLAST_GapAlignStructNew(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Uint4 max_subject_length,BlastScoreBlk * sbp,BlastGapAlignStruct ** gap_align_ptr)291 BLAST_GapAlignStructNew(const BlastScoringParameters* score_params,
292 const BlastExtensionParameters* ext_params,
293 Uint4 max_subject_length,
294 BlastScoreBlk* sbp, BlastGapAlignStruct** gap_align_ptr)
295 {
296 Int2 status = 0;
297 BlastGapAlignStruct* gap_align;
298
299 if (!gap_align_ptr || !sbp || !score_params || !ext_params)
300 return -1;
301
302 gap_align = (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
303
304 *gap_align_ptr = gap_align;
305
306 gap_align->sbp = sbp;
307
308 gap_align->gap_x_dropoff = ext_params->gap_x_dropoff;
309 gap_align->max_mismatches = ext_params->options->max_mismatches;
310 gap_align->mismatch_window = ext_params->options->mismatch_window;
311
312 /* allocate memory either for dynamic programming or jumper */
313 if (ext_params->options->ePrelimGapExt != eJumperWithTraceback) {
314 if (ext_params->options->ePrelimGapExt == eDynProgScoreOnly) {
315 /* allocate structures for ordinary dynamic programming */
316 gap_align->dp_mem_alloc = 1000;
317 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
318 sizeof(BlastGapDP));
319 if (!gap_align->dp_mem)
320 gap_align = BLAST_GapAlignStructFree(gap_align);
321 }
322 else {
323 /* allocate structures for greedy dynamic programming */
324 max_subject_length = MIN(max_subject_length, MAX_DBSEQ_LEN);
325 max_subject_length = MIN(GREEDY_MAX_COST,
326 max_subject_length / GREEDY_MAX_COST_FRACTION + 1);
327 gap_align->greedy_align_mem =
328 s_BlastGreedyAlignMemAlloc(score_params, ext_params,
329 max_subject_length, 0);
330 if (!gap_align->greedy_align_mem)
331 gap_align = BLAST_GapAlignStructFree(gap_align);
332 }
333 }
334 else {
335 gap_align->jumper = JumperGapAlignNew(200);
336 if (ext_params->gap_x_dropoff == 0) {
337 gap_align->gap_x_dropoff = 3 * MAX(-score_params->penalty,
338 score_params->gap_open +
339 score_params->gap_extend);
340 }
341 }
342
343 if (!gap_align)
344 return -1;
345
346 gap_align->positionBased = (sbp->psi_matrix != NULL);
347
348 gap_align->fwd_prelim_tback = GapPrelimEditBlockNew();
349 gap_align->rev_prelim_tback = GapPrelimEditBlockNew();
350
351 return status;
352 }
353
354 /** Values for the editing script operations in traceback */
355 enum {
356 SCRIPT_SUB = eGapAlignSub, /**< Substitution */
357 SCRIPT_GAP_IN_A = eGapAlignDel, /**< Deletion */
358 SCRIPT_GAP_IN_B = eGapAlignIns, /**< Insertion */
359 SCRIPT_OP_MASK = 0x07, /**< Mask for edit script operations */
360
361 SCRIPT_EXTEND_GAP_A = 0x10, /**< continue a gap in A */
362 SCRIPT_EXTEND_GAP_B = 0x40 /**< continue a gap in B */
363 };
364
365 Int4
ALIGN_EX(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)366 ALIGN_EX(const Uint1* A, const Uint1* B, Int4 M, Int4 N, Int4* a_offset,
367 Int4* b_offset, GapPrelimEditBlock *edit_block,
368 BlastGapAlignStruct* gap_align,
369 const BlastScoringParameters* score_params, Int4 query_offset,
370 Boolean reversed, Boolean reverse_sequence,
371 Boolean * fence_hit)
372 {
373 /* See Blast_SemiGappedAlign for more general comments on
374 what this code is doing; comments in this function
375 only apply to the traceback computations */
376
377 Int4 i;
378 Int4 a_index;
379 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
380 const Uint1* b_ptr;
381
382 BlastGapDP* score_array;
383
384 Int4 gap_open;
385 Int4 gap_extend;
386 Int4 gap_open_extend;
387 Int4 x_dropoff;
388 Int4 best_score;
389
390 Int4** matrix = NULL;
391 Int4** pssm = NULL;
392 Int4* matrix_row = NULL;
393
394 Int4 score;
395 Int4 score_gap_row;
396 Int4 score_gap_col;
397 Int4 next_score;
398
399 GapStateArrayStruct* state_struct;
400 Uint1* edit_script_row;
401 Uint1** edit_script;
402 Int4 *edit_start_offset;
403 Int4 edit_script_num_rows;
404 Int4 orig_b_index;
405 Uint1 script, next_script, script_row, script_col;
406 Int4 num_extra_cells;
407
408 matrix = gap_align->sbp->matrix->data;
409 if (gap_align->positionBased) {
410 pssm = gap_align->sbp->psi_matrix->pssm->data;
411 }
412
413 *a_offset = 0;
414 *b_offset = 0;
415 gap_open = score_params->gap_open;
416 gap_extend = score_params->gap_extend;
417 gap_open_extend = gap_open + gap_extend;
418 x_dropoff = gap_align->gap_x_dropoff;
419
420 if (x_dropoff < gap_open_extend)
421 x_dropoff = gap_open_extend;
422
423 if(N <= 0 || M <= 0)
424 return 0;
425
426 /* Initialize traceback information. edit_script[] is
427 a 2-D array which is filled in row by row as the
428 dynamic programming takes place */
429
430 s_GapPurgeState(gap_align->state_struct);
431
432 /* Conceptually, traceback requires a byte array of size
433 MxN. To save on memory, each row of the array is allocated
434 separately. edit_script[i] points to the storage reserved
435 for row i, and edit_start_offset[i] gives the offset into
436 the B sequence corresponding to element 0 of edit_script[i].
437
438 Also make the number of edit script rows grow dynamically */
439
440 edit_script_num_rows = 100;
441 edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
442 edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
443
444 /* allocate storage for the first row of the traceback
445 array. Because row elements correspond to gaps in A,
446 the alignment can only go x_dropoff/gap_extend positions
447 at most before failing the X dropoff criterion */
448
449 if (gap_extend > 0)
450 num_extra_cells = x_dropoff / gap_extend + 3;
451 else
452 num_extra_cells = N + 3;
453
454 if (num_extra_cells > gap_align->dp_mem_alloc) {
455 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
456 2 * gap_align->dp_mem_alloc);
457 sfree(gap_align->dp_mem);
458 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
459 sizeof(BlastGapDP));
460 }
461
462 state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
463
464 edit_script[0] = state_struct->state_array;
465 edit_start_offset[0] = 0;
466 edit_script_row = state_struct->state_array;
467
468 score = -gap_open_extend;
469 score_array = gap_align->dp_mem;
470 score_array[0].best = 0;
471 score_array[0].best_gap = -gap_open_extend;
472
473 for (i = 1; i <= N; i++) {
474 if (score < -x_dropoff)
475 break;
476
477 score_array[i].best = score;
478 score_array[i].best_gap = score - gap_open_extend;
479 score -= gap_extend;
480 edit_script_row[i] = SCRIPT_GAP_IN_A;
481 }
482 state_struct->used = i + 1;
483
484 b_size = i;
485 best_score = 0;
486 first_b_index = 0;
487 if (reverse_sequence)
488 b_increment = -1;
489 else
490 b_increment = 1;
491
492 for (a_index = 1; a_index <= M; a_index++) {
493
494 /* Set up the next row of the edit script; this involves
495 allocating memory for the row, then pointing to it.
496 It is not necessary to allocate space for offsets less
497 than first_b_index (since the inner loop will never
498 look at them);
499
500 It is unknown at this point how far to the right the
501 current traceback row will extend; all that's known for
502 sure is that the previous row fails the X-dropoff test
503 after b_size cells, and that the current row can go at
504 most num_extra_cells beyond that before failing the test */
505
506 if (gap_extend > 0)
507 state_struct = s_GapGetState(&gap_align->state_struct,
508 b_size - first_b_index + num_extra_cells);
509 else
510 state_struct = s_GapGetState(&gap_align->state_struct,
511 N + 3 - first_b_index);
512
513 if (a_index == edit_script_num_rows) {
514 edit_script_num_rows = edit_script_num_rows * 2;
515 edit_script = (Uint1 **)realloc(edit_script,
516 edit_script_num_rows *
517 sizeof(Uint1 *));
518 edit_start_offset = (Int4 *)realloc(edit_start_offset,
519 edit_script_num_rows *
520 sizeof(Int4));
521 }
522
523 edit_script[a_index] = state_struct->state_array +
524 state_struct->used + 1;
525 edit_start_offset[a_index] = first_b_index;
526
527 /* the inner loop assumes the current traceback
528 row begins at offset zero of B */
529
530 edit_script_row = edit_script[a_index] - first_b_index;
531 orig_b_index = first_b_index;
532
533 if (!(gap_align->positionBased)) {
534 if(reverse_sequence)
535 matrix_row = matrix[ A[ M - a_index ] ];
536 else
537 matrix_row = matrix[ A[ a_index ] ];
538 }
539 else {
540 if(reversed || reverse_sequence)
541 matrix_row = pssm[M - a_index];
542 else
543 matrix_row = pssm[a_index + query_offset];
544 }
545
546 if(reverse_sequence)
547 b_ptr = &B[N - first_b_index];
548 else
549 b_ptr = &B[first_b_index];
550
551 score = MININT;
552 score_gap_row = MININT;
553 last_b_index = first_b_index;
554
555 for (b_index = first_b_index; b_index < b_size; b_index++) {
556 int matrix_index = 0;
557
558 b_ptr += b_increment;
559 score_gap_col = score_array[b_index].best_gap;
560
561 matrix_index = *b_ptr;
562
563 if (matrix_index == FENCE_SENTRY) {
564 if (fence_hit) {
565 *fence_hit = 1;
566 }
567 break;
568 }
569
570 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
571
572 /* script, script_row and script_col contain the
573 actions specified by the dynamic programming.
574 when the inner loop has finished, 'script' con-
575 tains all of the actions to perform, and is
576 written to edit_script[a_index][b_index]. Otherwise,
577 this inner loop is exactly the same as the one
578 in Blast_SemiGappedAlign() */
579
580 script = SCRIPT_SUB;
581 script_col = SCRIPT_EXTEND_GAP_B;
582 script_row = SCRIPT_EXTEND_GAP_A;
583
584 if (score < score_gap_col) {
585 script = SCRIPT_GAP_IN_B;
586 score = score_gap_col;
587 }
588 if (score < score_gap_row) {
589 script = SCRIPT_GAP_IN_A;
590 score = score_gap_row;
591 }
592
593 if (best_score - score > x_dropoff) {
594
595 if (first_b_index == b_index)
596 first_b_index++;
597 else
598 score_array[b_index].best = MININT;
599 }
600 else {
601 last_b_index = b_index;
602 if (score > best_score) {
603 best_score = score;
604 *a_offset = a_index;
605 *b_offset = b_index;
606 }
607
608 score_gap_row -= gap_extend;
609 score_gap_col -= gap_extend;
610 if (score_gap_col < (score - gap_open_extend)) {
611 score_array[b_index].best_gap = score - gap_open_extend;
612 }
613 else {
614 score_array[b_index].best_gap = score_gap_col;
615 script += script_col;
616 }
617
618 if (score_gap_row < (score - gap_open_extend))
619 score_gap_row = score - gap_open_extend;
620 else
621 script += script_row;
622
623 score_array[b_index].best = score;
624 }
625
626 score = next_score;
627 edit_script_row[b_index] = script;
628 }
629
630 if (first_b_index == b_size || (fence_hit && *fence_hit))
631 break;
632
633 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
634
635 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
636 2 * gap_align->dp_mem_alloc);
637 score_array = (BlastGapDP *)realloc(score_array,
638 gap_align->dp_mem_alloc *
639 sizeof(BlastGapDP));
640 gap_align->dp_mem = score_array;
641 }
642
643
644 if (last_b_index < b_size - 1) {
645 b_size = last_b_index + 1;
646 }
647 else {
648 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
649
650 score_array[b_size].best = score_gap_row;
651 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
652 score_gap_row -= gap_extend;
653 edit_script_row[b_size] = SCRIPT_GAP_IN_A;
654 b_size++;
655 }
656 }
657
658 /* update the memory allocator to reflect the exact number
659 of traceback cells this row needed */
660
661 state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
662
663 if (b_size <= N) {
664 score_array[b_size].best = MININT;
665 score_array[b_size].best_gap = MININT;
666 b_size++;
667 }
668 }
669
670 /* Pick the optimal path through the now complete
671 edit_script[][]. This is equivalent to flattening
672 the 2-D array into a 1-D list of actions. */
673
674 a_index = *a_offset;
675 b_index = *b_offset;
676 script = SCRIPT_SUB;
677
678 if (fence_hit && *fence_hit)
679 goto done;
680
681 while (a_index > 0 || b_index > 0) {
682 /* Retrieve the next action to perform. Rows of
683 the traceback array do not necessarily start
684 at offset zero of B, so a correction is needed
685 to point to the correct position */
686
687 next_script =
688 edit_script[a_index][b_index - edit_start_offset[a_index]];
689
690 switch(script) {
691 case SCRIPT_GAP_IN_A:
692 script = next_script & SCRIPT_OP_MASK;
693 if (next_script & SCRIPT_EXTEND_GAP_A)
694 script = SCRIPT_GAP_IN_A;
695 break;
696
697 case SCRIPT_GAP_IN_B:
698 script = next_script & SCRIPT_OP_MASK;
699 if (next_script & SCRIPT_EXTEND_GAP_B)
700 script = SCRIPT_GAP_IN_B;
701 break;
702
703 default:
704 script = next_script & SCRIPT_OP_MASK;
705 break;
706 }
707
708 if (script == SCRIPT_GAP_IN_A) {
709 b_index--;
710 }
711 else if (script == SCRIPT_GAP_IN_B) {
712 a_index--;
713 }
714 else {
715 a_index--;
716 b_index--;
717 }
718 GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
719 }
720
721 done:
722 sfree(edit_start_offset);
723 sfree(edit_script);
724 return best_score;
725 }
726
727 Int4
Blast_SemiGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)728 Blast_SemiGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
729 Int4* a_offset, Int4* b_offset, Boolean score_only,
730 GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
731 const BlastScoringParameters* score_params,
732 Int4 query_offset, Boolean reversed, Boolean reverse_sequence,
733 Boolean * fence_hit)
734 {
735 Int4 i; /* sequence pointers and indices */
736 Int4 a_index;
737 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
738 const Uint1* b_ptr;
739
740 BlastGapDP* score_array;
741
742 Int4 gap_open; /* alignment penalty variables */
743 Int4 gap_extend;
744 Int4 gap_open_extend;
745 Int4 x_dropoff;
746
747 Int4** matrix = NULL; /* pointers to the score matrix */
748 Int4** pssm = NULL;
749 Int4* matrix_row = NULL;
750
751 Int4 score; /* score tracking variables */
752 Int4 score_gap_row;
753 Int4 score_gap_col;
754 Int4 next_score;
755 Int4 best_score;
756 Int4 num_extra_cells;
757
758 if (!score_only) {
759 return ALIGN_EX(A, B, M, N, a_offset, b_offset, edit_block, gap_align,
760 score_params, query_offset, reversed, reverse_sequence,
761 fence_hit);
762 }
763
764 /* do initialization and sanity-checking */
765
766 matrix = gap_align->sbp->matrix->data;
767 if (gap_align->positionBased) {
768 pssm = gap_align->sbp->psi_matrix->pssm->data;
769 }
770 *a_offset = 0;
771 *b_offset = 0;
772 gap_open = score_params->gap_open;
773 gap_extend = score_params->gap_extend;
774 gap_open_extend = gap_open + gap_extend;
775 x_dropoff = gap_align->gap_x_dropoff;
776
777 if (x_dropoff < gap_open_extend)
778 x_dropoff = gap_open_extend;
779
780 if(N <= 0 || M <= 0)
781 return 0;
782
783 /* Allocate and fill in the auxiliary bookeeping structures.
784 Since A and B could be very large, maintain a window
785 of auxiliary structures only large enough to contain to current
786 set of DP computations. The initial window size is determined
787 by the number of cells needed to fail the x-dropoff test */
788
789 if (gap_extend > 0)
790 num_extra_cells = x_dropoff / gap_extend + 3;
791 else
792 num_extra_cells = N + 3;
793
794 if (num_extra_cells > gap_align->dp_mem_alloc) {
795 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
796 2 * gap_align->dp_mem_alloc);
797 sfree(gap_align->dp_mem);
798 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
799 sizeof(BlastGapDP));
800 }
801
802 score_array = gap_align->dp_mem;
803 score = -gap_open_extend;
804 score_array[0].best = 0;
805 score_array[0].best_gap = -gap_open_extend;
806
807 for (i = 1; i <= N; i++) {
808 if (score < -x_dropoff)
809 break;
810
811 score_array[i].best = score;
812 score_array[i].best_gap = score - gap_open_extend;
813 score -= gap_extend;
814 }
815
816 /* The inner loop below examines letters of B from
817 index 'first_b_index' to 'b_size' */
818
819 b_size = i;
820 best_score = 0;
821 first_b_index = 0;
822 if (reverse_sequence)
823 b_increment = -1;
824 else
825 b_increment = 1;
826
827 for (a_index = 1; a_index <= M; a_index++) {
828 /* pick out the row of the score matrix
829 appropriate for A[a_index] */
830
831 if (!(gap_align->positionBased)) {
832 if(reverse_sequence)
833 matrix_row = matrix[ A[ M - a_index ] ];
834 else
835 matrix_row = matrix[ A[ a_index ] ];
836 }
837 else {
838 if(reversed || reverse_sequence)
839 matrix_row = pssm[M - a_index];
840 else
841 matrix_row = pssm[a_index + query_offset];
842 }
843
844 if(reverse_sequence)
845 b_ptr = &B[N - first_b_index];
846 else
847 b_ptr = &B[first_b_index];
848
849 /* initialize running-score variables */
850 score = MININT;
851 score_gap_row = MININT;
852 last_b_index = first_b_index;
853
854 for (b_index = first_b_index; b_index < b_size; b_index++) {
855
856 b_ptr += b_increment;
857 score_gap_col = score_array[b_index].best_gap;
858 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
859
860 if (score < score_gap_col)
861 score = score_gap_col;
862
863 if (score < score_gap_row)
864 score = score_gap_row;
865
866 if (best_score - score > x_dropoff) {
867
868 /* the current best score failed the X-dropoff
869 criterion. Note that this does not stop the
870 inner loop, only forces future iterations to
871 skip this column of B.
872
873 Also, if the very first letter of B that was
874 tested failed the X dropoff criterion, make
875 sure future inner loops start one letter to
876 the right */
877
878 if (b_index == first_b_index)
879 first_b_index++;
880 else
881 score_array[b_index].best = MININT;
882 }
883 else {
884 last_b_index = b_index;
885 if (score > best_score) {
886 best_score = score;
887 *a_offset = a_index;
888 *b_offset = b_index;
889 }
890
891 /* If starting a gap at this position will improve
892 the best row, or column, score, update them to
893 reflect that. */
894
895 score_gap_row -= gap_extend;
896 score_gap_col -= gap_extend;
897 score_array[b_index].best_gap = MAX(score - gap_open_extend,
898 score_gap_col);
899 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
900 score_array[b_index].best = score;
901 }
902
903 score = next_score;
904 }
905
906 /* Finish aligning if the best scores for all positions
907 of B will fail the X-dropoff test, i.e. the inner loop
908 bounds have converged to each other */
909
910 if (first_b_index == b_size)
911 break;
912
913 /* enlarge the window for score data if necessary */
914
915 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
916
917 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
918 2 * gap_align->dp_mem_alloc);
919 score_array = (BlastGapDP *)realloc(score_array,
920 gap_align->dp_mem_alloc *
921 sizeof(BlastGapDP));
922 gap_align->dp_mem = score_array;
923 }
924
925 if (last_b_index < b_size - 1) {
926 /* This row failed the X-dropoff test earlier than
927 the last row did; just shorten the loop bounds
928 before doing the next row */
929
930 b_size = last_b_index + 1;
931 }
932 else {
933 /* The inner loop finished without failing the X-dropoff
934 test; initialize extra bookkeeping structures until
935 the X dropoff test fails or we run out of letters in B.
936 The next inner loop will have larger bounds */
937
938 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
939 score_array[b_size].best = score_gap_row;
940 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
941 score_gap_row -= gap_extend;
942 b_size++;
943 }
944 }
945
946 if (b_size <= N) {
947 score_array[b_size].best = MININT;
948 score_array[b_size].best_gap = MININT;
949 b_size++;
950 }
951 }
952
953 return best_score;
954 }
955
956 /** For restricted gapped alignment, gaps may only start once in
957 this many sequence offsets */
958 #define RESTRICT_SIZE 10
959
960 /** Low level function to perform score-only gapped extension
961 * in one direction. Gaps in one sequence or the other are only
962 * allowed to start at sequence offsets that are a multiple of
963 * RESTRICT_SIZE from the alignment start point. For more details see
964 * <PRE>
965 * Michael Cameron, Hugh E. Williams, Adam Cannane, "Improved
966 * Gapped Alignment in BLAST". IEEE/ACM Transactions on Computational
967 * Biology and Bioinformatics, vol 1 #3 (2004) pp 116-129
968 * </PRE>
969 * @param A The query sequence [in]
970 * @param B The subject sequence [in]
971 * @param M Maximal extension length in query [in]
972 * @param N Maximal extension length in subject [in]
973 * @param a_offset Resulting starting offset in query [out]
974 * @param b_offset Resulting starting offset in subject [out]
975 * @param gap_align Structure holding various information and allocated
976 * memory for the gapped alignment [in]
977 * @param score_params Parameters related to scoring [in]
978 * @param query_offset The starting offset in query [in]
979 * @param reverse_sequence Do reverse the sequence [in]
980 * @return The best alignment score found.
981 */
982 static Int4
s_RestrictedGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reverse_sequence)983 s_RestrictedGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
984 Int4* a_offset, Int4* b_offset,
985 BlastGapAlignStruct* gap_align,
986 const BlastScoringParameters* score_params,
987 Int4 query_offset, Boolean reverse_sequence)
988 {
989 /* see Blast_SemiGappedAlign for general details */
990
991 Int4 i;
992 Int4 a_index;
993 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
994 const Uint1* b_ptr;
995 Int4 b_gap;
996
997 BlastGapDP* score_array;
998
999 Int4 gap_open;
1000 Int4 gap_extend;
1001 Int4 gap_open_extend;
1002 Int4 x_dropoff;
1003
1004 Int4** matrix = NULL;
1005 Int4** pssm = NULL;
1006 Int4* matrix_row = NULL;
1007
1008 Int4 score;
1009 Int4 score_gap_row;
1010 Int4 score_gap_col;
1011 Int4 next_score;
1012 Int4 best_score;
1013 Int4 num_extra_cells;
1014
1015 matrix = gap_align->sbp->matrix->data;
1016 if (gap_align->positionBased) {
1017 pssm = gap_align->sbp->psi_matrix->pssm->data;
1018 }
1019 *a_offset = 0;
1020 *b_offset = 0;
1021 gap_open = score_params->gap_open;
1022 gap_extend = score_params->gap_extend;
1023 gap_open_extend = gap_open + gap_extend;
1024 x_dropoff = gap_align->gap_x_dropoff;
1025
1026 if (x_dropoff < gap_open_extend)
1027 x_dropoff = gap_open_extend;
1028
1029 if(N <= 0 || M <= 0)
1030 return 0;
1031
1032 if (gap_extend > 0)
1033 num_extra_cells = x_dropoff / gap_extend + 3;
1034 else
1035 num_extra_cells = N + 3;
1036
1037 if (num_extra_cells > gap_align->dp_mem_alloc) {
1038 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1039 2 * gap_align->dp_mem_alloc);
1040 sfree(gap_align->dp_mem);
1041 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1042 sizeof(BlastGapDP));
1043 }
1044
1045 score_array = gap_align->dp_mem;
1046 score = -gap_open_extend;
1047 score_array[0].best = 0;
1048 score_array[0].best_gap = -gap_open_extend;
1049
1050 for (i = 1; i <= N; i++) {
1051 if (score < -x_dropoff)
1052 break;
1053
1054 score_array[i].best = score;
1055 score_array[i].best_gap = score - gap_open_extend;
1056 score -= gap_extend;
1057 }
1058
1059 b_size = i;
1060 best_score = 0;
1061 first_b_index = 0;
1062 b_gap = 0;
1063 if (reverse_sequence)
1064 b_increment = -1;
1065 else
1066 b_increment = 1;
1067
1068 for (a_index = 1; a_index <= M; a_index++) {
1069
1070 if (!(gap_align->positionBased)) {
1071 if(reverse_sequence)
1072 matrix_row = matrix[ A[ M - a_index ] ];
1073 else
1074 matrix_row = matrix[ A[ a_index ] ];
1075 }
1076 else {
1077 if(reverse_sequence)
1078 matrix_row = pssm[M - a_index];
1079 else
1080 matrix_row = pssm[a_index + query_offset];
1081 }
1082
1083 if(reverse_sequence)
1084 b_ptr = &B[N - first_b_index];
1085 else
1086 b_ptr = &B[first_b_index];
1087
1088 score = MININT;
1089 score_gap_row = MININT;
1090 last_b_index = first_b_index;
1091
1092 /* The double loop that computes the alignment is essentially
1093 unchanged from that of Blast_SemiGappedAlign. The only
1094 real difference is that a gap in A or B is not allowed to
1095 start unless the offset of the gap is divisible by
1096 RESTRICT_SIZE. This allows the ordinary dynamic programming
1097 recurrence relations to be simplified */
1098
1099 if (a_index % RESTRICT_SIZE != 0) {
1100
1101 /* a gap will never start in A; do not bother checking
1102 or updating score_gap_row */
1103
1104 for (b_index = first_b_index; b_index < b_size; b_index++) {
1105
1106 b_ptr += b_increment;
1107 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1108
1109 if (b_index != b_gap) {
1110
1111 /* the majority of cases fall here; a gap
1112 may not start in either A or B */
1113
1114 if (best_score - score > x_dropoff) {
1115 score_array[b_index].best = MININT;
1116 if (b_index == first_b_index)
1117 first_b_index++;
1118 }
1119 else {
1120 last_b_index = b_index;
1121 if (score > best_score) {
1122 best_score = score;
1123 *a_offset = a_index;
1124 *b_offset = b_index;
1125 }
1126 score_array[b_index].best = score;
1127 }
1128 }
1129 else {
1130
1131 /* a gap may start in B. Update b_gap, the
1132 offset when this will next happen, and
1133 compute the two-term recurrence */
1134
1135 b_gap += RESTRICT_SIZE;
1136 score_gap_col = score_array[b_index].best_gap;
1137
1138 if (score < score_gap_col)
1139 score = score_gap_col;
1140
1141 if (best_score - score > x_dropoff) {
1142 score_array[b_index].best = MININT;
1143 if (b_index == first_b_index)
1144 first_b_index++;
1145 }
1146 else {
1147 last_b_index = b_index;
1148 if (score > best_score) {
1149 best_score = score;
1150 *a_offset = a_index;
1151 *b_offset = b_index;
1152 }
1153
1154 score_gap_col -= gap_extend;
1155 score_array[b_index].best_gap =
1156 MAX(score - gap_open_extend, score_gap_col);
1157 score_array[b_index].best = score;
1158 }
1159 }
1160 score = next_score;
1161 }
1162
1163 score_gap_row = score;
1164 }
1165 else {
1166
1167 /* gap may start in A */
1168
1169 for (b_index = first_b_index; b_index < b_size; b_index++) {
1170
1171 b_ptr += b_increment;
1172 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1173
1174 if (b_index != b_gap) {
1175
1176 /* gap may not start in B. Compute
1177 the resulting two-term recurrence */
1178
1179 if (score < score_gap_row)
1180 score = score_gap_row;
1181
1182 if (best_score - score > x_dropoff) {
1183 score_array[b_index].best = MININT;
1184 if (b_index == first_b_index)
1185 first_b_index++;
1186 }
1187 else {
1188 last_b_index = b_index;
1189 if (score > best_score) {
1190 best_score = score;
1191 *a_offset = a_index;
1192 *b_offset = b_index;
1193 }
1194
1195 score_gap_row -= gap_extend;
1196 score_gap_row = MAX(score - gap_open_extend,
1197 score_gap_row);
1198 score_array[b_index].best = score;
1199 }
1200 }
1201 else {
1202
1203 /* the ordinary case: a gap may start in A
1204 or B and the full three-term recurrence
1205 must be computed. This happens once every
1206 RESTRICT_SIZE*RESTRICT_SIZE cells */
1207
1208 b_gap += RESTRICT_SIZE;
1209 score_gap_col = score_array[b_index].best_gap;
1210
1211 if (score < score_gap_col)
1212 score = score_gap_col;
1213 if (score < score_gap_row)
1214 score = score_gap_row;
1215
1216 if (best_score - score > x_dropoff) {
1217 score_array[b_index].best = MININT;
1218 if (b_index == first_b_index)
1219 first_b_index++;
1220 }
1221 else {
1222 last_b_index = b_index;
1223 if (score > best_score) {
1224 best_score = score;
1225 *a_offset = a_index;
1226 *b_offset = b_index;
1227 }
1228
1229 score_gap_row -= gap_extend;
1230 score_gap_col -= gap_extend;
1231 score_array[b_index].best_gap =
1232 MAX(score - gap_open_extend, score_gap_col);
1233 score_gap_row = MAX(score - gap_open_extend,
1234 score_gap_row);
1235 score_array[b_index].best = score;
1236 }
1237 }
1238 score = next_score;
1239 }
1240 }
1241
1242 if (first_b_index == b_size)
1243 break;
1244
1245 /* compute the first offset of the next row where a
1246 gap in B may start */
1247
1248 b_index = first_b_index % RESTRICT_SIZE;
1249 b_gap = first_b_index;
1250 if (b_index > 0)
1251 b_gap += RESTRICT_SIZE - b_index;
1252
1253 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
1254
1255 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1256 2 * gap_align->dp_mem_alloc);
1257 score_array = (BlastGapDP *)realloc(score_array,
1258 gap_align->dp_mem_alloc *
1259 sizeof(BlastGapDP));
1260 gap_align->dp_mem = score_array;
1261 }
1262
1263 if (last_b_index < b_size - 1) {
1264 b_size = last_b_index + 1;
1265 }
1266 else {
1267 /* The inner loop finished without failing the X-dropoff
1268 test; initialize extra bookkeeping structures until
1269 the X dropoff test fails or we run out of letters in B.
1270 The next inner loop will have larger bounds.
1271
1272 Note that if gaps in A are not allowed for this row, then
1273 score_gap_row is -infinity and the loop below will not
1274 extend the region to explore */
1275
1276 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
1277 score_array[b_size].best = score_gap_row;
1278 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
1279 score_gap_row -= gap_extend;
1280 b_size++;
1281 }
1282 }
1283
1284 if (b_size <= N) {
1285 score_array[b_size].best = MININT;
1286 score_array[b_size].best_gap = MININT;
1287 b_size++;
1288 }
1289 }
1290
1291 return best_score;
1292 }
1293
1294 /** Editing script operations for out-of-frame traceback */
1295 enum {
1296 SCRIPT_AHEAD_ONE_FRAME = 1, /**< Shift 1 frame in sequence A
1297 (gap 2 nucleotides) */
1298 SCRIPT_AHEAD_TWO_FRAMES = 2, /**< Shift 2 frames in sequence A
1299 (gap 1 nucleotide) */
1300 SCRIPT_NEXT_IN_FRAME = 3, /**< Shift to next base (substitution) */
1301 SCRIPT_NEXT_PLUS_ONE_FRAME = 4, /**< Shift to next base plus 1 frame
1302 (gap 1 nucleotide in sequence B) */
1303 SCRIPT_NEXT_PLUS_TWO_FRAMES = 5, /**< Shift to next base plus 2 frames
1304 (gap 2 nucleotides in sequence B) */
1305 SCRIPT_OOF_OPEN_GAP = 0x08 /**< Opening a gap */
1306 };
1307
1308 /** Low level function to perform gapped extension with out-of-frame
1309 * gapping with traceback.
1310 * @param A The query sequence [in]
1311 * @param B The subject sequence [in]
1312 * @param M Maximal extension length in query [in]
1313 * @param N Maximal extension length in subject [in]
1314 * @param a_offset Resulting starting offset in query [out]
1315 * @param b_offset Resulting starting offset in subject [out]
1316 * @param edit_block Structure to hold generated traceback [out]
1317 * @param gap_align Structure holding various information and allocated
1318 * memory for the gapped alignment [in]
1319 * @param score_params Parameters related to scoring [in]
1320 * @param query_offset The starting offset in query [in]
1321 * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1322 * @return The best alignment score found.
1323 */
1324 static Int4
s_OutOfFrameAlignWithTraceback(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1325 s_OutOfFrameAlignWithTraceback(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1326 Int4* a_offset, Int4* b_offset, GapPrelimEditBlock *edit_block,
1327 BlastGapAlignStruct* gap_align, const BlastScoringParameters* score_params,
1328 Int4 query_offset, Boolean reversed)
1329 {
1330 Int4 i, increment; /* sequence pointers and indices */
1331 Int4 a_index;
1332 Int4 b_index, b_size, first_b_index, last_b_index;
1333
1334 BlastGapDP* score_array;
1335
1336 Int4 gap_open; /* alignment penalty variables */
1337 Int4 gap_extend;
1338 Int4 gap_open_extend;
1339 Int4 shift_penalty;
1340 Int4 x_dropoff;
1341
1342 Int4** matrix = NULL; /* pointers to the score matrix */
1343 Int4** pssm = NULL;
1344 Int4* matrix_row = NULL;
1345
1346 Int4 score; /* score tracking variables */
1347 Int4 score_row1;
1348 Int4 score_row2;
1349 Int4 score_row3;
1350 Int4 score_gap_col;
1351 Int4 score_col1;
1352 Int4 score_col2;
1353 Int4 score_col3;
1354 Int4 score_other_frame1;
1355 Int4 score_other_frame2;
1356 Int4 best_score;
1357
1358 GapStateArrayStruct* state_struct;
1359 Uint1** edit_script;
1360 Uint1* edit_script_row;
1361 Int4 *edit_start_offset;
1362 Int4 edit_script_num_rows;
1363 Int4 orig_b_index;
1364 Int1 script, next_script;
1365 Int4 num_extra_cells;
1366
1367 /* do initialization and sanity-checking */
1368
1369 matrix = gap_align->sbp->matrix->data;
1370 if (gap_align->positionBased) {
1371 pssm = gap_align->sbp->psi_matrix->pssm->data;
1372 }
1373 *a_offset = 0;
1374 *b_offset = -2;
1375 gap_open = score_params->gap_open;
1376 gap_extend = score_params->gap_extend;
1377 gap_open_extend = gap_open + gap_extend;
1378 shift_penalty = score_params->shift_pen;
1379 x_dropoff = gap_align->gap_x_dropoff;
1380
1381 if (x_dropoff < gap_open_extend)
1382 x_dropoff = gap_open_extend;
1383
1384 if(N <= 0 || M <= 0)
1385 return 0;
1386
1387 /* Initialize traceback information. edit_script[] is
1388 a 2-D array which is filled in row by row as the
1389 dynamic programming takes place */
1390
1391 s_GapPurgeState(gap_align->state_struct);
1392
1393 /* Conceptually, traceback requires a byte array of size
1394 MxN. To save on memory, each row of the array is allocated
1395 separately. edit_script[i] points to the storage reserved
1396 for row i, and edit_start_offset[i] gives the offset into
1397 the B sequence corresponding to element 0 of edit_script[i]
1398
1399 Also make the number of edit script rows grow dynamically */
1400
1401 edit_script_num_rows = 100;
1402 edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
1403 edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
1404
1405 /* allocate storage for the first row of the traceback
1406 array. Because row elements correspond to gaps in A,
1407 the alignment can only go at most x_dropoff/gap_extend
1408 positions, in all three frames, before failing the
1409 X dropoff criterion */
1410
1411 if (gap_extend > 0)
1412 num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
1413 else
1414 num_extra_cells = N + 5;
1415
1416 if (num_extra_cells > gap_align->dp_mem_alloc) {
1417 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1418 2 * gap_align->dp_mem_alloc);
1419 sfree(gap_align->dp_mem);
1420 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1421 sizeof(BlastGapDP));
1422 }
1423
1424 state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
1425
1426 edit_script[0] = state_struct->state_array;
1427 edit_start_offset[0] = 0;
1428 edit_script_row = state_struct->state_array;
1429
1430 score_array = gap_align->dp_mem;
1431 score = -gap_open_extend;
1432 score_array[0].best = 0;
1433 score_array[0].best_gap = -gap_open_extend;
1434
1435 for (i = 3; i <= N + 2; i += 3) {
1436 score_array[i].best = score;
1437 score_array[i].best_gap = score - gap_open_extend;
1438 edit_script_row[i] = SCRIPT_GAP_IN_B;
1439
1440 score_array[i-1].best = MININT;
1441 score_array[i-1].best_gap = MININT;
1442 edit_script_row[i-1] = SCRIPT_GAP_IN_B;
1443
1444 score_array[i-2].best = MININT;
1445 score_array[i-2].best_gap = MININT;
1446 edit_script_row[i-2] = SCRIPT_GAP_IN_B;
1447
1448 if (score < -x_dropoff)
1449 break;
1450 score -= gap_extend;
1451 }
1452
1453 /* The inner loop below examines letters of B from
1454 index 'first_b_index' to 'b_size' */
1455
1456 score_array[i].best = MININT;
1457 score_array[i].best_gap = MININT;
1458 b_size = i - 2;
1459 state_struct->used = b_size + 1;
1460
1461 best_score = 0;
1462 first_b_index = 0;
1463 if (reversed) {
1464 increment = -1;
1465 }
1466 else {
1467 /* Allow for a backwards frame shift */
1468 B -= 2;
1469 increment = 1;
1470 }
1471
1472 for (a_index = 1; a_index <= M; a_index++) {
1473
1474 /* Set up the next row of the edit script; this involves
1475 allocating memory for the row, then pointing to it.
1476 It is not necessary to allocate space for offsets less
1477 than first_b_index (since the inner loop will never
1478 look at them).
1479
1480 It is unknown at this point how far to the right the
1481 current traceback row will extend; all that's known for
1482 sure is that the previous row fails the X-dropoff test
1483 after b_size cells, and that the current row can go at
1484 most num_extra_cells beyond that before failing the test */
1485
1486 if (gap_extend > 0)
1487 state_struct = s_GapGetState(&gap_align->state_struct,
1488 b_size - first_b_index + num_extra_cells);
1489 else
1490 state_struct = s_GapGetState(&gap_align->state_struct,
1491 N + 5 - first_b_index);
1492
1493 if (a_index == edit_script_num_rows) {
1494 edit_script_num_rows = edit_script_num_rows * 2;
1495 edit_script = (Uint1 **)realloc(edit_script,
1496 edit_script_num_rows *
1497 sizeof(Uint1 *));
1498 edit_start_offset = (Int4 *)realloc(edit_start_offset,
1499 edit_script_num_rows *
1500 sizeof(Int4));
1501 }
1502
1503 edit_script[a_index] = state_struct->state_array +
1504 state_struct->used + 1;
1505 edit_start_offset[a_index] = first_b_index;
1506
1507 /* the inner loop assumes the current traceback
1508 row begins at offset zero of B */
1509
1510 edit_script_row = edit_script[a_index] - first_b_index;
1511 orig_b_index = first_b_index;
1512
1513 if (!(gap_align->positionBased)) {
1514 matrix_row = matrix[ A[ a_index * increment ] ];
1515 }
1516 else {
1517 if(reversed)
1518 matrix_row = pssm[M - a_index];
1519 else
1520 matrix_row = pssm[a_index + query_offset];
1521 }
1522
1523 score = MININT;
1524 score_row1 = MININT;
1525 score_row2 = MININT;
1526 score_row3 = MININT;
1527 score_gap_col = MININT;
1528 score_col1 = MININT;
1529 score_col2 = MININT;
1530 score_col3 = MININT;
1531 score_other_frame1 = MININT;
1532 score_other_frame2 = MININT;
1533 last_b_index = first_b_index;
1534 b_index = first_b_index;
1535
1536 /* The inner loop is identical to that of OOF_SEMI_G_ALIGN,
1537 except that traceback operations are sprinkled throughout. */
1538
1539 while (b_index < b_size) {
1540
1541 /* FRAME 0 */
1542
1543 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1544 score = MAX(score, score_col1);
1545 if (score == score_col1) {
1546 script = SCRIPT_NEXT_IN_FRAME;
1547 }
1548 else if (score + shift_penalty == score_other_frame1) {
1549 if (score_other_frame1 == score_col2)
1550 script = SCRIPT_AHEAD_TWO_FRAMES;
1551 else
1552 script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1553 }
1554 else {
1555 if (score_other_frame2 == score_col3)
1556 script = SCRIPT_AHEAD_ONE_FRAME;
1557 else
1558 script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1559 }
1560 score += matrix_row[ B[ b_index * increment ] ];
1561
1562 score_other_frame1 = MAX(score_col1, score_array[b_index].best);
1563 score_col1 = score_array[b_index].best;
1564 score_gap_col = score_array[b_index].best_gap;
1565
1566 if (score < MAX(score_gap_col, score_row1)) {
1567 if (score_gap_col > score_row1) {
1568 score = score_gap_col;
1569 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1570 }
1571 else {
1572 score = score_row1;
1573 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1574 }
1575
1576 if (best_score - score > x_dropoff) {
1577 if (first_b_index == b_index)
1578 first_b_index = b_index + 1;
1579 else
1580 score_array[b_index].best = MININT;
1581 }
1582 else {
1583 last_b_index = b_index;
1584 score_array[b_index].best = score;
1585 score_array[b_index].best_gap = score_gap_col - gap_extend;
1586 score_row1 -= gap_extend;
1587 }
1588 }
1589 else {
1590 if (best_score - score > x_dropoff) {
1591 if (first_b_index == b_index)
1592 first_b_index = b_index + 1;
1593 else
1594 score_array[b_index].best = MININT;
1595 }
1596 else {
1597 last_b_index = b_index;
1598 score_array[b_index].best = score;
1599 if (score > best_score) {
1600 best_score = score;
1601 *a_offset = a_index;
1602 *b_offset = b_index;
1603 }
1604
1605 score -= gap_open_extend;
1606 score_row1 -= gap_extend;
1607 if (score > score_row1)
1608 score_row1 = score;
1609 else
1610 script |= SCRIPT_EXTEND_GAP_B;
1611
1612 score_gap_col -= gap_extend;
1613 if (score < score_gap_col) {
1614 score_array[b_index].best_gap = score_gap_col;
1615 script |= SCRIPT_EXTEND_GAP_A;
1616 }
1617 else {
1618 score_array[b_index].best_gap = score;
1619 }
1620 }
1621 }
1622
1623 edit_script_row[b_index++] = script;
1624 if (b_index >= b_size) {
1625 score = score_row1;
1626 score_row1 = score_row2;
1627 score_row2 = score_row3;
1628 score_row3 = score;
1629 break;
1630 }
1631
1632 /* FRAME 1 */
1633
1634 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1635 score = MAX(score, score_col2);
1636 if (score == score_col2) {
1637 script = SCRIPT_NEXT_IN_FRAME;
1638 }
1639 else if (score + shift_penalty == score_other_frame1) {
1640 if (score_other_frame1 == score_col1)
1641 script = SCRIPT_AHEAD_ONE_FRAME;
1642 else
1643 script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1644 }
1645 else {
1646 if (score_other_frame2 == score_col3)
1647 script = SCRIPT_AHEAD_TWO_FRAMES;
1648 else
1649 script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1650 }
1651 score += matrix_row[ B[ b_index * increment ] ];
1652 score_other_frame2 = MAX(score_col2, score_array[b_index].best);
1653 score_col2 = score_array[b_index].best;
1654 score_gap_col = score_array[b_index].best_gap;
1655
1656 if (score < MAX(score_gap_col, score_row2)) {
1657 score = MAX(score_gap_col, score_row2);
1658 if (best_score - score > x_dropoff) {
1659 if (first_b_index == b_index)
1660 first_b_index = b_index + 1;
1661 else
1662 score_array[b_index].best = MININT;
1663 }
1664 else {
1665 if (score == score_gap_col)
1666 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1667 else
1668 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1669
1670 last_b_index = b_index;
1671 score_array[b_index].best = score;
1672 score_array[b_index].best_gap = score_gap_col - gap_extend;
1673 score_row2 -= gap_extend;
1674 }
1675 }
1676 else {
1677 if (best_score - score > x_dropoff) {
1678 if (first_b_index == b_index)
1679 first_b_index = b_index + 1;
1680 else
1681 score_array[b_index].best = MININT;
1682 }
1683 else {
1684 last_b_index = b_index;
1685 score_array[b_index].best = score;
1686 if (score > best_score) {
1687 best_score = score;
1688 *a_offset = a_index;
1689 *b_offset = b_index;
1690 }
1691 score -= gap_open_extend;
1692 score_row2 -= gap_extend;
1693 if (score > score_row2)
1694 score_row2 = score;
1695 else
1696 script |= SCRIPT_EXTEND_GAP_B;
1697
1698 score_gap_col -= gap_extend;
1699 if (score < score_gap_col) {
1700 score_array[b_index].best_gap = score_gap_col;
1701 script |= SCRIPT_EXTEND_GAP_A;
1702 }
1703 else {
1704 score_array[b_index].best_gap = score;
1705 }
1706 }
1707 }
1708
1709 edit_script_row[b_index++] = script;
1710 if (b_index >= b_size) {
1711 score = score_row2;
1712 score_row2 = score_row1;
1713 score_row1 = score_row3;
1714 score_row3 = score;
1715 break;
1716 }
1717
1718 /* FRAME 2 */
1719
1720 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1721 score = MAX(score, score_col3);
1722 if (score == score_col3) {
1723 script = SCRIPT_NEXT_IN_FRAME;
1724 }
1725 else if (score + shift_penalty == score_other_frame1) {
1726 if (score_other_frame1 == score_col1)
1727 script = SCRIPT_AHEAD_TWO_FRAMES;
1728 else
1729 script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1730 }
1731 else {
1732 if (score_other_frame2 == score_col2)
1733 script = SCRIPT_AHEAD_ONE_FRAME;
1734 else
1735 script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1736 }
1737 score += matrix_row[ B[ b_index * increment ] ];
1738 score_other_frame1 = score_other_frame2;
1739 score_other_frame2 = MAX(score_col3, score_array[b_index].best);
1740 score_col3 = score_array[b_index].best;
1741 score_gap_col = score_array[b_index].best_gap;
1742
1743 if (score < MAX(score_gap_col, score_row3)) {
1744 score = MAX(score_gap_col, score_row3);
1745 if (best_score - score > x_dropoff) {
1746 if (first_b_index == b_index)
1747 first_b_index = b_index + 1;
1748 else
1749 score_array[b_index].best = MININT;
1750 }
1751 else {
1752 if (score == score_gap_col)
1753 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1754 else
1755 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1756
1757 last_b_index = b_index;
1758 score_array[b_index].best = score;
1759 score_array[b_index].best_gap = score_gap_col - gap_extend;
1760 score_row3 -= gap_extend;
1761 }
1762 }
1763 else {
1764 if (best_score - score > x_dropoff) {
1765 if (first_b_index == b_index)
1766 first_b_index = b_index + 1;
1767 else
1768 score_array[b_index].best = MININT;
1769 }
1770 else {
1771 last_b_index = b_index;
1772 score_array[b_index].best = score;
1773 if (score > best_score) {
1774 best_score = score;
1775 *a_offset = a_index;
1776 *b_offset = b_index;
1777 }
1778 score -= gap_open_extend;
1779 score_row3 -= gap_extend;
1780 if (score > score_row3)
1781 score_row3 = score;
1782 else
1783 script |= SCRIPT_EXTEND_GAP_B;
1784
1785 score_gap_col -= gap_extend;
1786 if (score < score_gap_col) {
1787 score_array[b_index].best_gap = score_gap_col;
1788 script |= SCRIPT_EXTEND_GAP_A;
1789 }
1790 else {
1791 score_array[b_index].best_gap = score;
1792 }
1793 }
1794 }
1795 edit_script_row[b_index++] = script;
1796 }
1797
1798 /* Finish aligning if the best scores for all positions
1799 of B will fail the X-dropoff test, i.e. the inner loop
1800 bounds have converged to each other */
1801
1802 if (first_b_index == b_size)
1803 break;
1804
1805 /* Enlarge the window for score data if necessary */
1806
1807 if (last_b_index + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
1808
1809 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1810 2 * gap_align->dp_mem_alloc);
1811 score_array = (BlastGapDP *)realloc(score_array,
1812 gap_align->dp_mem_alloc *
1813 sizeof(BlastGapDP));
1814 gap_align->dp_mem = score_array;
1815 }
1816
1817 if (last_b_index < b_size - 1) {
1818 /* This row failed the X-dropoff test earlier than
1819 the last row did; just shorten the loop bounds
1820 before doing the next row */
1821
1822 b_size = last_b_index + 1;
1823 }
1824 else {
1825 /* The inner loop finished without failing the X-dropoff
1826 test; initialize extra bookkeeping structures until
1827 the X dropoff test fails or we run out of letters in B.
1828 The next inner loop will have larger bounds.
1829
1830 Keep initializing extra structures until the X dropoff
1831 test fails in all frames for this row */
1832
1833 score = MAX(score_row1, score_row2);
1834 score = MAX(score, score_row3);
1835 while (score >= (best_score - x_dropoff) && b_size < N + 1) {
1836
1837 score_array[b_size].best = score_row1;
1838 score_array[b_size].best_gap = score_row1 - gap_open_extend;
1839 score_row1 -= gap_extend;
1840 edit_script_row[b_size] = SCRIPT_OOF_OPEN_GAP |
1841 SCRIPT_GAP_IN_B;
1842
1843 score_array[b_size+1].best = score_row2;
1844 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
1845 score_row2 -= gap_extend;
1846 edit_script_row[b_size+1] = SCRIPT_OOF_OPEN_GAP |
1847 SCRIPT_GAP_IN_B;
1848
1849 score_array[b_size+2].best = score_row3;
1850 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
1851 score_row3 -= gap_extend;
1852 edit_script_row[b_size+2] = SCRIPT_OOF_OPEN_GAP |
1853 SCRIPT_GAP_IN_B;
1854 b_size += 3;
1855 score -= gap_extend;
1856 }
1857 }
1858
1859 /* update the memory allocator to reflect the exact number
1860 of traceback cells this row needed */
1861
1862 b_size = MIN(b_size, N + 1);
1863 state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
1864
1865 /* chop off the best score in each frame */
1866
1867 last_b_index = MIN(b_size + 4, N + 3);
1868 while (b_size < last_b_index) {
1869 score_array[b_size].best = MININT;
1870 score_array[b_size].best_gap = MININT;
1871 b_size++;
1872 }
1873 }
1874
1875 a_index = *a_offset;
1876 b_index = *b_offset;
1877 script = SCRIPT_AHEAD_ONE_FRAME;
1878
1879 while (a_index > 0 || b_index > 0) {
1880 /* Retrieve the next action to perform. Rows of
1881 the traceback array do not necessarily start
1882 at offset zero of B, so a correction is needed
1883 to point to the correct position */
1884
1885 next_script =
1886 edit_script[a_index][b_index - edit_start_offset[a_index]];
1887
1888 switch (script) {
1889 case SCRIPT_GAP_IN_A:
1890 script = next_script & SCRIPT_OP_MASK;
1891 if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_A))
1892 script = SCRIPT_GAP_IN_A;
1893 break;
1894 case SCRIPT_GAP_IN_B:
1895 script = next_script & SCRIPT_OP_MASK;
1896 if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_B))
1897 script = SCRIPT_GAP_IN_B;
1898 break;
1899 default:
1900 script = next_script & SCRIPT_OP_MASK;
1901 break;
1902 }
1903
1904 if (script == SCRIPT_GAP_IN_B) {
1905 b_index -= 3;
1906 }
1907 else {
1908 b_index -= script;
1909 a_index--;
1910 }
1911
1912 GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
1913 }
1914
1915 sfree(edit_start_offset);
1916 sfree(edit_script);
1917
1918 if (!reversed)
1919 *b_offset -= 2;
1920 return best_score;
1921 }
1922
1923 /** Low level function to perform gapped extension with out-of-frame
1924 * gapping with or without traceback.
1925 * @param A The query sequence [in]
1926 * @param B The subject sequence [in]
1927 * @param M Maximal extension length in query [in]
1928 * @param N Maximal extension length in subject [in]
1929 * @param a_offset Resulting starting offset in query [out]
1930 * @param b_offset Resulting starting offset in subject [out]
1931 * @param score_only Only find the score, without saving traceback [in]
1932 * @param edit_block Structure to hold generated traceback [out]
1933 * @param gap_align Structure holding various information and allocated
1934 * memory for the gapped alignment [in]
1935 * @param score_params Parameters related to scoring [in]
1936 * @param query_offset The starting offset in query [in]
1937 * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1938 * @return The best alignment score found.
1939 */
1940 static Int4
s_OutOfFrameGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1941 s_OutOfFrameGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1942 Int4* a_offset, Int4* b_offset, Boolean score_only,
1943 GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
1944 const BlastScoringParameters* score_params,
1945 Int4 query_offset, Boolean reversed)
1946 {
1947 Int4 i, increment; /* sequence pointers and indices */
1948 Int4 a_index;
1949 Int4 b_index, b_size, first_b_index, last_b_index;
1950
1951 Int4 gap_open; /* alignment penalty variables */
1952 Int4 gap_extend;
1953 Int4 gap_open_extend;
1954 Int4 shift_penalty;
1955 Int4 x_dropoff;
1956
1957 BlastGapDP* score_array;
1958 Int4 num_extra_cells;
1959
1960 Int4** matrix = NULL; /* pointers to the score matrix */
1961 Int4** pssm = NULL;
1962 Int4* matrix_row = NULL;
1963
1964 Int4 score; /* score tracking variables */
1965 Int4 score_row1;
1966 Int4 score_row2;
1967 Int4 score_row3;
1968 Int4 score_gap_col;
1969 Int4 score_col1;
1970 Int4 score_col2;
1971 Int4 score_col3;
1972 Int4 score_other_frame1;
1973 Int4 score_other_frame2;
1974 Int4 best_score;
1975
1976 if (!score_only) {
1977 return s_OutOfFrameAlignWithTraceback(A, B, M, N, a_offset, b_offset,
1978 edit_block, gap_align,
1979 score_params, query_offset,
1980 reversed);
1981 }
1982
1983 /* do initialization and sanity-checking */
1984
1985 matrix = gap_align->sbp->matrix->data;
1986 if (gap_align->positionBased) {
1987 pssm = gap_align->sbp->psi_matrix->pssm->data;
1988 }
1989 *a_offset = 0;
1990 *b_offset = -2;
1991 gap_open = score_params->gap_open;
1992 gap_extend = score_params->gap_extend;
1993 gap_open_extend = gap_open + gap_extend;
1994 shift_penalty = score_params->shift_pen;
1995 x_dropoff = gap_align->gap_x_dropoff;
1996
1997 if (x_dropoff < gap_open_extend)
1998 x_dropoff = gap_open_extend;
1999
2000 if(N <= 0 || M <= 0)
2001 return 0;
2002
2003 /* Allocate and fill in the auxiliary bookeeping structures.
2004 Since A and B could be very large, maintain a window
2005 of auxiliary structures only large enough to contain to current
2006 set of DP computations. The initial window size is determined
2007 by the number of cells needed to fail the x-dropoff test */
2008
2009 if (gap_extend > 0)
2010 num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
2011 else
2012 num_extra_cells = N + 5;
2013
2014 if (num_extra_cells > gap_align->dp_mem_alloc) {
2015 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
2016 2 * gap_align->dp_mem_alloc);
2017 sfree(gap_align->dp_mem);
2018 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
2019 sizeof(BlastGapDP));
2020 }
2021
2022 score_array = gap_align->dp_mem;
2023 score = -gap_open_extend;
2024 score_array[0].best = 0;
2025 score_array[0].best_gap = -gap_open_extend;
2026
2027 for (i = 3; i <= N + 2; i += 3) {
2028 score_array[i].best = score;
2029 score_array[i].best_gap = score - gap_open_extend;
2030 score_array[i-1].best = MININT;
2031 score_array[i-1].best_gap = MININT;
2032 score_array[i-2].best = MININT;
2033 score_array[i-2].best_gap = MININT;
2034 if (score < -x_dropoff)
2035 break;
2036 score -= gap_extend;
2037 }
2038
2039 /* The inner loop below examines letters of B from
2040 index 'first_b_index' to 'b_size' */
2041
2042 b_size = i - 2;
2043 score_array[i].best = MININT;
2044 score_array[i].best_gap = MININT;
2045
2046 best_score = 0;
2047 first_b_index = 0;
2048 if (reversed) {
2049 increment = -1;
2050 }
2051 else {
2052 /* Allow for a backwards frame shift */
2053 B -= 2;
2054 increment = 1;
2055 }
2056
2057 for (a_index = 1; a_index <= M; a_index++) {
2058
2059 /* pick out the row of the score matrix
2060 appropriate for A[a_index] */
2061
2062 if (!(gap_align->positionBased)) {
2063 matrix_row = matrix[ A[ a_index * increment ] ];
2064 }
2065 else {
2066 if(reversed)
2067 matrix_row = pssm[M - a_index];
2068 else
2069 matrix_row = pssm[a_index + query_offset];
2070 }
2071
2072 /* initialize running-score variables */
2073 score = MININT;
2074 score_row1 = MININT;
2075 score_row2 = MININT;
2076 score_row3 = MININT;
2077 score_gap_col = MININT;
2078 score_col1 = MININT;
2079 score_col2 = MININT;
2080 score_col3 = MININT;
2081 score_other_frame1 = MININT;
2082 score_other_frame2 = MININT;
2083 last_b_index = first_b_index;
2084 b_index = first_b_index;
2085
2086 while (b_index < b_size) {
2087
2088 /* FRAME 0 */
2089
2090 /* Pick the best score among all frames */
2091 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2092 score = MAX(score, score_col1) +
2093 matrix_row[ B[ b_index * increment ] ];
2094 score_other_frame1 = MAX(score_col1, score_array[b_index].best);
2095 score_col1 = score_array[b_index].best;
2096 score_gap_col = score_array[b_index].best_gap;
2097
2098 /* Use the row and column scores if they improve
2099 the score overall */
2100
2101 if (score < MAX(score_gap_col, score_row1)) {
2102 score = MAX(score_gap_col, score_row1);
2103 if (best_score - score > x_dropoff) {
2104
2105 /* the current best score failed the X-dropoff
2106 criterion. Note that this does not stop the
2107 inner loop, only forces future iterations to
2108 skip this column of B.
2109
2110 Also, if the very first letter of B that was
2111 tested failed the X dropoff criterion, make
2112 sure future inner loops start one letter to
2113 the right */
2114
2115 if (first_b_index == b_index)
2116 first_b_index = b_index + 1;
2117 else
2118 score_array[b_index].best = MININT;
2119 }
2120 else {
2121 /* update the row and column running scores */
2122 last_b_index = b_index;
2123 score_array[b_index].best = score;
2124 score_array[b_index].best_gap = score_gap_col - gap_extend;
2125 score_row1 -= gap_extend;
2126 }
2127 }
2128 else {
2129 if (best_score - score > x_dropoff) {
2130
2131 /* the current best score failed the X-dropoff
2132 criterion. */
2133
2134 if (first_b_index == b_index)
2135 first_b_index = b_index + 1;
2136 else
2137 score_array[b_index].best = MININT;
2138 }
2139 else {
2140 /* The current best score exceeds the
2141 row and column scores, and thus may
2142 improve on the current optimal score */
2143
2144 last_b_index = b_index;
2145 score_array[b_index].best = score;
2146 if (score > best_score) {
2147 best_score = score;
2148 *a_offset = a_index;
2149 *b_offset = b_index;
2150 }
2151
2152 /* compute the best scores that include gaps
2153 or gap extensions */
2154
2155 score -= gap_open_extend;
2156 score_row1 -= gap_extend;
2157 score_row1 = MAX(score, score_row1);
2158 score_array[b_index].best_gap = MAX(score,
2159 score_gap_col - gap_extend);
2160 }
2161 }
2162
2163 /* If this was the last letter of B checked, rotate
2164 the row scores so that code beyond the inner loop
2165 works correctly */
2166
2167 if (++b_index >= b_size) {
2168 score = score_row1;
2169 score_row1 = score_row2;
2170 score_row2 = score_row3;
2171 score_row3 = score;
2172 break;
2173 }
2174
2175 /* FRAME 1 */
2176
2177 /* This code, and that for Frame 2, are essentially the
2178 same as the preceeding code. The only real difference
2179 is the updating of the other_frame best scores */
2180
2181 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2182 score = MAX(score, score_col2) +
2183 matrix_row[ B[ b_index * increment ] ];
2184 score_other_frame2 = MAX(score_col2, score_array[b_index].best);
2185 score_col2 = score_array[b_index].best;
2186 score_gap_col = score_array[b_index].best_gap;
2187
2188 if (score < MAX(score_gap_col, score_row2)) {
2189 score = MAX(score_gap_col, score_row2);
2190 if (best_score - score > x_dropoff) {
2191 if (first_b_index == b_index)
2192 first_b_index = b_index + 1;
2193 else
2194 score_array[b_index].best = MININT;
2195 }
2196 else {
2197 last_b_index = b_index;
2198 score_array[b_index].best = score;
2199 score_array[b_index].best_gap = score_gap_col - gap_extend;
2200 score_row2 -= gap_extend;
2201 }
2202 }
2203 else {
2204 if (best_score - score > x_dropoff) {
2205 if (first_b_index == b_index)
2206 first_b_index = b_index + 1;
2207 else
2208 score_array[b_index].best = MININT;
2209 }
2210 else {
2211 last_b_index = b_index;
2212 score_array[b_index].best = score;
2213 if (score > best_score) {
2214 best_score = score;
2215 *a_offset = a_index;
2216 *b_offset = b_index;
2217 }
2218 score -= gap_open_extend;
2219 score_row2 -= gap_extend;
2220 score_row2 = MAX(score, score_row2);
2221 score_array[b_index].best_gap = MAX(score,
2222 score_gap_col - gap_extend);
2223 }
2224 }
2225
2226 if (++b_index >= b_size) {
2227 score = score_row2;
2228 score_row2 = score_row1;
2229 score_row1 = score_row3;
2230 score_row3 = score;
2231 break;
2232 }
2233
2234 /* FRAME 2 */
2235
2236 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2237 score = MAX(score, score_col3) +
2238 matrix_row[ B[ b_index * increment ] ];
2239 score_other_frame1 = score_other_frame2;
2240 score_other_frame2 = MAX(score_col3, score_array[b_index].best);
2241 score_col3 = score_array[b_index].best;
2242 score_gap_col = score_array[b_index].best_gap;
2243
2244 if (score < MAX(score_gap_col, score_row3)) {
2245 score = MAX(score_gap_col, score_row3);
2246 if (best_score - score > x_dropoff) {
2247 if (first_b_index == b_index)
2248 first_b_index = b_index + 1;
2249 else
2250 score_array[b_index].best = MININT;
2251 }
2252 else {
2253 last_b_index = b_index;
2254 score_array[b_index].best = score;
2255 score_array[b_index].best_gap = score_gap_col - gap_extend;
2256 score_row3 -= gap_extend;
2257 }
2258 }
2259 else {
2260 if (best_score - score > x_dropoff) {
2261 if (first_b_index == b_index)
2262 first_b_index = b_index + 1;
2263 else
2264 score_array[b_index].best = MININT;
2265 }
2266 else {
2267 last_b_index = b_index;
2268 score_array[b_index].best = score;
2269 if (score > best_score) {
2270 best_score = score;
2271 *a_offset = a_index;
2272 *b_offset = b_index;
2273 }
2274 score -= gap_open_extend;
2275 score_row3 -= gap_extend;
2276 score_row3 = MAX(score, score_row3);
2277 score_array[b_index].best_gap = MAX(score,
2278 score_gap_col - gap_extend);
2279 }
2280 }
2281 b_index++;
2282 }
2283
2284 /* Finish aligning if the best scores for all positions
2285 of B will fail the X-dropoff test, i.e. the inner loop
2286 bounds have converged to each other */
2287
2288 if (first_b_index == b_size)
2289 break;
2290
2291 /* Enlarge the window for score data, if necessary */
2292
2293 if (b_size + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
2294
2295 gap_align->dp_mem_alloc = MAX(b_size + num_extra_cells + 100,
2296 2 * gap_align->dp_mem_alloc);
2297 score_array = (BlastGapDP *)realloc(score_array,
2298 gap_align->dp_mem_alloc *
2299 sizeof(BlastGapDP));
2300 gap_align->dp_mem = score_array;
2301 }
2302
2303 if (last_b_index < b_size - 1) {
2304 /* This row failed the X-dropoff test earlier than
2305 the last row did; just shorten the loop bounds
2306 before doing the next row */
2307
2308 b_size = last_b_index + 1;
2309 }
2310 else {
2311 /* The inner loop finished without failing the X-dropoff
2312 test; initialize extra bookkeeping structures until
2313 the X dropoff test fails or we run out of letters in B.
2314 The next inner loop will have larger bounds.
2315
2316 Keep initializing extra structures until the X dropoff
2317 test fails in all frames for this row */
2318
2319 score = MAX(score_row1, score_row2);
2320 score = MAX(score, score_row3);
2321 while (score >= (best_score - x_dropoff) && b_size < N + 1) {
2322
2323 score_array[b_size].best = score_row1;
2324 score_array[b_size].best_gap = score_row1 - gap_open_extend;
2325 score_row1 -= gap_extend;
2326
2327 score_array[b_size+1].best = score_row2;
2328 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
2329 score_row2 -= gap_extend;
2330
2331 score_array[b_size+2].best = score_row3;
2332 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
2333 score_row3 -= gap_extend;
2334
2335 b_size += 3;
2336 score -= gap_extend;
2337 }
2338 }
2339
2340 /* chop off the best score in each frame */
2341 b_size = MIN(b_size, N + 1);
2342 last_b_index = MIN(b_size + 4, N + 3);
2343 while (b_size < last_b_index) {
2344 score_array[b_size].best = MININT;
2345 score_array[b_size].best_gap = MININT;
2346 b_size++;
2347 }
2348 }
2349
2350 if (!reversed) {
2351 /* The sequence was shifted, so length should be adjusted as well */
2352 *b_offset -= 2;
2353 }
2354 return best_score;
2355 }
2356
2357 /** Simple wrapper around binary search function for BlastQueryInfo structure
2358 * to obtain the context in which this initial ungapped HSP is found
2359 * @param query_info Query information structure [in]
2360 * @param init_hsp Initial HSP [in]
2361 */
2362 static NCBI_INLINE Int4
s_GetUngappedHSPContext(const BlastQueryInfo * query_info,const BlastInitHSP * init_hsp)2363 s_GetUngappedHSPContext(const BlastQueryInfo* query_info,
2364 const BlastInitHSP* init_hsp)
2365 {
2366 return BSearchContextInfo(init_hsp->offsets.qs_offsets.q_off, query_info);
2367 }
2368
2369 /** Adjust the HSP offsets in the initial ungapped HSP structure given the
2370 * query start
2371 * @param init_hsp Initial HSP [in|out]
2372 * @param query_start Starting offset of the query sequence in the query info
2373 * structure [in]
2374 */
2375 static NCBI_INLINE void
s_AdjustInitialHSPOffsets(BlastInitHSP * init_hsp,Int4 query_start)2376 s_AdjustInitialHSPOffsets(BlastInitHSP* init_hsp, Int4 query_start)
2377 {
2378 init_hsp->offsets.qs_offsets.q_off -= query_start;
2379 if (init_hsp->ungapped_data) {
2380 init_hsp->ungapped_data->q_start -= query_start;
2381 }
2382 ASSERT(init_hsp->ungapped_data == NULL ||
2383 init_hsp->ungapped_data->q_start >= 0);
2384 }
2385
2386 /** Set up a BLAST_SequenceBlk structure for a single query sequence
2387 * @param concatenated_query Query sequence block for concatenated query [in]
2388 * @param query_info Query information structure for concatenated query [in]
2389 * @param context Context in which the HSP ocurred [in]
2390 * @param single_query query Output sequence block which will contain adjusted
2391 * sequence pointer and sequence length [out]
2392 * @param query_start Starting offset of the single query sequence in the
2393 * concatenated query [out]
2394 */
2395 static void
s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk * concatenated_query,const BlastQueryInfo * query_info,Int4 context,BLAST_SequenceBlk * single_query,Int4 * query_start)2396 s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk* concatenated_query,
2397 const BlastQueryInfo* query_info, Int4 context,
2398 BLAST_SequenceBlk* single_query,
2399 Int4* query_start)
2400 {
2401 Int4 query_length = 0;
2402 if (concatenated_query->oof_sequence) {
2403 /* Out-of-frame blastx case: all frames of the same parity are mixed
2404 together in a special sequence. */
2405 Int4 query_end_pt = 0;
2406 Int4 mixed_frame_context = context - context % CODON_LENGTH;
2407
2408 *query_start = query_info->contexts[mixed_frame_context].query_offset;
2409 query_end_pt =
2410 query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_offset +
2411 query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_length;
2412 query_length = query_end_pt - *query_start;
2413 single_query->sequence = NULL;
2414 single_query->oof_sequence = concatenated_query->oof_sequence + *query_start;
2415 } else {
2416 *query_start = query_info->contexts[context].query_offset;
2417 query_length = query_info->contexts[context].query_length;
2418 single_query->sequence = concatenated_query->sequence + *query_start;
2419 single_query->oof_sequence = NULL;
2420 }
2421 single_query->length = query_length;
2422 }
2423
2424
2425 /** Find the HSP offsets relative to the individual query sequence instead of
2426 * the concatenated sequence and retrieve relevant portions of the query
2427 * sequence data.
2428 * @param query Query sequence block [in]
2429 * @param query_info Query information structure, including context offsets
2430 * array [in]
2431 * @param init_hsp Initial HSP, this will be modified to hold the adjusted
2432 * offsets [in] [out]
2433 * @param query_out query sequence block with modified sequence
2434 * pointer and sequence length [out]
2435 * @param context Which context this HSP belongs to? [out]
2436 */
2437 static void
s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk * query,const BlastQueryInfo * query_info,BlastInitHSP * init_hsp,BLAST_SequenceBlk * query_out,Int4 * context)2438 s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk* query,
2439 const BlastQueryInfo* query_info,
2440 BlastInitHSP* init_hsp,
2441 BLAST_SequenceBlk* query_out,
2442 Int4* context)
2443 {
2444 Int4 query_start = 0;
2445
2446 ASSERT(query);
2447 ASSERT(query_info);
2448 ASSERT(query_out);
2449 ASSERT(init_hsp);
2450 ASSERT(context);
2451
2452 ASSERT(init_hsp->ungapped_data == NULL ||
2453 init_hsp->ungapped_data->q_start >= 0);
2454 *context = s_GetUngappedHSPContext(query_info, init_hsp);
2455 s_SetUpLocalBlastSequenceBlk(query, query_info, *context,
2456 query_out, &query_start);
2457 ASSERT(init_hsp->ungapped_data == NULL ||
2458 (init_hsp->ungapped_data->q_start-query_start) >= 0);
2459 s_AdjustInitialHSPOffsets(init_hsp, query_start);
2460 }
2461
2462
2463 /** Convert the initial list of traceback actions from a non-OOF
2464 * gapped alignment into a blast edit script. Note that this routine
2465 * assumes the input edit blocks have not been reversed or rearranged
2466 * by calling code
2467 * @param rev_prelim_tback Traceback from extension to the left [in]
2468 * @param fwd_prelim_tback Traceback from extension to the right [in]
2469 * @return Pointer to the resulting edit script, or NULL if there
2470 * are no traceback actions specified
2471 */
2472 GapEditScript*
Blast_PrelimEditBlockToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback)2473 Blast_PrelimEditBlockToGapEditScript (GapPrelimEditBlock* rev_prelim_tback,
2474 GapPrelimEditBlock* fwd_prelim_tback)
2475 {
2476 Boolean merge_ops = FALSE;
2477 GapEditScript* esp;
2478 GapPrelimEditScript *op;
2479 Int4 i;
2480 Int4 index=0;
2481 Int4 size = 0;
2482
2483 if (rev_prelim_tback == NULL || fwd_prelim_tback == NULL)
2484 return NULL;
2485
2486 /* The fwd_prelim_tback script will get reversed here as the traceback started from the highest scoring point
2487 and worked backwards. The rev_prelim_tback script does NOT get reversed. Since it was reversed when the
2488 traceback was produced it's already "forward" */
2489
2490 if (fwd_prelim_tback->num_ops > 0 && rev_prelim_tback->num_ops > 0 &&
2491 fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].op_type ==
2492 rev_prelim_tback->edit_ops[(rev_prelim_tback->num_ops)-1].op_type)
2493 merge_ops = TRUE;
2494
2495 size = fwd_prelim_tback->num_ops+rev_prelim_tback->num_ops;
2496 if (merge_ops)
2497 size--;
2498
2499 esp = GapEditScriptNew(size);
2500
2501 index = 0;
2502 for (i=0; i < rev_prelim_tback->num_ops; i++) {
2503 op = rev_prelim_tback->edit_ops + i;
2504 esp->op_type[index] = op->op_type;
2505 esp->num[index] = op->num;
2506 index++;
2507 }
2508
2509 if (fwd_prelim_tback->num_ops == 0)
2510 return esp;
2511
2512 if (merge_ops)
2513 esp->num[index-1] += fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].num;
2514
2515 /* If we merge, then we skip the first one. */
2516 if (merge_ops)
2517 i = fwd_prelim_tback->num_ops - 2;
2518 else
2519 i = fwd_prelim_tback->num_ops - 1;
2520
2521 for (; i >= 0; i--) {
2522 op = fwd_prelim_tback->edit_ops + i;
2523 esp->op_type[index] = op->op_type;
2524 esp->num[index] = op->num;
2525 index++;
2526 }
2527
2528 return esp;
2529 }
2530
2531 /** Fills the BlastGapAlignStruct structure with the results
2532 * of a greedy gapped extension.
2533 * @param gap_align the initialized gapped alignment structure [in] [out]
2534 * @param q_start The starting offset in query [in]
2535 * @param s_start The starting offset in subject [in]
2536 * @param q_end The ending offset in query [in]
2537 * @param s_end The ending offset in subject [in]
2538 * @param q_seed_start The query offset of the alignment start point [in]
2539 * @param s_seed_start The subject offset of the alignment start point [in]
2540 * @param score The alignment score [in]
2541 * @param esp The edit script containing the traceback information [in]
2542 */
2543 static Int2
s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct * gap_align,Int4 q_start,Int4 s_start,Int4 q_end,Int4 s_end,Int4 q_seed_start,Int4 s_seed_start,Int4 score,GapEditScript * esp)2544 s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct* gap_align,
2545 Int4 q_start, Int4 s_start,
2546 Int4 q_end, Int4 s_end,
2547 Int4 q_seed_start, Int4 s_seed_start,
2548 Int4 score, GapEditScript* esp)
2549 {
2550 gap_align->query_start = q_start;
2551 gap_align->query_stop = q_end;
2552 gap_align->subject_start = s_start;
2553 gap_align->subject_stop = s_end;
2554 gap_align->greedy_query_seed_start = q_seed_start;
2555 gap_align->greedy_subject_seed_start = s_seed_start;
2556 gap_align->score = score;
2557
2558 if (esp)
2559 gap_align->edit_script = esp;
2560
2561 return 0;
2562 }
2563
s_UpdateEditScript(GapEditScript * esp,int pos,int bf,int af)2564 static void s_UpdateEditScript(GapEditScript* esp, int pos, int bf, int af) {
2565 int op, qd, sd;
2566
2567 if (bf > 0) {
2568 op = pos;
2569 qd = sd = bf;
2570 do {
2571 if (--op < 0) return;
2572 switch(esp->op_type[op]) {
2573 case eGapAlignSub:
2574 qd -= esp->num[op];
2575 sd -= esp->num[op];
2576 break;
2577 case eGapAlignIns:
2578 qd -= esp->num[op];
2579 break;
2580 case eGapAlignDel:
2581 sd -= esp->num[op];
2582 default:
2583 break;
2584 }
2585 } while (qd > 0 || sd > 0);
2586
2587 esp->num[op] = -MAX(qd, sd);
2588 esp->op_type[op++] = eGapAlignSub;
2589 for (; op < pos-1; op++) esp->num[op] = 0;
2590 esp->num[pos] += bf;
2591 qd -= sd;
2592 esp->op_type[pos-1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2593 esp->num[pos-1] = (qd>0) ? qd : -qd;
2594 }
2595
2596 if (af > 0) {
2597 op = pos;
2598 qd = sd = af;
2599 do {
2600 if (++op >= esp->size) return;
2601 switch(esp->op_type[op]) {
2602 case eGapAlignSub:
2603 qd -= esp->num[op];
2604 sd -= esp->num[op];
2605 break;
2606 case eGapAlignIns:
2607 qd -= esp->num[op];
2608 break;
2609 case eGapAlignDel:
2610 sd -= esp->num[op];
2611 default:
2612 break;
2613 }
2614 } while (qd > 0 || sd > 0);
2615
2616 esp->num[op] = -MAX(qd, sd);
2617 esp->op_type[op--] = eGapAlignSub;
2618 for (; op > pos+1; op--) esp->num[op] = 0;
2619 esp->num[pos] += af;
2620 qd -= sd;
2621 esp->op_type[pos+1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2622 esp->num[pos+1] = (qd>0) ? qd : -qd;
2623 }
2624 }
2625
s_RebuildEditScript(GapEditScript * esp)2626 static void s_RebuildEditScript(GapEditScript* esp) {
2627 int i, j;
2628 for (i=0, j=-1; i<esp->size; i++) {
2629 if (esp->num[i] == 0) continue;
2630 if (j>=0 && esp->op_type[i] == esp->op_type[j]) {
2631 esp->num[j] += esp->num[i];
2632 } else if (j==-1 || esp->op_type[i] == eGapAlignSub
2633 || esp->op_type[j] == eGapAlignSub) {
2634 esp->op_type[++j] = esp->op_type[i];
2635 esp->num[j] = esp->num[i];
2636 } else {
2637 int d = esp->num[j] - esp->num[i];
2638 if (d > 0) {
2639 esp->num[j-1] += esp->num[i];
2640 esp->num[j] = d;
2641 } else if (d < 0) {
2642 esp->num[j-1] += esp->num[j];
2643 esp->num[j] = -d;
2644 esp->op_type[j] = esp->op_type[i];
2645 } else {
2646 esp->num[j-1] += esp->num[j];
2647 --j;
2648 }
2649 }
2650 }
2651 esp->size = ++j;
2652 }
2653
s_ReduceGaps(GapEditScript * esp,const Uint1 * q,const Uint1 * s,const Uint1 * qf,const Uint1 * sf)2654 static void s_ReduceGaps(GapEditScript* esp, const Uint1 *q, const Uint1 *s,
2655 const Uint1 *qf,const Uint1 *sf){
2656 int i, j, nm1, nm2, d;
2657 const Uint1 *q1, *s1;
2658
2659 for (q1=q, s1=s, i=0; i<esp->size; i++) {
2660 if (esp->num[i] == 0) continue;
2661 if (esp->op_type[i] == eGapAlignSub) {
2662 if(esp->num[i] >= 12) {
2663 nm1 = 1;
2664 if (i > 0) {
2665 while (q1-nm1>=q && (*(q1-nm1) == *(s1-nm1))) ++nm1;
2666 }
2667 q1 += esp->num[i];
2668 s1 += esp->num[i];
2669 nm2 = 0;
2670 if (i < esp->size -1) {
2671 while ((q1+1<qf) && (s1+1<sf) && (*(q1++) == *(s1++))) ++nm2;
2672 }
2673 if (nm1>1 || nm2>0) s_UpdateEditScript(esp, i, nm1-1, nm2);
2674 q1--; s1--;
2675 } else {
2676 q1 += esp->num[i];
2677 s1 += esp->num[i];
2678 }
2679 } else if (esp->op_type[i] == eGapAlignIns) {
2680 q1 += esp->num[i];
2681 } else {
2682 s1 += esp->num[i];
2683 }
2684 }
2685 s_RebuildEditScript(esp);
2686
2687 for (i=0; i<esp->size; i++) {
2688 if (esp->op_type[i] == eGapAlignSub) {
2689 q += esp->num[i];
2690 s += esp->num[i];
2691 continue;
2692 }
2693 if (i>1 && esp->op_type[i] != esp->op_type[i-2]
2694 && esp->num[i-2] > 0) {
2695 d = esp->num[i] + esp->num[i-1] + esp->num[i-2];
2696 if (d == 3) {
2697 /* special case, no need to do further testing */
2698 (esp->num[i-2]) = 0;
2699 (esp->num[i-1]) = 2;
2700 (esp->num[i]) = 0;
2701 if (esp->op_type[i] == eGapAlignIns) {
2702 ++q;
2703 } else {
2704 ++s;
2705 }
2706 } else if (d < 12) {
2707 /* Try reducing this sub... */
2708 nm1 = 0;
2709 nm2 = 0;
2710 d = MIN(esp->num[i], esp->num[i-2]);
2711 q -= esp->num[i-1];
2712 s -= esp->num[i-1];
2713 q1 = q;
2714 s1 = s;
2715 if (esp->op_type[i] == eGapAlignIns) {
2716 s -= d;
2717 } else {
2718 q -= d;
2719 }
2720 for (j=0; j<esp->num[i-1]; ++j, ++q1, ++s1, ++q, ++s) {
2721 if (*q1 == *s1) nm1++;
2722 if (*q == *s) nm2++;
2723 }
2724 for (j=0; j<d; ++j, ++q, ++s) {
2725 if (*q == *s) nm2++;
2726 }
2727 if (nm2 >= nm1 - d) {
2728 (esp->num[i-2]) -= d;
2729 (esp->num[i-1]) += d;
2730 (esp->num[i]) -= d;
2731 } else {
2732 q = q1;
2733 s = s1;
2734 }
2735 }
2736 }
2737 if (esp->op_type[i] == eGapAlignIns) {
2738 q += esp->num[i];
2739 } else {
2740 s += esp->num[i];
2741 }
2742 }
2743 s_RebuildEditScript(esp);
2744 }
2745
2746 Int2
BLAST_GreedyGappedAlignment(const Uint1 * query,const Uint1 * subject,Int4 query_length,Int4 subject_length,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_off,Int4 s_off,Boolean compressed_subject,Boolean do_traceback,Boolean * fence_hit)2747 BLAST_GreedyGappedAlignment(const Uint1* query, const Uint1* subject,
2748 Int4 query_length, Int4 subject_length, BlastGapAlignStruct* gap_align,
2749 const BlastScoringParameters* score_params,
2750 Int4 q_off, Int4 s_off, Boolean compressed_subject, Boolean do_traceback,
2751 Boolean * fence_hit)
2752 {
2753 const Uint1* q;
2754 const Uint1* s;
2755 Int4 score;
2756 Int4 X;
2757 Int4 q_avail, s_avail;
2758 Int4 q_ext_l, q_ext_r, s_ext_l, s_ext_r;
2759 GapPrelimEditBlock *fwd_prelim_tback = NULL;
2760 GapPrelimEditBlock *rev_prelim_tback = NULL;
2761 SGreedySeed fwd_start_point;
2762 SGreedySeed rev_start_point;
2763 Uint1 rem;
2764 GapEditScript* esp = NULL;
2765 Int4 q_seed_start = q_off;
2766 Int4 s_seed_start = s_off;
2767
2768 q_avail = query_length - q_off;
2769 s_avail = subject_length - s_off;
2770
2771 q = query + q_off;
2772 if (!compressed_subject) {
2773 s = subject + s_off;
2774 rem = 4; /* Special value to indicate that sequence is already
2775 uncompressed */
2776 } else {
2777 s = subject + s_off/4;
2778 rem = s_off % 4;
2779 }
2780
2781 X = gap_align->gap_x_dropoff;
2782
2783 if (do_traceback) {
2784 fwd_prelim_tback = gap_align->fwd_prelim_tback;
2785 rev_prelim_tback = gap_align->rev_prelim_tback;
2786 GapPrelimEditBlockReset(fwd_prelim_tback);
2787 GapPrelimEditBlockReset(rev_prelim_tback);
2788 }
2789
2790 /* extend to the right */
2791 while (TRUE) {
2792 Int4 new_dist, xdrop;
2793 score = BLAST_AffineGreedyAlign(q, q_avail, s, s_avail, FALSE, X,
2794 score_params->reward, -score_params->penalty,
2795 score_params->gap_open, score_params->gap_extend,
2796 &q_ext_r, &s_ext_r, gap_align->greedy_align_mem,
2797 fwd_prelim_tback, rem, fence_hit, &fwd_start_point);
2798
2799 if(fence_hit && *fence_hit) {
2800 return -1;
2801 }
2802
2803 if (score >=0) break;
2804
2805 /* double the max distance */
2806 new_dist = gap_align->greedy_align_mem->max_dist * 2;
2807 xdrop = gap_align->greedy_align_mem->xdrop;
2808 s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2809 gap_align->greedy_align_mem =
2810 s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2811
2812 if (!gap_align->greedy_align_mem) {
2813 gap_align = BLAST_GapAlignStructFree(gap_align);
2814 return -1;
2815 }
2816 }
2817
2818 if (compressed_subject)
2819 rem = 0;
2820
2821 /* extend to the left */
2822 while (TRUE) {
2823 Int4 new_dist, xdrop, score1;
2824 score1 = BLAST_AffineGreedyAlign(query, q_off,
2825 subject, s_off, TRUE, X,
2826 score_params->reward, -score_params->penalty,
2827 score_params->gap_open, score_params->gap_extend,
2828 &q_ext_l, &s_ext_l, gap_align->greedy_align_mem,
2829 rev_prelim_tback, rem, fence_hit, &rev_start_point);
2830 if(fence_hit && *fence_hit) {
2831 return -1;
2832 }
2833 if (score1 >=0) {
2834 score += score1;
2835 break;
2836 }
2837
2838 /* double the max distance */
2839 new_dist = gap_align->greedy_align_mem->max_dist * 2;
2840 xdrop = gap_align->greedy_align_mem->xdrop;
2841 s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2842 gap_align->greedy_align_mem =
2843 s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2844
2845 if (!gap_align->greedy_align_mem) {
2846 gap_align = BLAST_GapAlignStructFree(gap_align);
2847 return -1;
2848 }
2849 }
2850
2851 /* In basic case the greedy algorithm returns number of
2852 differences, hence we need to convert it to score */
2853 if (score_params->gap_open==0 && score_params->gap_extend==0) {
2854 score =
2855 (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*score_params->reward/2 -
2856 score*(score_params->reward - score_params->penalty);
2857 } else if (score_params->reward % 2 == 1) {
2858 score /= 2;
2859 }
2860
2861 if (do_traceback) {
2862 esp = Blast_PrelimEditBlockToGapEditScript(rev_prelim_tback,
2863 fwd_prelim_tback);
2864 /* check for possible gap elimination */
2865 ASSERT(!compressed_subject);
2866 if (esp) s_ReduceGaps(esp, query+q_off-q_ext_l, subject+s_off-s_ext_l,
2867 query+q_off+q_ext_r, subject+s_off+s_ext_r);
2868 }
2869 else {
2870 /* estimate the best alignment start point. This is the middle
2871 of the longest run of exact matches found by the aligner,
2872 subject to the constraint that the start point lies in the
2873 box formed by the optimal alignment */
2874
2875 Int4 q_box_l = q_off - q_ext_l;
2876 Int4 s_box_l = s_off - s_ext_l;
2877 Int4 q_box_r = q_off + q_ext_r;
2878 Int4 s_box_r = s_off + s_ext_r;
2879 Int4 q_seed_start_l = q_off - rev_start_point.start_q;
2880 Int4 s_seed_start_l = s_off - rev_start_point.start_s;
2881 Int4 q_seed_start_r = q_off + fwd_start_point.start_q;
2882 Int4 s_seed_start_r = s_off + fwd_start_point.start_s;
2883 Int4 valid_seed_len_l = 0;
2884 Int4 valid_seed_len_r = 0;
2885
2886 if (q_seed_start_r < q_box_r && s_seed_start_r < s_box_r) {
2887 valid_seed_len_r = MIN(q_box_r - q_seed_start_r,
2888 s_box_r - s_seed_start_r);
2889 valid_seed_len_r = MIN(valid_seed_len_r,
2890 fwd_start_point.match_length) / 2;
2891 } else {
2892 q_seed_start_r = q_off;
2893 s_seed_start_r = s_off;
2894 }
2895
2896 if (q_seed_start_l > q_box_l && s_seed_start_l > s_box_l) {
2897 valid_seed_len_l = MIN(q_seed_start_l - q_box_l,
2898 s_seed_start_l - s_box_l);
2899 valid_seed_len_l = MIN(valid_seed_len_l,
2900 rev_start_point.match_length) / 2;
2901 } else {
2902 q_seed_start_l = q_off;
2903 s_seed_start_l = s_off;
2904 }
2905
2906 if (valid_seed_len_r > valid_seed_len_l) {
2907 q_seed_start = q_seed_start_r + valid_seed_len_r;
2908 s_seed_start = s_seed_start_r + valid_seed_len_r;
2909 }
2910 else {
2911 q_seed_start = q_seed_start_l - valid_seed_len_l;
2912 s_seed_start = s_seed_start_l - valid_seed_len_l;
2913 }
2914 }
2915
2916 s_BlastGreedyGapAlignStructFill(gap_align,
2917 q_off-q_ext_l, s_off-s_ext_l,
2918 q_off+q_ext_r, s_off+s_ext_r,
2919 q_seed_start, s_seed_start,
2920 score, esp);
2921 return 0;
2922 }
2923
2924 /** Performs dynamic programming style gapped extension, given an initial
2925 * HSP (offset pair), two sequence blocks and scoring and extension options.
2926 * Note: traceback is not done in this function.
2927 * @param query_blk The query sequence [in]
2928 * @param subject_blk The subject sequence [in]
2929 * @param gap_align The auxiliary structure for gapped alignment [in]
2930 * @param score_params Parameters related to scoring [in]
2931 * @param init_hsp The initial HSP that needs to be extended [in]
2932 */
2933 static Int2
s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp)2934 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
2935 BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
2936 const BlastScoringParameters* score_params, BlastInitHSP* init_hsp)
2937 {
2938 Int4 q_length, s_length;
2939 Int4 private_q_start, private_s_start;
2940 Int4 score_right = 0, score_left = 0;
2941 Uint1 offset_adjustment;
2942 Uint1* query = query_blk->sequence;
2943 Uint1* subject = subject_blk->sequence;
2944 Int4 x_dropoff = gap_align->gap_x_dropoff;
2945
2946 if(init_hsp->ungapped_data->score < x_dropoff) {
2947 x_dropoff = init_hsp->ungapped_data->score;
2948 }
2949
2950 /* If subject offset is not at the start of a full byte,
2951 s_BlastAlignPackedNucl won't work, so shift the alignment start
2952 to the next multiple of 4 subject letters. Note that the
2953 shift amount is always nonzero, so a left extension always happens
2954 (and has a few presumed exact matches to start with). In the case
2955 of the smallest ungapped alignment (4 nucleotides), this can
2956 conceivably put the start point at the end of one sequence */
2957
2958 offset_adjustment = COMPRESSION_RATIO -
2959 (init_hsp->offsets.qs_offsets.s_off % COMPRESSION_RATIO);
2960 q_length = init_hsp->offsets.qs_offsets.q_off + offset_adjustment;
2961 s_length = init_hsp->offsets.qs_offsets.s_off + offset_adjustment;
2962
2963 /* This prevents the starting point from being at the end of a sequence
2964 as mentioned in the comment above. */
2965 if (q_length > query_blk->length || s_length > subject_blk->length)
2966 {
2967 q_length -= COMPRESSION_RATIO;
2968 s_length -= COMPRESSION_RATIO;
2969 }
2970
2971 /* perform extension to left */
2972 score_left = s_BlastAlignPackedNucl(query, subject, q_length, s_length,
2973 &private_q_start, &private_s_start, gap_align,
2974 score_params, TRUE, x_dropoff);
2975 if (score_left < 0)
2976 return -1;
2977 gap_align->query_start = q_length - private_q_start;
2978 gap_align->subject_start = s_length - private_s_start;
2979
2980 /* perform extension to right */
2981 if (q_length < query_blk->length &&
2982 s_length < subject_blk->length)
2983 {
2984 score_right = s_BlastAlignPackedNucl(query+q_length-1,
2985 subject+(s_length+3)/COMPRESSION_RATIO - 1,
2986 query_blk->length-q_length,
2987 subject_blk->length-s_length, &(gap_align->query_stop),
2988 &(gap_align->subject_stop), gap_align, score_params, FALSE, x_dropoff);
2989 if (score_right < 0)
2990 return -1;
2991 gap_align->query_stop += q_length;
2992 gap_align->subject_stop += s_length;
2993 }
2994 else {
2995 gap_align->query_stop = q_length;
2996 gap_align->subject_stop = s_length;
2997 }
2998
2999 gap_align->score = score_right+score_left;
3000
3001 return 0;
3002 }
3003
3004 /** Aligns two nucleotide sequences, one (A) should be packed in the
3005 * same way as the BLAST databases, the other (B) should contain one
3006 * basepair/byte. Traceback is not done in this function.
3007 * @param B The query sequence [in]
3008 * @param A The subject sequence [in]
3009 * @param N Maximal extension length in query [in]
3010 * @param M Maximal extension length in subject [in]
3011 * @param b_offset Resulting starting offset in query [out]
3012 * @param a_offset Resulting starting offset in subject [out]
3013 * @param gap_align The auxiliary structure for gapped alignment [in]
3014 * @param score_params Parameters related to scoring [in]
3015 * @param reverse_sequence Reverse the sequence.
3016 * @return The best alignment score found.
3017 */
3018 static Int4
s_BlastAlignPackedNucl(Uint1 * B,Uint1 * A,Int4 N,Int4 M,Int4 * b_offset,Int4 * a_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Boolean reverse_sequence,Int4 x_dropoff)3019 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
3020 Int4* b_offset, Int4* a_offset,
3021 BlastGapAlignStruct* gap_align,
3022 const BlastScoringParameters* score_params,
3023 Boolean reverse_sequence, Int4 x_dropoff)
3024 {
3025 Int4 i; /* sequence pointers and indices */
3026 Int4 a_index, a_base_pair;
3027 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
3028 Uint1* b_ptr;
3029
3030 BlastGapDP* score_array;
3031 Int4 num_extra_cells;
3032
3033 Int4 gap_open; /* alignment penalty variables */
3034 Int4 gap_extend;
3035 Int4 gap_open_extend;
3036
3037 Int4* *matrix; /* pointers to the score matrix */
3038 Int4* matrix_row;
3039
3040 Int4 score; /* score tracking variables */
3041 Int4 score_gap_row;
3042 Int4 score_gap_col;
3043 Int4 next_score;
3044 Int4 best_score;
3045
3046 /* do initialization and sanity-checking */
3047
3048 matrix = gap_align->sbp->matrix->data;
3049 *a_offset = 0;
3050 *b_offset = 0;
3051 gap_open = score_params->gap_open;
3052 gap_extend = score_params->gap_extend;
3053 gap_open_extend = gap_open + gap_extend;
3054
3055 if (x_dropoff < gap_open_extend)
3056 x_dropoff = gap_open_extend;
3057
3058 if(N <= 0 || M <= 0)
3059 return 0;
3060
3061 /* Allocate and fill in the auxiliary bookeeping structures.
3062 Since A and B could be very large, maintain a window
3063 of auxiliary structures only large enough to contain the current
3064 set of DP computations. The initial window size is determined
3065 by the number of cells needed to fail the x-dropoff test */
3066
3067 if (gap_extend > 0)
3068 num_extra_cells = x_dropoff / gap_extend + 3;
3069 else
3070 num_extra_cells = N + 3;
3071
3072 if (num_extra_cells > gap_align->dp_mem_alloc) {
3073 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
3074 2 * gap_align->dp_mem_alloc);
3075 sfree(gap_align->dp_mem);
3076 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
3077 sizeof(BlastGapDP));
3078 }
3079
3080 score_array = gap_align->dp_mem;
3081 score = -gap_open_extend;
3082 score_array[0].best = 0;
3083 score_array[0].best_gap = -gap_open_extend;
3084
3085 for (i = 1; i <= N; i++) {
3086 if (score < -x_dropoff)
3087 break;
3088
3089 score_array[i].best = score;
3090 score_array[i].best_gap = score - gap_open_extend;
3091 score -= gap_extend;
3092 }
3093
3094 /* The inner loop below examines letters of B from
3095 index 'first_b_index' to 'b_size' */
3096
3097 b_size = i;
3098 best_score = 0;
3099 first_b_index = 0;
3100 if (reverse_sequence)
3101 b_increment = -1;
3102 else
3103 b_increment = 1;
3104
3105 for (a_index = 1; a_index <= M; a_index++) {
3106
3107 /* pick out the row of the score matrix
3108 appropriate for A[a_index] */
3109
3110 if(reverse_sequence) {
3111 a_base_pair = NCBI2NA_UNPACK_BASE(A[(M-a_index)/4],
3112 ((a_index-1)%4));
3113 matrix_row = matrix[a_base_pair];
3114 }
3115 else {
3116 a_base_pair = NCBI2NA_UNPACK_BASE(A[1+((a_index-1)/4)],
3117 (3-((a_index-1)%4)));
3118 matrix_row = matrix[a_base_pair];
3119 }
3120
3121 if(reverse_sequence)
3122 b_ptr = &B[N - first_b_index];
3123 else
3124 b_ptr = &B[first_b_index];
3125
3126 /* initialize running-score variables */
3127 score = MININT;
3128 score_gap_row = MININT;
3129 last_b_index = first_b_index;
3130
3131 for (b_index = first_b_index; b_index < b_size; b_index++) {
3132
3133 b_ptr += b_increment;
3134 score_gap_col = score_array[b_index].best_gap;
3135 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
3136
3137 if (score < score_gap_col)
3138 score = score_gap_col;
3139
3140 if (score < score_gap_row)
3141 score = score_gap_row;
3142
3143 if (best_score - score > x_dropoff) {
3144
3145 /* the current best score failed the X-dropoff
3146 criterion. Note that this does not stop the
3147 inner loop, only forces future iterations to
3148 skip this column of B.
3149
3150 Also, if the very first letter of B that was
3151 tested failed the X dropoff criterion, make
3152 sure future inner loops start one letter to
3153 the right */
3154
3155 if (b_index == first_b_index)
3156 first_b_index++;
3157 else
3158 score_array[b_index].best = MININT;
3159 }
3160 else {
3161 last_b_index = b_index;
3162 if (score > best_score) {
3163 best_score = score;
3164 *a_offset = a_index;
3165 *b_offset = b_index;
3166 }
3167
3168 /* If starting a gap at this position will improve
3169 the best row or column score, update them to
3170 reflect that. */
3171
3172 score_gap_row -= gap_extend;
3173 score_gap_col -= gap_extend;
3174 score_array[b_index].best_gap = MAX(score - gap_open_extend,
3175 score_gap_col);
3176 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
3177
3178 score_array[b_index].best = score;
3179 }
3180
3181 score = next_score;
3182 }
3183
3184 /* Finish aligning if the best scores for all positions
3185 of B will fail the X-dropoff test, i.e. the inner loop
3186 bounds have converged to each other */
3187
3188 if (first_b_index == b_size)
3189 break;
3190
3191 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
3192
3193 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
3194 2 * gap_align->dp_mem_alloc);
3195 score_array = (BlastGapDP *)realloc(score_array,
3196 gap_align->dp_mem_alloc *
3197 sizeof(BlastGapDP));
3198 gap_align->dp_mem = score_array;
3199 }
3200
3201 if (last_b_index < b_size - 1) {
3202 /* This row failed the X-dropoff test earlier than
3203 the last row did; just shorten the loop bounds
3204 before doing the next row */
3205
3206 b_size = last_b_index + 1;
3207 }
3208 else {
3209 /* The inner loop finished without failing the X-dropoff
3210 test; initialize extra bookkeeping structures until
3211 the X dropoff test fails or we run out of letters in B.
3212 The next inner loop will have larger bounds */
3213
3214 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
3215 score_array[b_size].best = score_gap_row;
3216 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
3217 score_gap_row -= gap_extend;
3218 b_size++;
3219 }
3220 }
3221
3222 if (b_size <= N) {
3223 score_array[b_size].best = MININT;
3224 score_array[b_size].best_gap = MININT;
3225 b_size++;
3226 }
3227 }
3228
3229 return best_score;
3230 }
3231
3232 Boolean
BlastGetOffsetsForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,BlastHSP * hsp,Int4 * q_retval,Int4 * s_retval)3233 BlastGetOffsetsForGappedAlignment (const Uint1* query, const Uint1* subject,
3234 const BlastScoreBlk* sbp, BlastHSP* hsp, Int4* q_retval, Int4* s_retval)
3235 {
3236 Int4 index1, max_offset, score, max_score, hsp_end;
3237 const Uint1* query_var,* subject_var;
3238 Boolean positionBased = (sbp->psi_matrix != NULL);
3239 Int4 q_length = hsp->query.end - hsp->query.offset;
3240 Int4 s_length = hsp->subject.end - hsp->subject.offset;
3241 int q_start = hsp->query.offset;
3242 int s_start = hsp->subject.offset;
3243
3244 if (q_length <= HSP_MAX_WINDOW) {
3245 *q_retval = q_start + q_length/2;
3246 *s_retval = s_start + q_length/2;
3247 return TRUE;
3248 }
3249
3250 hsp_end = q_start + HSP_MAX_WINDOW;
3251 query_var = query + q_start;
3252 subject_var = subject + s_start;
3253 score=0;
3254 for (index1=q_start; index1<hsp_end; index1++) {
3255 if (!(positionBased))
3256 score += sbp->matrix->data[*query_var][*subject_var];
3257 else
3258 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3259 query_var++; subject_var++;
3260 }
3261 max_score = score;
3262 max_offset = hsp_end - 1;
3263 hsp_end = q_start + MIN(q_length, s_length);
3264 for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3265 if (!(positionBased)) {
3266 score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3267 score += sbp->matrix->data[*query_var][*subject_var];
3268 } else {
3269 score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3270 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3271 }
3272 if (score > max_score) {
3273 max_score = score;
3274 max_offset = index1;
3275 }
3276 query_var++; subject_var++;
3277 }
3278
3279 if (max_score > 0)
3280 {
3281 *q_retval = max_offset;
3282 *s_retval = (max_offset - q_start) + s_start;
3283 return TRUE;
3284 }
3285 else /* Test the window around the ends of the HSP. */
3286 {
3287 score=0;
3288 query_var = query + q_start + q_length - HSP_MAX_WINDOW;
3289 subject_var = subject + s_start + s_length - HSP_MAX_WINDOW;
3290 for (index1=hsp->query.end-HSP_MAX_WINDOW; index1<hsp->query.end; index1++) {
3291 if (!(positionBased))
3292 score += sbp->matrix->data[*query_var][*subject_var];
3293 else
3294 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3295 query_var++; subject_var++;
3296 }
3297 if (score > 0)
3298 {
3299 *q_retval = hsp->query.end - HSP_MAX_WINDOW/2;
3300 *s_retval = hsp->subject.end - HSP_MAX_WINDOW/2;
3301 return TRUE;
3302 }
3303 }
3304 return FALSE;
3305 }
3306
3307 void
BlastGetStartForGappedAlignmentNucl(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp)3308 BlastGetStartForGappedAlignmentNucl (const Uint1* query, const Uint1* subject,
3309 BlastHSP* hsp)
3310 {
3311 /* We will stop when the identity count reaches to this number */
3312 int hspMaxIdentRun = 10;
3313 const Uint1 *q, *s;
3314 Int4 index, max_offset, score, max_score, q_start, s_start, q_len;
3315 Boolean match, prev_match;
3316 Int4 offset = MIN(hsp->subject.gapped_start - hsp->subject.offset,
3317 hsp->query.gapped_start - hsp->query.offset);
3318 /* first check if the old value is ok */
3319 q_start = hsp->query.gapped_start;
3320 s_start = hsp->subject.gapped_start;
3321 score = -1;
3322 q = query + q_start;
3323 s = subject + s_start;
3324 q_len = hsp->query.end;
3325 while ((q-query < q_len) && (*q++ == *s++)) {
3326 score++;
3327 if (score > hspMaxIdentRun) return;
3328 }
3329 q = query + q_start;
3330 s = subject + s_start;
3331 while ((q-query >= 0) && (*q-- == *s--)) {
3332 score++;
3333 if (score > hspMaxIdentRun) return;
3334 }
3335 hspMaxIdentRun *= 1.5; /* Demand larger value if we move */
3336 /* if the old value is not ok, try to find a better point */
3337 q_start = hsp->query.gapped_start - offset;
3338 s_start = hsp->subject.gapped_start - offset;
3339 q_len = MIN(hsp->subject.end - s_start, hsp->query.end - q_start);
3340 q = query + q_start;
3341 s = subject + s_start;
3342 max_score = 0;
3343 max_offset = q_start;
3344 score = 0;
3345 match = FALSE;
3346 prev_match = FALSE;
3347 for (index = q_start; index < q_start + q_len; index++) {
3348 match = (*q++ == *s++);
3349 if (match != prev_match) {
3350 prev_match = match;
3351 if (match) {
3352 score = 1;
3353 } else if (score > max_score) {
3354 max_score = score;
3355 max_offset = index - score/2;
3356 }
3357 } else if (match) {
3358 ++score;
3359 if (score > hspMaxIdentRun) {
3360 max_offset = index - hspMaxIdentRun/2;
3361 hsp->query.gapped_start = max_offset;
3362 hsp->subject.gapped_start = max_offset + s_start - q_start;
3363 return;
3364 }
3365 }
3366 }
3367 if (match && score > max_score) {
3368 max_score = score;
3369 max_offset = index - score/2;
3370 }
3371 if (max_score > 0) {
3372 hsp->query.gapped_start = max_offset;
3373 hsp->subject.gapped_start = max_offset + s_start - q_start;
3374 }
3375 }
3376
3377 Int4
BlastGetStartForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,Uint4 q_start,Uint4 q_length,Uint4 s_start,Uint4 s_length)3378 BlastGetStartForGappedAlignment (const Uint1* query, const Uint1* subject,
3379 const BlastScoreBlk* sbp, Uint4 q_start, Uint4 q_length,
3380 Uint4 s_start, Uint4 s_length)
3381 {
3382 Int4 index1, max_offset, score, max_score, hsp_end;
3383 const Uint1* query_var,* subject_var;
3384 Boolean positionBased = (sbp->psi_matrix != NULL);
3385
3386 if (q_length <= HSP_MAX_WINDOW) {
3387 max_offset = q_start + q_length/2;
3388 return max_offset;
3389 }
3390
3391 hsp_end = q_start + HSP_MAX_WINDOW;
3392 query_var = query + q_start;
3393 subject_var = subject + s_start;
3394 score=0;
3395 for (index1=q_start; index1<hsp_end; index1++) {
3396 if (!(positionBased))
3397 score += sbp->matrix->data[*query_var][*subject_var];
3398 else
3399 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3400 query_var++; subject_var++;
3401 }
3402 max_score = score;
3403 max_offset = hsp_end - 1;
3404 hsp_end = q_start + MIN(q_length, s_length);
3405 for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3406 if (!(positionBased)) {
3407 score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3408 score += sbp->matrix->data[*query_var][*subject_var];
3409 } else {
3410 score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3411 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3412 }
3413 if (score > max_score) {
3414 max_score = score;
3415 max_offset = index1;
3416 }
3417 query_var++; subject_var++;
3418 }
3419 if (max_score > 0)
3420 max_offset -= HSP_MAX_WINDOW/2;
3421 else
3422 max_offset = q_start;
3423
3424 return max_offset;
3425 }
3426
BLAST_GetGappedScore(EBlastProgramType program_number,BLAST_SequenceBlk * query,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,const BlastHitSavingParameters * hit_params,BlastInitHitList * init_hitlist,BlastHSPList ** hsp_list_ptr,BlastGappedStats * gapped_stats,Boolean * fence_hit)3427 Int2 BLAST_GetGappedScore (EBlastProgramType program_number,
3428 BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
3429 BLAST_SequenceBlk* subject,
3430 BlastGapAlignStruct* gap_align,
3431 const BlastScoringParameters* score_params,
3432 const BlastExtensionParameters* ext_params,
3433 const BlastHitSavingParameters* hit_params,
3434 BlastInitHitList* init_hitlist,
3435 BlastHSPList** hsp_list_ptr, BlastGappedStats* gapped_stats,
3436 Boolean * fence_hit)
3437 {
3438 Int4 index;
3439 BlastInitHSP* init_hsp = NULL;
3440 BlastInitHSP* init_hsp_array;
3441 Int4 q_start, s_start, q_end, s_end;
3442 Boolean is_prot;
3443 Boolean* restricted_align_array = NULL; /* if allocated, restricted
3444 alignment option per query */
3445 Boolean is_greedy;
3446 Boolean is_rpsblast = Blast_ProgramIsRpsBlast(program_number);
3447 Int4 max_offset;
3448 Int4 rps_cutoff_index = 0;
3449 Int2 status = 0;
3450 BlastHSPList* hsp_list = NULL;
3451 const BlastHitSavingOptions* hit_options = hit_params->options;
3452 BLAST_SequenceBlk query_tmp;
3453 Int4 context;
3454 BlastIntervalTree *tree;
3455 Int4 score;
3456 Int4* found_high_score = NULL;
3457 Int4 **rpsblast_pssms = NULL; /* Pointer to concatenated PSSMs in
3458 RPS-BLAST database */
3459 const int kHspNumMax = BlastHspNumMax(TRUE, hit_options);
3460 const double kRestrictedMult = 0.68;/* fraction of the ordinary cutoff score
3461 used for approximate gapped alignment */
3462 Int4 redo_index = -1; /* these are used for recomputing alignmnets if the */
3463 Int4 redo_query = -1; /* approximate alignment score is inconclusive */
3464
3465 if (!query || !subject || !gap_align || !score_params || !ext_params ||
3466 !hit_params || !init_hitlist || !hsp_list_ptr)
3467 return 1;
3468
3469 if (init_hitlist->total == 0)
3470 return 0;
3471
3472 is_prot = (program_number != eBlastTypeBlastn &&
3473 program_number != eBlastTypePhiBlastn &&
3474 program_number != eBlastTypeMapping);
3475 is_greedy = (ext_params->options->ePrelimGapExt == eGreedyScoreOnly);
3476
3477 /* turn on approximate gapped alignment if 1) the search
3478 specifies it, and 2) the first, highest-scoring ungapped
3479 alignment in init_hitlist scores below a reduced cutoff.
3480 The second condition is used to avoid computing approximate
3481 alignments that experiment shows are likely to be thrown
3482 away and recomputed optimally */
3483
3484 if (hit_params->restricted_align && !score_params->options->is_ooframe) {
3485 int i;
3486 Boolean* found = calloc(query_info->last_context + 1, sizeof(Boolean));
3487
3488 restricted_align_array =
3489 (Boolean*)calloc(Blast_GetQueryIndexFromContext(
3490 query_info->last_context,
3491 program_number) + 1,
3492 sizeof(Boolean));
3493
3494 /* find the first ungapped alignment for each query and determine
3495 whetherto use approximate gapped alignment for the query */
3496 for (i = 0;i < init_hitlist->total;i++) {
3497 int contxt; /* context */
3498 int query_index = -1;
3499 Int4 q_off;
3500
3501 init_hsp = &init_hitlist->init_hsp_array[i];
3502 /* find query index */
3503 q_off = init_hsp->offsets.qs_offsets.q_off;
3504 for (contxt = query_info->first_context;
3505 contxt <= query_info->last_context; contxt++) {
3506
3507 if (q_off >= query_info->contexts[contxt].query_offset &&
3508 q_off < query_info->contexts[contxt].query_offset +
3509 query_info->contexts[contxt].query_length) {
3510
3511 query_index = Blast_GetQueryIndexFromContext(contxt,
3512 program_number);
3513 break;
3514 }
3515 }
3516 ASSERT(query_index >= 0);
3517
3518 /* if this is not the first initial hit for the query, disregard it */
3519 if (found[query_index]) {
3520 continue;
3521 }
3522
3523 found[query_index] = TRUE;
3524
3525 /* use approximate gapped alignment if the highest scoring ungapped
3526 alignment scores below the reduced cutoff */
3527 if (init_hsp->ungapped_data && init_hsp->ungapped_data->score <
3528 (Int4)(kRestrictedMult *
3529 hit_params->cutoffs[contxt].cutoff_score)) {
3530
3531 restricted_align_array[query_index] = TRUE;
3532 }
3533 else {
3534 restricted_align_array[query_index] = FALSE;
3535 }
3536 }
3537 if (found) {
3538 free(found);
3539 }
3540 }
3541
3542 if (is_rpsblast) {
3543 rpsblast_pssms = gap_align->sbp->psi_matrix->pssm->data;
3544 if (program_number == eBlastTypeRpsBlast)
3545 rps_cutoff_index = subject->oid;
3546 else
3547 rps_cutoff_index = subject->oid * NUM_FRAMES +
3548 BLAST_FrameToContext(subject->frame,
3549 program_number);
3550 }
3551
3552 ASSERT(Blast_InitHitListIsSortedByScore(init_hitlist));
3553
3554 if (*hsp_list_ptr == NULL)
3555 *hsp_list_ptr = hsp_list = Blast_HSPListNew(kHspNumMax);
3556 else
3557 hsp_list = *hsp_list_ptr;
3558
3559 /* Initialize the interval tree with the maximum possible
3560 query and subject offsets. For query sequences this is always
3561 query->length, and for subject sequences it is subject->length
3562 except for out-of-frame tblastn. In that case, subject->length
3563 is the length of one strand of the subject sequence, but the
3564 subject offsets of each ungapped alignment are wrt all six subject
3565 frames concatenated together. Finally, add 1 to the maximum,
3566 to account for HSPs that include the end of a sequence */
3567
3568 if (Blast_SubjectIsTranslated(program_number) &&
3569 score_params->options->is_ooframe) {
3570 /* OOF search only supported for tblastn and blastx. (blastx
3571 does not have a translated subject, so can't get here.) */
3572 ASSERT(program_number == eBlastTypeTblastn);
3573
3574 tree = Blast_IntervalTreeInit(0, query->length+1,
3575 0, 2*(subject->length + CODON_LENGTH)+1);
3576 }
3577 else {
3578 tree = Blast_IntervalTreeInit(0, query->length+1,
3579 0, subject->length+1);
3580 }
3581 if (!tree)
3582 return BLASTERR_MEMORY;
3583
3584
3585 init_hsp_array = init_hitlist->init_hsp_array;
3586 found_high_score = (Int4*) calloc(query_info->num_queries, sizeof(Int4));
3587 if (hit_params->low_score)
3588 {
3589 for (index=0; index<init_hitlist->total; index++)
3590 {
3591 int query_index =
3592 Blast_GetQueryIndexFromQueryOffset(init_hsp_array[index].offsets.qs_offsets.q_off, program_number, query_info);
3593 if (init_hsp_array[index].ungapped_data->score > hit_params->low_score[query_index])
3594 found_high_score[query_index] = 1;
3595 }
3596 }
3597
3598 for (index=0; index<init_hitlist->total; index++)
3599 {
3600 BlastHSP tmp_hsp;
3601 BlastInitHSP tmp_init_hsp;
3602 BlastUngappedData tmp_ungapped_data;
3603 int query_index=0;
3604 Boolean restricted_alignment = FALSE;
3605
3606 /* make a local copy of the initial HSP */
3607
3608 tmp_init_hsp = init_hsp_array[index];
3609 if (tmp_init_hsp.ungapped_data) {
3610 tmp_ungapped_data = *(init_hsp_array[index].ungapped_data);
3611 tmp_init_hsp.ungapped_data = &tmp_ungapped_data;
3612 }
3613 init_hsp = &tmp_init_hsp;
3614
3615 s_AdjustHspOffsetsAndGetQueryData(query, query_info, init_hsp,
3616 &query_tmp, &context);
3617
3618 query_index = Blast_GetQueryIndexFromContext(context, program_number);
3619
3620 /* If redo_index > -1 and redo_query > -1, the main loop is recomputing
3621 gappaed alignments for a single query, becuase the approximate
3622 alignment score was inconclusive. This recomputing was triggered when
3623 index was equal to redo_index. Until index reaches redo_index again,
3624 skip all concatenated queries with index different from redo_query */
3625
3626 if (index < redo_index && query_index != redo_query) {
3627 continue;
3628 }
3629
3630 if (restricted_align_array && restricted_align_array[query_index]) {
3631 restricted_alignment = TRUE;
3632 }
3633
3634 if (hit_params->low_score)
3635 {
3636 if (!found_high_score[query_index])
3637 continue;
3638 }
3639
3640 if (rpsblast_pssms)
3641 gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms +
3642 query_info->contexts[context].query_offset;
3643
3644 if (!init_hsp->ungapped_data) {
3645 q_start = q_end = init_hsp->offsets.qs_offsets.q_off;
3646 s_start = s_end = init_hsp->offsets.qs_offsets.s_off;
3647 score = INT4_MIN;
3648 } else {
3649 q_start = init_hsp->ungapped_data->q_start;
3650 q_end = q_start + init_hsp->ungapped_data->length;
3651 s_start = init_hsp->ungapped_data->s_start;
3652 s_end = s_start + init_hsp->ungapped_data->length;
3653 score = init_hsp->ungapped_data->score;
3654 }
3655
3656 tmp_hsp.score = score;
3657 tmp_hsp.context = context;
3658 tmp_hsp.query.offset = q_start;
3659 tmp_hsp.query.end = q_end;
3660 tmp_hsp.query.frame = query_info->contexts[context].frame;
3661 tmp_hsp.subject.offset = s_start;
3662 tmp_hsp.subject.end = s_end;
3663 tmp_hsp.subject.frame = subject->frame;
3664
3665 /* use priate interval tree when recomputing alignments */
3666 if (!BlastIntervalTreeContainsHSP(tree, &tmp_hsp, query_info,
3667 hit_options->min_diag_separation))
3668 {
3669 BlastHSP* new_hsp;
3670 Int4 cutoff, restricted_cutoff = 0;
3671
3672 if (is_rpsblast)
3673 cutoff = hit_params->cutoffs[rps_cutoff_index].cutoff_score;
3674 else
3675 cutoff = hit_params->cutoffs[context].cutoff_score;
3676
3677 if (restricted_alignment)
3678 restricted_cutoff = (Int4)(kRestrictedMult * cutoff);
3679
3680 if (gapped_stats)
3681 ++gapped_stats->extensions;
3682
3683 if(is_prot && !score_params->options->is_ooframe) {
3684 max_offset =
3685 BlastGetStartForGappedAlignment(query_tmp.sequence,
3686 subject->sequence, gap_align->sbp,
3687 init_hsp->ungapped_data->q_start,
3688 init_hsp->ungapped_data->length,
3689 init_hsp->ungapped_data->s_start,
3690 init_hsp->ungapped_data->length);
3691 init_hsp->offsets.qs_offsets.s_off +=
3692 max_offset - init_hsp->offsets.qs_offsets.q_off;
3693 init_hsp->offsets.qs_offsets.q_off = max_offset;
3694 }
3695
3696 if (is_prot) {
3697 status = s_BlastProtGappedAlignment(program_number, &query_tmp,
3698 subject, gap_align,
3699 score_params, init_hsp,
3700 restricted_alignment,
3701 fence_hit);
3702
3703 if (restricted_alignment &&
3704 gap_align->score < cutoff &&
3705 gap_align->score >= restricted_cutoff) {
3706
3707 /* the alignment score is inconclusive; it may really be
3708 low-scoring, or the approximate gapped alignment
3709 could be highly suboptimal. Recomputing the alignment
3710 would resolve the ambiguity, but when the final list of
3711 HSPs contains a mix of approximate and exact gapped
3712 alignments it's possible for a highly suboptimal
3713 alignment to make it to the traceback phase, where
3714 its start point is reused. We sidestep this problem
3715 by throwing away all the previously computed gapped
3716 alignments and recomputing them exactly. */
3717
3718 Int4 index2;
3719 BlastHSPList* new_hsp_list = Blast_HSPListNew(kHspNumMax);
3720 ASSERT(restricted_align_array);
3721 restricted_align_array[query_index] = FALSE;
3722
3723 Blast_IntervalTreeReset(tree);
3724
3725 /* remove all HSPs computed for the current query */
3726 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
3727 Int4 context2 = hsp_list->hsp_array[index2]->context;
3728 Int4 q_index2 = Blast_GetQueryIndexFromContext(context2,
3729 program_number);
3730
3731 if (q_index2 != query_index) {
3732
3733 Blast_HSPListSaveHSP(new_hsp_list,
3734 hsp_list->hsp_array[index2]);
3735
3736 status = BlastIntervalTreeAddHSP(
3737 hsp_list->hsp_array[index2],
3738 tree,
3739 query_info,
3740 eQueryAndSubject);
3741
3742 hsp_list->hsp_array[index2] = NULL;
3743
3744 }
3745 else {
3746 hsp_list->hsp_array[index2] =
3747 Blast_HSPFree(hsp_list->hsp_array[index2]);
3748 }
3749 }
3750 Blast_HSPListFree(hsp_list);
3751 hsp_list = new_hsp_list;
3752
3753 /* restart the loop over initial hits to recompute
3754 alignments for the current query */
3755 redo_index = index;
3756 redo_query = query_index;
3757 index = -1;
3758 continue;
3759 }
3760 } else if (is_greedy) {
3761 if (init_hsp->ungapped_data) {
3762 init_hsp->offsets.qs_offsets.q_off =
3763 init_hsp->ungapped_data->q_start + init_hsp->ungapped_data->length/2;
3764 init_hsp->offsets.qs_offsets.s_off =
3765 init_hsp->ungapped_data->s_start + init_hsp->ungapped_data->length/2;
3766 }
3767 status = BLAST_GreedyGappedAlignment(
3768 query_tmp.sequence, subject->sequence,
3769 query_tmp.length, subject->length,
3770 gap_align, score_params,
3771 init_hsp->offsets.qs_offsets.q_off,
3772 init_hsp->offsets.qs_offsets.s_off,
3773 (Boolean) TRUE, FALSE, fence_hit);
3774 /* override the alignment start point with the estimate
3775 by the aligner of the best start point */
3776 init_hsp->offsets.qs_offsets.q_off =
3777 gap_align->greedy_query_seed_start;
3778 init_hsp->offsets.qs_offsets.s_off =
3779 gap_align->greedy_subject_seed_start;
3780 } else {
3781 /* Assuming the ungapped alignment is long enough to
3782 contain an 8-letter seed of exact matches, start
3783 the gapped alignment inside the first byte of the
3784 seed that contains all matches. Note that even if
3785 the ungapped alignment is not long enough,
3786 s_BlastDynProgNtGappedAlignment will still round
3787 the start point up to the first 4-base boundary
3788 on the subject sequence */
3789 if (s_end >= (Int4)init_hsp->offsets.qs_offsets.s_off + 8) {
3790 init_hsp->offsets.qs_offsets.s_off += 3;
3791 init_hsp->offsets.qs_offsets.q_off += 3;
3792 }
3793 status = s_BlastDynProgNtGappedAlignment(&query_tmp, subject,
3794 gap_align, score_params, init_hsp);
3795 }
3796
3797 if (status) {
3798 if (rpsblast_pssms) {
3799 gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3800 }
3801 sfree(found_high_score);
3802 tree = Blast_IntervalTreeFree(tree);
3803 return status;
3804 }
3805
3806 if (gap_align->score >= cutoff) {
3807 Int2 query_frame = 0;
3808 /* For mixed-frame search, the query frame is determined
3809 from the offset, not only from context. */
3810 if (score_params->options->is_ooframe &&
3811 program_number == eBlastTypeBlastx) {
3812 query_frame = gap_align->query_start % CODON_LENGTH + 1;
3813 if ((context % NUM_FRAMES) >= CODON_LENGTH)
3814 query_frame = -query_frame;
3815 } else {
3816 query_frame = query_info->contexts[context].frame;
3817 }
3818
3819 status = Blast_HSPInit(gap_align->query_start,
3820 gap_align->query_stop, gap_align->subject_start,
3821 gap_align->subject_stop,
3822 init_hsp->offsets.qs_offsets.q_off,
3823 init_hsp->offsets.qs_offsets.s_off, context,
3824 query_frame, subject->frame, gap_align->score,
3825 &(gap_align->edit_script), &new_hsp);
3826 if (status)
3827 {
3828 sfree(found_high_score);
3829 tree = Blast_IntervalTreeFree(tree);
3830 return status;
3831 }
3832 status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
3833 if (status)
3834 break;
3835 status = BlastIntervalTreeAddHSP(new_hsp, tree, query_info,
3836 eQueryAndSubject);
3837 if (status)
3838 break;
3839 }
3840 else {
3841 /* a greedy alignment may have traceback associated with it;
3842 free that traceback if the alignment will not be used */
3843 if (is_greedy) {
3844 gap_align->edit_script = GapEditScriptDelete(
3845 gap_align->edit_script);
3846 }
3847 }
3848 }
3849 }
3850
3851 sfree(found_high_score);
3852 if (restricted_align_array) {
3853 sfree(restricted_align_array);
3854 }
3855 tree = Blast_IntervalTreeFree(tree);
3856 if (rpsblast_pssms) {
3857 gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3858 }
3859
3860 *hsp_list_ptr = hsp_list;
3861 return status;
3862 }
3863
3864 /** Out-of-frame gapped alignment wrapper function.
3865 * @param query Query sequence [in]
3866 * @param subject Subject sequence [in]
3867 * @param q_off Offset in query [in]
3868 * @param s_off Offset in subject [in]
3869 * @param private_q_start Extent of alignment in query [out]
3870 * @param private_s_start Extent of alignment in subject [out]
3871 * @param score_only Return score only, without traceback [in]
3872 * @param edit_block Structure to hold generated traceback [out]
3873 * @param gap_align Gapped alignment information and preallocated
3874 * memory [in] [out]
3875 * @param score_params Scoring parameters [in]
3876 * @param psi_offset Starting position in PSI-BLAST matrix [in]
3877 * @param reversed Direction of the extension [in]
3878 * @param switch_seq Sequences need to be switched for blastx,
3879 * but not for tblastn [in]
3880 */
3881 static Int4
s_OutOfFrameSemiGappedAlignWrap(const Uint1 * query,const Uint1 * subject,Int4 q_off,Int4 s_off,Int4 * private_q_start,Int4 * private_s_start,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 psi_offset,Boolean reversed,Boolean switch_seq)3882 s_OutOfFrameSemiGappedAlignWrap(const Uint1* query, const Uint1* subject, Int4 q_off,
3883 Int4 s_off, Int4* private_q_start, Int4* private_s_start,
3884 Boolean score_only, GapPrelimEditBlock *edit_block,
3885 BlastGapAlignStruct* gap_align,
3886 const BlastScoringParameters* score_params, Int4 psi_offset,
3887 Boolean reversed, Boolean switch_seq)
3888 {
3889 if (switch_seq) {
3890 return s_OutOfFrameGappedAlign(subject, query, s_off, q_off,
3891 private_s_start, private_q_start, score_only, edit_block,
3892 gap_align, score_params, psi_offset, reversed);
3893 } else {
3894 return s_OutOfFrameGappedAlign(query, subject, q_off, s_off,
3895 private_q_start, private_s_start, score_only, edit_block,
3896 gap_align, score_params, psi_offset, reversed);
3897 }
3898 }
3899
3900 /** Maximal subject length after which the offsets are adjusted to a
3901 * subsequence.
3902 */
3903 #define MAX_SUBJECT_OFFSET 90000
3904 /** Approximate upper bound on a number of gaps in an HSP, needed to determine
3905 * the length of the subject subsequence to be retrieved for alignment with
3906 * traceback.
3907 */
3908 #define MAX_TOTAL_GAPS 3000
3909
3910 void
AdjustSubjectRange(Int4 * subject_offset_ptr,Int4 * subject_length_ptr,Int4 query_offset,Int4 query_length,Int4 * start_shift)3911 AdjustSubjectRange(Int4* subject_offset_ptr, Int4* subject_length_ptr,
3912 Int4 query_offset, Int4 query_length, Int4* start_shift)
3913 {
3914 Int4 s_offset;
3915 Int4 subject_length = *subject_length_ptr;
3916 Int4 max_extension_left, max_extension_right;
3917
3918 /* If subject sequence is not too long, leave everything as is */
3919 if (subject_length < MAX_SUBJECT_OFFSET) {
3920 *start_shift = 0;
3921 return;
3922 }
3923
3924 s_offset = *subject_offset_ptr;
3925 /* Maximal extension length is the remaining length in the query, plus
3926 an estimate of a maximal total number of gaps. */
3927 max_extension_left = query_offset + MAX_TOTAL_GAPS;
3928 max_extension_right = query_length - query_offset + MAX_TOTAL_GAPS;
3929
3930 if (s_offset <= max_extension_left) {
3931 *start_shift = 0;
3932 } else {
3933 *start_shift = s_offset - max_extension_left;
3934 *subject_offset_ptr = max_extension_left;
3935 }
3936
3937 *subject_length_ptr =
3938 MIN(subject_length, s_offset + max_extension_right) - *start_shift;
3939 }
3940
3941 /** Performs gapped extension for protein sequences, given two
3942 * sequence blocks, scoring and extension options, and an initial HSP
3943 * with information from the previously performed ungapped extension
3944 * @param program BLAST program [in]
3945 * @param query_blk The query sequence block [in]
3946 * @param subject_blk The subject sequence block [in]
3947 * @param gap_align The auxiliary structure for gapped alignment [in]
3948 * @param score_params Parameters related to scoring [in]
3949 * @param init_hsp The initial HSP information [in]
3950 * @param restricted_alignment If true and search is not out-of-frame,
3951 * use a faster approximate gapped alignment algorithm [in]
3952 * @param fence_hit If not NULL, pointer to a boolean that is set to
3953 * TRUE if gapped extension reaches a neighborhood
3954 * of the subject sequence that is not initialized [in][out]
3955 */
3956 static Int2
s_BlastProtGappedAlignment(EBlastProgramType program,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp,Boolean restricted_alignment,Boolean * fence_hit)3957 s_BlastProtGappedAlignment(EBlastProgramType program,
3958 BLAST_SequenceBlk* query_blk, BLAST_SequenceBlk* subject_blk,
3959 BlastGapAlignStruct* gap_align,
3960 const BlastScoringParameters* score_params, BlastInitHSP* init_hsp,
3961 Boolean restricted_alignment,
3962 Boolean * fence_hit)
3963 {
3964 Boolean found_start, found_end;
3965 Int4 q_length=0, s_length=0, score_right, score_left;
3966 Int4 private_q_start, private_s_start;
3967 Uint1* query=NULL,* subject=NULL;
3968 Boolean switch_seq = FALSE;
3969 Int4 query_length = query_blk->length;
3970 Int4 subject_length = subject_blk->length;
3971 Int4 subject_shift = 0;
3972 BlastScoringOptions *score_options = score_params->options;
3973
3974 if (gap_align == NULL)
3975 return -1;
3976
3977 if (score_options->is_ooframe) {
3978 ASSERT(program == eBlastTypeTblastn || program == eBlastTypeBlastx);
3979
3980 q_length = init_hsp->offsets.qs_offsets.q_off;
3981 /* For negative subject frames, make subject offset relative to the part
3982 of the mixed-frame sequence corresponding to the reverse strand. */
3983 if (program == eBlastTypeTblastn && subject_blk->frame < 0)
3984 init_hsp->offsets.qs_offsets.s_off -= subject_length + 1;
3985
3986 s_length = init_hsp->offsets.qs_offsets.s_off;
3987
3988 if (program == eBlastTypeBlastx) {
3989 subject = subject_blk->sequence + s_length;
3990 query = query_blk->oof_sequence + CODON_LENGTH + q_length;
3991 query_length -= CODON_LENGTH - 1;
3992 switch_seq = TRUE;
3993 } else if (program == eBlastTypeTblastn) {
3994 subject = subject_blk->oof_sequence + CODON_LENGTH + s_length;
3995 query = query_blk->sequence + q_length;
3996 subject_length -= CODON_LENGTH - 1;
3997 }
3998 } else {
3999 q_length = init_hsp->offsets.qs_offsets.q_off + 1;
4000 s_length = init_hsp->offsets.qs_offsets.s_off + 1;
4001 query = query_blk->sequence;
4002 subject = subject_blk->sequence;
4003 }
4004
4005 AdjustSubjectRange(&s_length, &subject_length, q_length, query_length,
4006 &subject_shift);
4007
4008 found_start = FALSE;
4009 found_end = FALSE;
4010
4011 /* Looking for "left" score */
4012 score_left = 0;
4013 if (q_length != 0 && s_length != 0) {
4014 found_start = TRUE;
4015 if(score_options->is_ooframe) {
4016 score_left =
4017 s_OutOfFrameSemiGappedAlignWrap(query, subject, q_length, s_length,
4018 &private_q_start, &private_s_start, TRUE, NULL,
4019 gap_align, score_params, q_length, TRUE, switch_seq);
4020 } else {
4021 if (restricted_alignment) {
4022 score_left = s_RestrictedGappedAlign(query, subject+subject_shift,
4023 q_length, s_length,
4024 &private_q_start, &private_s_start,
4025 gap_align, score_params,
4026 init_hsp->offsets.qs_offsets.q_off, TRUE);
4027 }
4028 else {
4029 score_left = Blast_SemiGappedAlign(query, subject+subject_shift,
4030 q_length, s_length,
4031 &private_q_start, &private_s_start, TRUE,
4032 NULL, gap_align, score_params,
4033 init_hsp->offsets.qs_offsets.q_off,
4034 FALSE, TRUE,
4035 fence_hit);
4036 }
4037 }
4038 gap_align->query_start = q_length - private_q_start;
4039 gap_align->subject_start = s_length - private_s_start + subject_shift;
4040 }
4041
4042 score_right = 0;
4043 if (q_length < query_length && s_length < subject_length) {
4044 found_end = TRUE;
4045 if(score_options->is_ooframe) {
4046 score_right =
4047 s_OutOfFrameSemiGappedAlignWrap(query-1, subject-1,
4048 query_length-q_length+1, subject_length-s_length+1,
4049 &(gap_align->query_stop), &(gap_align->subject_stop),
4050 TRUE, NULL, gap_align, score_params, q_length, FALSE,
4051 switch_seq);
4052 gap_align->query_stop += q_length;
4053 gap_align->subject_stop += s_length + subject_shift;
4054 } else {
4055 if (restricted_alignment) {
4056 score_right = s_RestrictedGappedAlign(
4057 query+init_hsp->offsets.qs_offsets.q_off,
4058 subject+init_hsp->offsets.qs_offsets.s_off,
4059 query_length-q_length,
4060 subject_length-s_length,
4061 &(gap_align->query_stop),
4062 &(gap_align->subject_stop),
4063 gap_align, score_params,
4064 init_hsp->offsets.qs_offsets.q_off, FALSE);
4065 }
4066 else {
4067 score_right = Blast_SemiGappedAlign(
4068 query+init_hsp->offsets.qs_offsets.q_off,
4069 subject+init_hsp->offsets.qs_offsets.s_off,
4070 query_length-q_length,
4071 subject_length-s_length,
4072 &(gap_align->query_stop),
4073 &(gap_align->subject_stop),
4074 TRUE, NULL, gap_align, score_params,
4075 init_hsp->offsets.qs_offsets.q_off, FALSE,
4076 FALSE,
4077 fence_hit);
4078 }
4079 /* Make end offsets point to the byte after the end of the
4080 alignment */
4081 gap_align->query_stop += init_hsp->offsets.qs_offsets.q_off + 1;
4082 gap_align->subject_stop += init_hsp->offsets.qs_offsets.s_off + 1;
4083 }
4084 }
4085
4086 if (found_start == FALSE) { /* impossible for in-frame */
4087 gap_align->query_start = init_hsp->offsets.qs_offsets.q_off;
4088 gap_align->subject_start = init_hsp->offsets.qs_offsets.s_off;
4089 }
4090 if (found_end == FALSE) { /* impossible for out-of-frame */
4091 gap_align->query_stop = init_hsp->offsets.qs_offsets.q_off;
4092 gap_align->subject_stop = init_hsp->offsets.qs_offsets.s_off;
4093 }
4094
4095 gap_align->score = score_right+score_left;
4096
4097 return 0;
4098 }
4099
4100 /** Converts OOF traceback from a gapped alignment to a GapEditScript.
4101 * This function is for out-of-frame gapping.
4102 * @param rev_prelim_tback Traceback operations for left extension [in]
4103 * @param fwd_prelim_tback Traceback operations for right extension [in]
4104 * @param nucl_align_length Length of alignment on the translated
4105 * nucleotide sequence [in]
4106 * @param edit_script_ptr The resulting edit block [out]
4107 * @return Always zero
4108 */
4109 static Int2
s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback,Int4 nucl_align_length,GapEditScript ** edit_script_ptr)4110 s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock *rev_prelim_tback,
4111 GapPrelimEditBlock *fwd_prelim_tback,
4112 Int4 nucl_align_length,
4113 GapEditScript** edit_script_ptr)
4114 {
4115 GapEditScript* e_script;
4116 EGapAlignOpType last_op;
4117 Int4 last_num;
4118 GapPrelimEditBlock *tmp_prelim_tback;
4119 Int4 i, num_nuc;
4120 int extra_needed=0;
4121
4122 /* prepend a substitution, since the input sequences were
4123 shifted prior to the OOF alignment */
4124
4125 tmp_prelim_tback = GapPrelimEditBlockNew();
4126 last_op = eGapAlignSub;
4127 last_num = 1;
4128
4129 /* Propagate the extra substitution through the edit script. */
4130
4131 for (i = 0; i < rev_prelim_tback->num_ops; i++) {
4132
4133 EGapAlignOpType next_op = rev_prelim_tback->edit_ops[i].op_type;
4134 Int4 next_num = rev_prelim_tback->edit_ops[i].num;
4135
4136 if (next_op == last_op) {
4137
4138 /* merge consecutive operations that are identical */
4139
4140 last_num += next_num;
4141 }
4142 else if (next_op == eGapAlignIns || next_op == eGapAlignDel) {
4143
4144 /* Propagating the initial extra substitution requires
4145 that edit operations which are not in-frame gaps must
4146 'flow through' an in-frame gap; the last non-gap
4147 operation before the gap will appear after the gap.
4148 This could mean either one or two edit blocks
4149 getting created */
4150
4151 if (last_num > 1) {
4152 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num - 1);
4153 }
4154 GapPrelimEditBlockAdd(tmp_prelim_tback, next_op, next_num);
4155 last_num = 1; /* last_op unchanged; one 'orphan' now */
4156 }
4157 else {
4158
4159 /* write out the last edit operation and
4160 move forward in the script */
4161
4162 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4163 last_op = next_op;
4164 last_num = next_num;
4165 }
4166 }
4167
4168 /* Handle the crossing over from the left extension to
4169 the right extension. Begin by writing out the final
4170 group of ops from the left extension, except for the
4171 one op that must be merged */
4172
4173 last_num--;
4174 if (last_num > 0) {
4175 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4176 }
4177
4178 if (last_op != eGapAlignSub) {
4179
4180 /* the last traceback letter from the left extension must
4181 be folded into the right extension. */
4182
4183 for (i = fwd_prelim_tback->num_ops - 1; i >= 0; i--) {
4184
4185 GapPrelimEditScript *new_script = fwd_prelim_tback->edit_ops + i;
4186
4187 if (new_script->op_type == eGapAlignIns ||
4188 new_script->op_type == eGapAlignDel) {
4189
4190 /* do not merge with in-frame gaps; just write out
4191 the gaps and 'flow through' the last op */
4192
4193 GapPrelimEditBlockAdd(tmp_prelim_tback, new_script->op_type,
4194 new_script->num);
4195 }
4196 else {
4197
4198 /* merge with the first letter of new_script */
4199
4200 last_op += new_script->op_type - eGapAlignSub;
4201 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, 1);
4202
4203 /* borrow one letter from new_script to complete the
4204 merge, and skip new_script if it only had one letter */
4205 new_script->num--;
4206 if (new_script->num == 0)
4207 i--;
4208 break;
4209 }
4210 }
4211 fwd_prelim_tback->num_ops = i + 1;
4212 }
4213
4214 /* form the final edit block */
4215 e_script = Blast_PrelimEditBlockToGapEditScript(tmp_prelim_tback, fwd_prelim_tback);
4216 GapPrelimEditBlockFree(tmp_prelim_tback);
4217
4218 /* postprocess the edit script */
4219 num_nuc = 0;
4220 for (i=0; i<e_script->size; i++)
4221 {
4222 int total_actions=0;
4223 /* Count the number of nucleotides in the next
4224 traceback operation and delete any traceback operations
4225 that would make the alignment too long. This check is
4226 not needed for in-frame alignment because the
4227 traceback is never changed after it is computed */
4228
4229 last_op = e_script->op_type[i];
4230
4231 if (last_op == eGapAlignIns)
4232 last_op = eGapAlignSub;
4233
4234 total_actions = last_op * e_script->num[i];
4235
4236 if (num_nuc + total_actions >= nucl_align_length) {
4237 e_script->num[i] = (nucl_align_length - num_nuc +
4238 last_op - 1) / last_op;
4239 break; /* We delete the rest of the script. */
4240 }
4241 else {
4242 num_nuc += total_actions;;
4243 }
4244 }
4245 e_script->size = MIN(i+1, e_script->size); /* If we broke out early then we truncate the edit script. */
4246
4247 extra_needed = 0;
4248 for (i=0; i<e_script->size; i++)
4249 {
4250 if (e_script->op_type[i] % 3 != 0 && e_script->num[i] > 1) {
4251 extra_needed += e_script->num[i] - 1;
4252 }
4253 }
4254
4255 if (extra_needed)
4256 {
4257 GapEditScript* new_esp = GapEditScriptNew(extra_needed+e_script->size);
4258 int new_esp_i=0;
4259 for (i=0; i<e_script->size; i++)
4260 {
4261 /* Only one frame shift operation at a time is
4262 allowed in a single edit script element. */
4263 new_esp->num[new_esp_i] = e_script->num[i];
4264 new_esp->op_type[new_esp_i] = e_script->op_type[i];
4265 new_esp_i++;
4266 last_op = e_script->op_type[i];
4267 if (last_op % 3 != 0 && e_script->num[i] > 1) {
4268 Int4 num_ops = e_script->num[i];
4269 int esp_index=0;
4270 new_esp->num[new_esp_i-1] = 1;
4271 for (esp_index = 1; esp_index < num_ops; esp_index++) {
4272 new_esp->num[new_esp_i] = 1;
4273 new_esp->op_type[new_esp_i] = last_op;
4274 new_esp_i++;
4275 }
4276 }
4277 }
4278 e_script = GapEditScriptDelete(e_script);
4279 e_script = new_esp;
4280 }
4281 *edit_script_ptr = e_script;
4282
4283 /* finally, add one to the size of any block of substitutions
4284 that follows a frame shift op */
4285 last_op = e_script->op_type[0];
4286 for (i=1; i<e_script->size; i++)
4287 {
4288 if (e_script->op_type[i] == eGapAlignSub && (last_op % 3) != 0)
4289 (e_script->num[i])++;
4290
4291 last_op = e_script->op_type[i];
4292 }
4293
4294 return 0;
4295 }
4296
BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,const Uint1 * query,const Uint1 * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_start,Int4 s_start,Int4 query_length,Int4 subject_length,Boolean * fence_hit)4297 Int2 BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,
4298 const Uint1* query, const Uint1* subject, BlastGapAlignStruct* gap_align,
4299 const BlastScoringParameters* score_params,
4300 Int4 q_start, Int4 s_start, Int4 query_length, Int4 subject_length,
4301 Boolean * fence_hit)
4302 {
4303 Boolean found_end;
4304 Int4 score_right, score_left, private_q_length, private_s_length;
4305 Int4 q_length, s_length;
4306 Boolean is_ooframe = score_params->options->is_ooframe;
4307 Int2 status = 0;
4308 Boolean switch_seq = FALSE;
4309 GapPrelimEditBlock *fwd_prelim_tback;
4310 GapPrelimEditBlock *rev_prelim_tback;
4311
4312 if (gap_align == NULL)
4313 return -1;
4314
4315 fwd_prelim_tback = gap_align->fwd_prelim_tback;
4316 rev_prelim_tback = gap_align->rev_prelim_tback;
4317 GapPrelimEditBlockReset(fwd_prelim_tback);
4318 GapPrelimEditBlockReset(rev_prelim_tback);
4319
4320 found_end = FALSE;
4321
4322 q_length = query_length;
4323 s_length = subject_length;
4324 if (is_ooframe) {
4325 /* The mixed frame sequence is shifted to the 3rd position, so its
4326 maximal available length for extension is less by 2 than its
4327 total length. */
4328 switch_seq = (program == eBlastTypeBlastx);
4329 if (switch_seq) {
4330 q_length -= CODON_LENGTH - 1;
4331 } else {
4332 s_length -= CODON_LENGTH - 1;
4333 }
4334 }
4335
4336 score_left = 0;
4337
4338 if(is_ooframe) {
4339 /* NB: Left extension does not include starting point corresponding
4340 to the offset pair; the right extension does. */
4341 score_left =
4342 s_OutOfFrameSemiGappedAlignWrap(query+q_start, subject+s_start,
4343 q_start, s_start, &private_q_length, &private_s_length,
4344 FALSE, rev_prelim_tback, gap_align, score_params, q_start,
4345 TRUE, switch_seq);
4346 gap_align->query_start = q_start - private_q_length;
4347 gap_align->subject_start = s_start - private_s_length;
4348 } else {
4349 /* NB: The left extension includes the starting point
4350 [q_start,s_start]; the right extension does not. */
4351 // <AG> score_left =
4352 // Blast_SemiGappedAlign(query, subject, q_start+1, s_start+1,
4353 // &private_q_length, &private_s_length, FALSE, rev_prelim_tback,
4354 // gap_align, score_params, q_start, FALSE, TRUE,
4355 // fence_hit);
4356
4357 score_left = ALIGN_EX(query, subject, q_start+1, s_start+1, &private_q_length, &private_s_length, rev_prelim_tback,
4358 gap_align,
4359 score_params, q_start, FALSE /*reversed*/, TRUE /*reverse_sequence*/,
4360 fence_hit);
4361
4362 gap_align->query_start = q_start - private_q_length + 1;
4363 gap_align->subject_start = s_start - private_s_length + 1;
4364 }
4365
4366 score_right = 0;
4367
4368 if ((! (fence_hit && *fence_hit)) &&
4369 (q_start < q_length) &&
4370 (s_start < s_length)) {
4371
4372 found_end = TRUE;
4373 if(is_ooframe) {
4374 score_right =
4375 s_OutOfFrameSemiGappedAlignWrap(query+q_start-1, subject+s_start-1,
4376 q_length-q_start, s_length-s_start,
4377 &private_q_length, &private_s_length, FALSE, fwd_prelim_tback,
4378 gap_align, score_params, q_start, FALSE, switch_seq);
4379 } else {
4380 // <AG> score_right =
4381 // Blast_SemiGappedAlign(query+q_start, subject+s_start,
4382 // q_length-q_start-1, s_length-s_start-1, &private_q_length,
4383 // &private_s_length, FALSE, fwd_prelim_tback, gap_align,
4384 // score_params, q_start, FALSE, FALSE,
4385 // fence_hit);
4386 score_right =
4387 ALIGN_EX(query+q_start, subject+s_start,
4388 q_length-q_start-1, s_length-s_start-1, &private_q_length,
4389 &private_s_length, fwd_prelim_tback, gap_align,
4390 score_params, q_start, FALSE, FALSE,
4391 fence_hit);
4392 }
4393
4394 gap_align->query_stop = q_start + private_q_length + 1;
4395 gap_align->subject_stop = s_start + private_s_length + 1;
4396 }
4397
4398 if (found_end == FALSE) {
4399 gap_align->query_stop = q_start - 1;
4400 gap_align->subject_stop = s_start - 1;
4401 }
4402
4403 if(is_ooframe) {
4404 Int4 nucl_align_length;
4405 if (program == eBlastTypeBlastx) {
4406 nucl_align_length = gap_align->query_stop -
4407 gap_align->query_start;
4408 }
4409 else {
4410 nucl_align_length = gap_align->subject_stop -
4411 gap_align->subject_start;
4412 }
4413 status = s_BlastOOFTracebackToGapEditScript(rev_prelim_tback,
4414 fwd_prelim_tback, nucl_align_length,
4415 &gap_align->edit_script);
4416 } else {
4417 Int4 i;
4418 GapEditScript *esp = Blast_PrelimEditBlockToGapEditScript(
4419 rev_prelim_tback,
4420 fwd_prelim_tback);
4421
4422 /* rarely (typically when the scoring system changes between
4423 the score-only and traceback stages, as happens with
4424 composition-based statistics) it is possible to compute an
4425 optimal alignment with a leading or trailing gap. Prune
4426 these unneeded gaps here and update the score and
4427 alignment boundaries */
4428
4429 gap_align->edit_script = esp;
4430 if (esp) {
4431 while (esp->size && esp->op_type[0] != eGapAlignSub) {
4432 score_left += score_params->gap_open +
4433 esp->num[0] * score_params->gap_extend;
4434
4435 if (esp->op_type[0] == eGapAlignDel)
4436 gap_align->subject_start += esp->num[0];
4437 else
4438 gap_align->query_start += esp->num[0];
4439
4440 for (i = 1; i < esp->size; i++) {
4441 esp->op_type[i-1] = esp->op_type[i];
4442 esp->num[i-1] = esp->num[i];
4443 }
4444 esp->size--;
4445 }
4446 i = esp->size;
4447 while (esp->size && esp->op_type[i-1] != eGapAlignSub) {
4448 score_right += score_params->gap_open +
4449 esp->num[i-1] * score_params->gap_extend;
4450
4451 if (esp->op_type[i-1] == eGapAlignDel)
4452 gap_align->subject_stop -= esp->num[i-1];
4453 else
4454 gap_align->query_stop -= esp->num[i-1];
4455
4456 esp->size--;
4457 i--;
4458 ASSERT(esp->size == i);
4459 }
4460 }
4461 }
4462
4463 gap_align->score = score_right + score_left;
4464 return status;
4465 }
4466
BLAST_GetUngappedHSPList(BlastInitHitList * init_hitlist,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,const BlastHitSavingOptions * hit_options,BlastHSPList ** hsp_list_ptr)4467 Int2 BLAST_GetUngappedHSPList(BlastInitHitList* init_hitlist,
4468 BlastQueryInfo* query_info,
4469 BLAST_SequenceBlk* subject,
4470 const BlastHitSavingOptions* hit_options,
4471 BlastHSPList** hsp_list_ptr)
4472 {
4473 BlastHSPList* hsp_list = NULL;
4474 Int4 index;
4475 BlastInitHSP* init_hsp;
4476 const int kHspNumMax = BlastHspNumMax(FALSE, hit_options);
4477
4478 /* The BlastHSPList structure can be allocated and passed from outside */
4479 if (*hsp_list_ptr != NULL)
4480 hsp_list = *hsp_list_ptr;
4481
4482 if (!init_hitlist) {
4483 if (!hsp_list)
4484 *hsp_list_ptr = NULL;
4485 else
4486 hsp_list->hspcnt = 0;
4487 return 0;
4488 }
4489
4490 for (index = 0; index < init_hitlist->total; ++index) {
4491 BlastHSP* new_hsp;
4492 BlastUngappedData* ungapped_data=NULL;
4493 Int4 context = 0;
4494 init_hsp = &init_hitlist->init_hsp_array[index];
4495 if (!init_hsp->ungapped_data)
4496 continue;
4497
4498 if (!hsp_list) {
4499 hsp_list = Blast_HSPListNew(kHspNumMax);
4500 *hsp_list_ptr = hsp_list;
4501 }
4502 /* Adjust the initial HSP's coordinates in case of concatenated
4503 multiple queries/strands/frames */
4504 context = s_GetUngappedHSPContext(query_info, init_hsp);
4505 s_AdjustInitialHSPOffsets(init_hsp,
4506 query_info->contexts[context].query_offset);
4507 ungapped_data = init_hsp->ungapped_data;
4508 Blast_HSPInit(ungapped_data->q_start,
4509 ungapped_data->length+ungapped_data->q_start,
4510 ungapped_data->s_start,
4511 ungapped_data->length+ungapped_data->s_start,
4512 init_hsp->offsets.qs_offsets.q_off,
4513 init_hsp->offsets.qs_offsets.s_off,
4514 context, query_info->contexts[context].frame,
4515 subject->frame, ungapped_data->score, NULL, &new_hsp);
4516 Blast_HSPListSaveHSP(hsp_list, new_hsp);
4517 }
4518
4519 /* Sort the HSP array by score */
4520 Blast_HSPListSortByScore(hsp_list);
4521
4522 return 0;
4523 }
4524
4525