1 /* $Id: blast_gapalign.c,v 1.229 2016/10/11 12:54:13 fukanchi Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 *
27 */
28
29 /** @file blast_gapalign.c
30 * Functions to perform gapped alignment
31 */
32
33 #include <algo/blast/core/ncbi_math.h>
34 #include <algo/blast/core/blast_gapalign.h>
35 #include <algo/blast/core/blast_util.h> /* for NCBI2NA_UNPACK_BASE macros */
36 #include <algo/blast/core/greedy_align.h>
37 #include "blast_gapalign_priv.h"
38 #include "blast_hits_priv.h"
39 #include "blast_itree.h"
40 #include "jumper.h"
41
42 static Int2 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
43 BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
44 const BlastScoringParameters* score_params, BlastInitHSP* init_hsp);
45 static Int4 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
46 Int4* pej, Int4* pei, BlastGapAlignStruct* gap_align,
47 const BlastScoringParameters* score_params, Boolean reverse_sequence);
48
49 static Int2 s_BlastProtGappedAlignment(EBlastProgramType program,
50 BLAST_SequenceBlk* query_in, BLAST_SequenceBlk* subject_in,
51 BlastGapAlignStruct* gap_align,
52 const BlastScoringParameters* score_params,
53 BlastInitHSP* init_hsp, Boolean restricted_alignment,
54 Boolean * fence_hit);
55
56 /** Lower bound for scores. Divide by two to prevent underflows. */
57 #define MININT INT4_MIN/2
58
59 /** Minimal size of a chunk for state array allocation. */
60 #define CHUNKSIZE 2097152
61
62 /** Retrieve the state structure corresponding to a given length
63 * @param head Pointer to the first element of the state structures
64 * array [in]
65 * @param length The length for which the state structure has to be
66 * found [in]
67 * @return The found or created state structure
68 */
69 static GapStateArrayStruct*
s_GapGetState(GapStateArrayStruct ** head,Int4 length)70 s_GapGetState(GapStateArrayStruct** head, Int4 length)
71
72 {
73 GapStateArrayStruct* retval,* var,* last;
74 Int4 chunksize = MAX(CHUNKSIZE, length + length/3);
75
76 length += length/3; /* Add on about 30% so the end will get reused. */
77 retval = NULL;
78 if (*head == NULL) {
79 retval = (GapStateArrayStruct*)
80 malloc(sizeof(GapStateArrayStruct));
81 retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
82 retval->length = chunksize;
83 retval->used = 0;
84 retval->next = NULL;
85 *head = retval;
86 } else {
87 var = *head;
88 last = *head;
89 while (var) {
90 if (length < (var->length - var->used)) {
91 retval = var;
92 break;
93 } else if (var->used == 0) {
94 /* If it's empty and too small, replace. */
95 sfree(var->state_array);
96 var->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
97 var->length = chunksize;
98 retval = var;
99 break;
100 }
101 last = var;
102 var = var->next;
103 }
104
105 if (var == NULL)
106 {
107 retval = (GapStateArrayStruct*) malloc(sizeof(GapStateArrayStruct));
108 retval->state_array = (Uint1*) malloc(chunksize*sizeof(Uint1));
109 retval->length = chunksize;
110 retval->used = 0;
111 retval->next = NULL;
112 last->next = retval;
113 }
114 }
115
116 #ifdef ERR_POST_EX_DEFINED
117 if (retval->state_array == NULL)
118 ErrPostEx(SEV_ERROR, 0, 0, "state array is NULL");
119 #endif
120
121 return retval;
122
123 }
124
125 /** Remove a state from a state structure */
126 static Boolean
s_GapPurgeState(GapStateArrayStruct * state_struct)127 s_GapPurgeState(GapStateArrayStruct* state_struct)
128 {
129 while (state_struct)
130 {
131 /*
132 memset(state_struct->state_array, 0, state_struct->used);
133 */
134 state_struct->used = 0;
135 state_struct = state_struct->next;
136 }
137
138 return TRUE;
139 }
140
141 /** Deallocate the memory for greedy gapped alignment */
142 static SGreedyAlignMem*
s_BlastGreedyAlignsFree(SGreedyAlignMem * gamp)143 s_BlastGreedyAlignsFree(SGreedyAlignMem* gamp)
144 {
145 if (gamp->last_seq2_off) {
146 sfree(gamp->last_seq2_off[0]);
147 sfree(gamp->last_seq2_off);
148 } else {
149 if (gamp->last_seq2_off_affine) {
150 sfree(gamp->last_seq2_off_affine[0]);
151 sfree(gamp->last_seq2_off_affine);
152 }
153 sfree(gamp->diag_bounds);
154 }
155 sfree(gamp->max_score);
156 if (gamp->space)
157 MBSpaceFree(gamp->space);
158 sfree(gamp);
159 return NULL;
160 }
161
162 /** Allocate memory for the greedy gapped alignment algorithm
163 * @param score_params Parameters related to scoring [in]
164 * @param ext_params Options and parameters related to the extension [in]
165 * @param max_d The maximum distance to search [in]
166 * @param Xdrop The Xdrop value [in]
167 * @return The allocated SGreedyAlignMem structure
168 */
169 static SGreedyAlignMem*
s_BlastGreedyAlignMemAlloc(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Int4 max_d,Int4 Xdrop)170 s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params,
171 const BlastExtensionParameters* ext_params,
172 Int4 max_d, Int4 Xdrop)
173 {
174 SGreedyAlignMem* gamp;
175 Int4 max_d_1, d_diff, max_cost, gd, i;
176 Int4 reward, penalty, gap_open, gap_extend;
177 Int4 Mis_cost, GE_cost;
178
179 if (score_params == NULL || (!ext_params && !Xdrop))
180 return NULL;
181
182 if (score_params->reward % 2 == 1) {
183 reward = 2*score_params->reward;
184 penalty = -2*score_params->penalty;
185 if (!Xdrop)
186 Xdrop = 2*MAX(ext_params->gap_x_dropoff,
187 ext_params->gap_x_dropoff_final);
188 gap_open = 2*score_params->gap_open;
189 gap_extend = 2*score_params->gap_extend;
190 } else {
191 reward = score_params->reward;
192 penalty = -score_params->penalty;
193 if (!Xdrop)
194 Xdrop = MAX(ext_params->gap_x_dropoff,
195 ext_params->gap_x_dropoff_final);
196 gap_open = score_params->gap_open;
197 gap_extend = score_params->gap_extend;
198 }
199
200 if (gap_open == 0 && gap_extend == 0)
201 gap_extend = reward / 2 + penalty;
202
203 gamp = (SGreedyAlignMem*) calloc(1, sizeof(SGreedyAlignMem));
204 gamp->max_dist = max_d;
205 gamp->xdrop = Xdrop;
206
207 if (score_params->gap_open==0 && score_params->gap_extend==0) {
208 d_diff = (Xdrop+reward/2)/(penalty+reward)+1;
209
210 gamp->last_seq2_off = (Int4**) malloc((max_d + 2) * sizeof(Int4*));
211 if (gamp->last_seq2_off == NULL) {
212 sfree(gamp);
213 return NULL;
214 }
215 gamp->last_seq2_off[0] =
216 (Int4*) malloc((max_d + max_d + 6) * sizeof(Int4) * 2);
217 if (gamp->last_seq2_off[0] == NULL) {
218 #ifdef ERR_POST_EX_DEFINED
219 ErrPostEx(SEV_WARNING, 0, 0,
220 "Failed to allocate %ld bytes for greedy alignment",
221 (max_d + max_d + 6) * sizeof(Int4) * 2);
222 #endif
223 s_BlastGreedyAlignsFree(gamp);
224 return NULL;
225 }
226
227 gamp->last_seq2_off[1] = gamp->last_seq2_off[0] + max_d + max_d + 6;
228 gamp->last_seq2_off_affine = NULL;
229 gamp->diag_bounds = NULL;
230 } else {
231 gamp->last_seq2_off = NULL;
232 Mis_cost = reward + penalty;
233 GE_cost = gap_extend+reward/2;
234 max_d_1 = max_d;
235 max_d *= GE_cost;
236 max_cost = MAX(Mis_cost, gap_open+GE_cost);
237 gd = BLAST_Gdb3(&Mis_cost, &gap_open, &GE_cost);
238 d_diff = (Xdrop+reward/2)/gd+1;
239 gamp->diag_bounds = (Int4*) calloc(2*(max_d+1+max_cost), sizeof(Int4));
240 gamp->last_seq2_off_affine = (SGreedyOffset**)
241 malloc((MAX(max_d, max_cost) + 2) * sizeof(SGreedyOffset*));
242 if (!gamp->diag_bounds || !gamp->last_seq2_off_affine) {
243 s_BlastGreedyAlignsFree(gamp);
244 return NULL;
245 }
246 gamp->last_seq2_off_affine[0] = (SGreedyOffset*)
247 calloc((2*max_d_1 + 6) , sizeof(SGreedyOffset) * (max_cost+1));
248 for (i = 1; i <= max_cost; i++)
249 gamp->last_seq2_off_affine[i] =
250 gamp->last_seq2_off_affine[i-1] + 2*max_d_1 + 6;
251 if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0]) {
252 s_BlastGreedyAlignsFree(gamp);
253 return NULL;
254 }
255 }
256 gamp->max_score = (Int4*) malloc(sizeof(Int4) * (max_d + 1 + d_diff));
257
258 gamp->space = MBSpaceNew(0);
259 if (!gamp->max_score || !gamp->space)
260 /* Failure in one of the memory allocations */
261 s_BlastGreedyAlignsFree(gamp);
262
263 return gamp;
264 }
265
266 /* Documented in blast_gapalign.h */
267 BlastGapAlignStruct*
BLAST_GapAlignStructFree(BlastGapAlignStruct * gap_align)268 BLAST_GapAlignStructFree(BlastGapAlignStruct* gap_align)
269 {
270 if (!gap_align)
271 return NULL;
272
273 GapEditScriptDelete(gap_align->edit_script);
274 GapPrelimEditBlockFree(gap_align->fwd_prelim_tback);
275 GapPrelimEditBlockFree(gap_align->rev_prelim_tback);
276 if (gap_align->greedy_align_mem)
277 s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
278 GapStateFree(gap_align->state_struct);
279 sfree(gap_align->dp_mem);
280 JumperGapAlignFree(gap_align->jumper);
281
282 sfree(gap_align);
283 return NULL;
284 }
285
286 /* Documented in blast_gapalign.h */
287 Int2
BLAST_GapAlignStructNew(const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,Uint4 max_subject_length,BlastScoreBlk * sbp,BlastGapAlignStruct ** gap_align_ptr)288 BLAST_GapAlignStructNew(const BlastScoringParameters* score_params,
289 const BlastExtensionParameters* ext_params,
290 Uint4 max_subject_length,
291 BlastScoreBlk* sbp, BlastGapAlignStruct** gap_align_ptr)
292 {
293 Int2 status = 0;
294 BlastGapAlignStruct* gap_align;
295
296 if (!gap_align_ptr || !sbp || !score_params || !ext_params)
297 return -1;
298
299 gap_align = (BlastGapAlignStruct*) calloc(1, sizeof(BlastGapAlignStruct));
300
301 *gap_align_ptr = gap_align;
302
303 gap_align->sbp = sbp;
304
305 gap_align->gap_x_dropoff = ext_params->gap_x_dropoff;
306 gap_align->max_mismatches = ext_params->options->max_mismatches;
307 gap_align->mismatch_window = ext_params->options->mismatch_window;
308
309 /* allocate memory either for dynamic programming or jumper */
310 if (ext_params->options->ePrelimGapExt != eJumperWithTraceback) {
311 if (ext_params->options->ePrelimGapExt == eDynProgScoreOnly) {
312 /* allocate structures for ordinary dynamic programming */
313 gap_align->dp_mem_alloc = 1000;
314 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
315 sizeof(BlastGapDP));
316 if (!gap_align->dp_mem)
317 gap_align = BLAST_GapAlignStructFree(gap_align);
318 }
319 else {
320 /* allocate structures for greedy dynamic programming */
321 max_subject_length = MIN(max_subject_length, MAX_DBSEQ_LEN);
322 max_subject_length = MIN(GREEDY_MAX_COST,
323 max_subject_length / GREEDY_MAX_COST_FRACTION + 1);
324 gap_align->greedy_align_mem =
325 s_BlastGreedyAlignMemAlloc(score_params, ext_params,
326 max_subject_length, 0);
327 if (!gap_align->greedy_align_mem)
328 gap_align = BLAST_GapAlignStructFree(gap_align);
329 }
330 }
331 else {
332 gap_align->jumper = JumperGapAlignNew(200);
333 }
334
335 if (!gap_align)
336 return -1;
337
338 gap_align->positionBased = (sbp->psi_matrix != NULL);
339
340 gap_align->fwd_prelim_tback = GapPrelimEditBlockNew();
341 gap_align->rev_prelim_tback = GapPrelimEditBlockNew();
342
343 return status;
344 }
345
346 /** Values for the editing script operations in traceback */
347 enum {
348 SCRIPT_SUB = eGapAlignSub, /**< Substitution */
349 SCRIPT_GAP_IN_A = eGapAlignDel, /**< Deletion */
350 SCRIPT_GAP_IN_B = eGapAlignIns, /**< Insertion */
351 SCRIPT_OP_MASK = 0x07, /**< Mask for edit script operations */
352
353 SCRIPT_EXTEND_GAP_A = 0x10, /**< continue a gap in A */
354 SCRIPT_EXTEND_GAP_B = 0x40 /**< continue a gap in B */
355 };
356
357 Int4
ALIGN_EX(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)358 ALIGN_EX(const Uint1* A, const Uint1* B, Int4 M, Int4 N, Int4* a_offset,
359 Int4* b_offset, GapPrelimEditBlock *edit_block,
360 BlastGapAlignStruct* gap_align,
361 const BlastScoringParameters* score_params, Int4 query_offset,
362 Boolean reversed, Boolean reverse_sequence,
363 Boolean * fence_hit)
364 {
365 /* See Blast_SemiGappedAlign for more general comments on
366 what this code is doing; comments in this function
367 only apply to the traceback computations */
368
369 Int4 i;
370 Int4 a_index;
371 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
372 const Uint1* b_ptr;
373
374 BlastGapDP* score_array;
375
376 Int4 gap_open;
377 Int4 gap_extend;
378 Int4 gap_open_extend;
379 Int4 x_dropoff;
380 Int4 best_score;
381
382 Int4** matrix = NULL;
383 Int4** pssm = NULL;
384 Int4* matrix_row = NULL;
385
386 Int4 score;
387 Int4 score_gap_row;
388 Int4 score_gap_col;
389 Int4 next_score;
390
391 GapStateArrayStruct* state_struct;
392 Uint1* edit_script_row;
393 Uint1** edit_script;
394 Int4 *edit_start_offset;
395 Int4 edit_script_num_rows;
396 Int4 orig_b_index;
397 Uint1 script, next_script, script_row, script_col;
398 Int4 num_extra_cells;
399
400 matrix = gap_align->sbp->matrix->data;
401 if (gap_align->positionBased) {
402 pssm = gap_align->sbp->psi_matrix->pssm->data;
403 }
404
405 *a_offset = 0;
406 *b_offset = 0;
407 gap_open = score_params->gap_open;
408 gap_extend = score_params->gap_extend;
409 gap_open_extend = gap_open + gap_extend;
410 x_dropoff = gap_align->gap_x_dropoff;
411
412 if (x_dropoff < gap_open_extend)
413 x_dropoff = gap_open_extend;
414
415 if(N <= 0 || M <= 0)
416 return 0;
417
418 /* Initialize traceback information. edit_script[] is
419 a 2-D array which is filled in row by row as the
420 dynamic programming takes place */
421
422 s_GapPurgeState(gap_align->state_struct);
423
424 /* Conceptually, traceback requires a byte array of size
425 MxN. To save on memory, each row of the array is allocated
426 separately. edit_script[i] points to the storage reserved
427 for row i, and edit_start_offset[i] gives the offset into
428 the B sequence corresponding to element 0 of edit_script[i].
429
430 Also make the number of edit script rows grow dynamically */
431
432 edit_script_num_rows = 100;
433 edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
434 edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
435
436 /* allocate storage for the first row of the traceback
437 array. Because row elements correspond to gaps in A,
438 the alignment can only go x_dropoff/gap_extend positions
439 at most before failing the X dropoff criterion */
440
441 if (gap_extend > 0)
442 num_extra_cells = x_dropoff / gap_extend + 3;
443 else
444 num_extra_cells = N + 3;
445
446 if (num_extra_cells > gap_align->dp_mem_alloc) {
447 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
448 2 * gap_align->dp_mem_alloc);
449 sfree(gap_align->dp_mem);
450 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
451 sizeof(BlastGapDP));
452 }
453
454 state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
455
456 edit_script[0] = state_struct->state_array;
457 edit_start_offset[0] = 0;
458 edit_script_row = state_struct->state_array;
459
460 score = -gap_open_extend;
461 score_array = gap_align->dp_mem;
462 score_array[0].best = 0;
463 score_array[0].best_gap = -gap_open_extend;
464
465 for (i = 1; i <= N; i++) {
466 if (score < -x_dropoff)
467 break;
468
469 score_array[i].best = score;
470 score_array[i].best_gap = score - gap_open_extend;
471 score -= gap_extend;
472 edit_script_row[i] = SCRIPT_GAP_IN_A;
473 }
474 state_struct->used = i + 1;
475
476 b_size = i;
477 best_score = 0;
478 first_b_index = 0;
479 if (reverse_sequence)
480 b_increment = -1;
481 else
482 b_increment = 1;
483
484 for (a_index = 1; a_index <= M; a_index++) {
485
486 /* Set up the next row of the edit script; this involves
487 allocating memory for the row, then pointing to it.
488 It is not necessary to allocate space for offsets less
489 than first_b_index (since the inner loop will never
490 look at them);
491
492 It is unknown at this point how far to the right the
493 current traceback row will extend; all that's known for
494 sure is that the previous row fails the X-dropoff test
495 after b_size cells, and that the current row can go at
496 most num_extra_cells beyond that before failing the test */
497
498 if (gap_extend > 0)
499 state_struct = s_GapGetState(&gap_align->state_struct,
500 b_size - first_b_index + num_extra_cells);
501 else
502 state_struct = s_GapGetState(&gap_align->state_struct,
503 N + 3 - first_b_index);
504
505 if (a_index == edit_script_num_rows) {
506 edit_script_num_rows = edit_script_num_rows * 2;
507 edit_script = (Uint1 **)realloc(edit_script,
508 edit_script_num_rows *
509 sizeof(Uint1 *));
510 edit_start_offset = (Int4 *)realloc(edit_start_offset,
511 edit_script_num_rows *
512 sizeof(Int4));
513 }
514
515 edit_script[a_index] = state_struct->state_array +
516 state_struct->used + 1;
517 edit_start_offset[a_index] = first_b_index;
518
519 /* the inner loop assumes the current traceback
520 row begins at offset zero of B */
521
522 edit_script_row = edit_script[a_index] - first_b_index;
523 orig_b_index = first_b_index;
524
525 if (!(gap_align->positionBased)) {
526 if(reverse_sequence)
527 matrix_row = matrix[ A[ M - a_index ] ];
528 else
529 matrix_row = matrix[ A[ a_index ] ];
530 }
531 else {
532 if(reversed || reverse_sequence)
533 matrix_row = pssm[M - a_index];
534 else
535 matrix_row = pssm[a_index + query_offset];
536 }
537
538 if(reverse_sequence)
539 b_ptr = &B[N - first_b_index];
540 else
541 b_ptr = &B[first_b_index];
542
543 score = MININT;
544 score_gap_row = MININT;
545 last_b_index = first_b_index;
546
547 for (b_index = first_b_index; b_index < b_size; b_index++) {
548 int matrix_index = 0;
549
550 b_ptr += b_increment;
551 score_gap_col = score_array[b_index].best_gap;
552
553 matrix_index = *b_ptr;
554
555 if (matrix_index == FENCE_SENTRY) {
556 if (fence_hit) {
557 *fence_hit = 1;
558 }
559 break;
560 }
561
562 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
563
564 /* script, script_row and script_col contain the
565 actions specified by the dynamic programming.
566 when the inner loop has finished, 'script' con-
567 tains all of the actions to perform, and is
568 written to edit_script[a_index][b_index]. Otherwise,
569 this inner loop is exactly the same as the one
570 in Blast_SemiGappedAlign() */
571
572 script = SCRIPT_SUB;
573 script_col = SCRIPT_EXTEND_GAP_B;
574 script_row = SCRIPT_EXTEND_GAP_A;
575
576 if (score < score_gap_col) {
577 script = SCRIPT_GAP_IN_B;
578 score = score_gap_col;
579 }
580 if (score < score_gap_row) {
581 script = SCRIPT_GAP_IN_A;
582 score = score_gap_row;
583 }
584
585 if (best_score - score > x_dropoff) {
586
587 if (first_b_index == b_index)
588 first_b_index++;
589 else
590 score_array[b_index].best = MININT;
591 }
592 else {
593 last_b_index = b_index;
594 if (score > best_score) {
595 best_score = score;
596 *a_offset = a_index;
597 *b_offset = b_index;
598 }
599
600 score_gap_row -= gap_extend;
601 score_gap_col -= gap_extend;
602 if (score_gap_col < (score - gap_open_extend)) {
603 score_array[b_index].best_gap = score - gap_open_extend;
604 }
605 else {
606 score_array[b_index].best_gap = score_gap_col;
607 script += script_col;
608 }
609
610 if (score_gap_row < (score - gap_open_extend))
611 score_gap_row = score - gap_open_extend;
612 else
613 script += script_row;
614
615 score_array[b_index].best = score;
616 }
617
618 score = next_score;
619 edit_script_row[b_index] = script;
620 }
621
622 if (first_b_index == b_size || (fence_hit && *fence_hit))
623 break;
624
625 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
626
627 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
628 2 * gap_align->dp_mem_alloc);
629 score_array = (BlastGapDP *)realloc(score_array,
630 gap_align->dp_mem_alloc *
631 sizeof(BlastGapDP));
632 gap_align->dp_mem = score_array;
633 }
634
635
636 if (last_b_index < b_size - 1) {
637 b_size = last_b_index + 1;
638 }
639 else {
640 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
641
642 score_array[b_size].best = score_gap_row;
643 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
644 score_gap_row -= gap_extend;
645 edit_script_row[b_size] = SCRIPT_GAP_IN_A;
646 b_size++;
647 }
648 }
649
650 /* update the memory allocator to reflect the exact number
651 of traceback cells this row needed */
652
653 state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
654
655 if (b_size <= N) {
656 score_array[b_size].best = MININT;
657 score_array[b_size].best_gap = MININT;
658 b_size++;
659 }
660 }
661
662 /* Pick the optimal path through the now complete
663 edit_script[][]. This is equivalent to flattening
664 the 2-D array into a 1-D list of actions. */
665
666 a_index = *a_offset;
667 b_index = *b_offset;
668 script = SCRIPT_SUB;
669
670 if (fence_hit && *fence_hit)
671 goto done;
672
673 while (a_index > 0 || b_index > 0) {
674 /* Retrieve the next action to perform. Rows of
675 the traceback array do not necessarily start
676 at offset zero of B, so a correction is needed
677 to point to the correct position */
678
679 next_script =
680 edit_script[a_index][b_index - edit_start_offset[a_index]];
681
682 switch(script) {
683 case SCRIPT_GAP_IN_A:
684 script = next_script & SCRIPT_OP_MASK;
685 if (next_script & SCRIPT_EXTEND_GAP_A)
686 script = SCRIPT_GAP_IN_A;
687 break;
688
689 case SCRIPT_GAP_IN_B:
690 script = next_script & SCRIPT_OP_MASK;
691 if (next_script & SCRIPT_EXTEND_GAP_B)
692 script = SCRIPT_GAP_IN_B;
693 break;
694
695 default:
696 script = next_script & SCRIPT_OP_MASK;
697 break;
698 }
699
700 if (script == SCRIPT_GAP_IN_A) {
701 b_index--;
702 }
703 else if (script == SCRIPT_GAP_IN_B) {
704 a_index--;
705 }
706 else {
707 a_index--;
708 b_index--;
709 }
710 GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
711 }
712
713 done:
714 sfree(edit_start_offset);
715 sfree(edit_script);
716 return best_score;
717 }
718
719 Int4
Blast_SemiGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed,Boolean reverse_sequence,Boolean * fence_hit)720 Blast_SemiGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
721 Int4* a_offset, Int4* b_offset, Boolean score_only,
722 GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
723 const BlastScoringParameters* score_params,
724 Int4 query_offset, Boolean reversed, Boolean reverse_sequence,
725 Boolean * fence_hit)
726 {
727 Int4 i; /* sequence pointers and indices */
728 Int4 a_index;
729 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
730 const Uint1* b_ptr;
731
732 BlastGapDP* score_array;
733
734 Int4 gap_open; /* alignment penalty variables */
735 Int4 gap_extend;
736 Int4 gap_open_extend;
737 Int4 x_dropoff;
738
739 Int4** matrix = NULL; /* pointers to the score matrix */
740 Int4** pssm = NULL;
741 Int4* matrix_row = NULL;
742
743 Int4 score; /* score tracking variables */
744 Int4 score_gap_row;
745 Int4 score_gap_col;
746 Int4 next_score;
747 Int4 best_score;
748 Int4 num_extra_cells;
749
750 if (!score_only) {
751 return ALIGN_EX(A, B, M, N, a_offset, b_offset, edit_block, gap_align,
752 score_params, query_offset, reversed, reverse_sequence,
753 fence_hit);
754 }
755
756 /* do initialization and sanity-checking */
757
758 matrix = gap_align->sbp->matrix->data;
759 if (gap_align->positionBased) {
760 pssm = gap_align->sbp->psi_matrix->pssm->data;
761 }
762 *a_offset = 0;
763 *b_offset = 0;
764 gap_open = score_params->gap_open;
765 gap_extend = score_params->gap_extend;
766 gap_open_extend = gap_open + gap_extend;
767 x_dropoff = gap_align->gap_x_dropoff;
768
769 if (x_dropoff < gap_open_extend)
770 x_dropoff = gap_open_extend;
771
772 if(N <= 0 || M <= 0)
773 return 0;
774
775 /* Allocate and fill in the auxiliary bookeeping structures.
776 Since A and B could be very large, maintain a window
777 of auxiliary structures only large enough to contain to current
778 set of DP computations. The initial window size is determined
779 by the number of cells needed to fail the x-dropoff test */
780
781 if (gap_extend > 0)
782 num_extra_cells = x_dropoff / gap_extend + 3;
783 else
784 num_extra_cells = N + 3;
785
786 if (num_extra_cells > gap_align->dp_mem_alloc) {
787 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
788 2 * gap_align->dp_mem_alloc);
789 sfree(gap_align->dp_mem);
790 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
791 sizeof(BlastGapDP));
792 }
793
794 score_array = gap_align->dp_mem;
795 score = -gap_open_extend;
796 score_array[0].best = 0;
797 score_array[0].best_gap = -gap_open_extend;
798
799 for (i = 1; i <= N; i++) {
800 if (score < -x_dropoff)
801 break;
802
803 score_array[i].best = score;
804 score_array[i].best_gap = score - gap_open_extend;
805 score -= gap_extend;
806 }
807
808 /* The inner loop below examines letters of B from
809 index 'first_b_index' to 'b_size' */
810
811 b_size = i;
812 best_score = 0;
813 first_b_index = 0;
814 if (reverse_sequence)
815 b_increment = -1;
816 else
817 b_increment = 1;
818
819 for (a_index = 1; a_index <= M; a_index++) {
820 /* pick out the row of the score matrix
821 appropriate for A[a_index] */
822
823 if (!(gap_align->positionBased)) {
824 if(reverse_sequence)
825 matrix_row = matrix[ A[ M - a_index ] ];
826 else
827 matrix_row = matrix[ A[ a_index ] ];
828 }
829 else {
830 if(reversed || reverse_sequence)
831 matrix_row = pssm[M - a_index];
832 else
833 matrix_row = pssm[a_index + query_offset];
834 }
835
836 if(reverse_sequence)
837 b_ptr = &B[N - first_b_index];
838 else
839 b_ptr = &B[first_b_index];
840
841 /* initialize running-score variables */
842 score = MININT;
843 score_gap_row = MININT;
844 last_b_index = first_b_index;
845
846 for (b_index = first_b_index; b_index < b_size; b_index++) {
847
848 b_ptr += b_increment;
849 score_gap_col = score_array[b_index].best_gap;
850 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
851
852 if (score < score_gap_col)
853 score = score_gap_col;
854
855 if (score < score_gap_row)
856 score = score_gap_row;
857
858 if (best_score - score > x_dropoff) {
859
860 /* the current best score failed the X-dropoff
861 criterion. Note that this does not stop the
862 inner loop, only forces future iterations to
863 skip this column of B.
864
865 Also, if the very first letter of B that was
866 tested failed the X dropoff criterion, make
867 sure future inner loops start one letter to
868 the right */
869
870 if (b_index == first_b_index)
871 first_b_index++;
872 else
873 score_array[b_index].best = MININT;
874 }
875 else {
876 last_b_index = b_index;
877 if (score > best_score) {
878 best_score = score;
879 *a_offset = a_index;
880 *b_offset = b_index;
881 }
882
883 /* If starting a gap at this position will improve
884 the best row, or column, score, update them to
885 reflect that. */
886
887 score_gap_row -= gap_extend;
888 score_gap_col -= gap_extend;
889 score_array[b_index].best_gap = MAX(score - gap_open_extend,
890 score_gap_col);
891 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
892 score_array[b_index].best = score;
893 }
894
895 score = next_score;
896 }
897
898 /* Finish aligning if the best scores for all positions
899 of B will fail the X-dropoff test, i.e. the inner loop
900 bounds have converged to each other */
901
902 if (first_b_index == b_size)
903 break;
904
905 /* enlarge the window for score data if necessary */
906
907 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
908
909 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
910 2 * gap_align->dp_mem_alloc);
911 score_array = (BlastGapDP *)realloc(score_array,
912 gap_align->dp_mem_alloc *
913 sizeof(BlastGapDP));
914 gap_align->dp_mem = score_array;
915 }
916
917 if (last_b_index < b_size - 1) {
918 /* This row failed the X-dropoff test earlier than
919 the last row did; just shorten the loop bounds
920 before doing the next row */
921
922 b_size = last_b_index + 1;
923 }
924 else {
925 /* The inner loop finished without failing the X-dropoff
926 test; initialize extra bookkeeping structures until
927 the X dropoff test fails or we run out of letters in B.
928 The next inner loop will have larger bounds */
929
930 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
931 score_array[b_size].best = score_gap_row;
932 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
933 score_gap_row -= gap_extend;
934 b_size++;
935 }
936 }
937
938 if (b_size <= N) {
939 score_array[b_size].best = MININT;
940 score_array[b_size].best_gap = MININT;
941 b_size++;
942 }
943 }
944
945 return best_score;
946 }
947
948 /** For restricted gapped alignment, gaps may only start once in
949 this many sequence offsets */
950 #define RESTRICT_SIZE 10
951
952 /** Low level function to perform score-only gapped extension
953 * in one direction. Gaps in one sequence or the other are only
954 * allowed to start at sequence offsets that are a multiple of
955 * RESTRICT_SIZE from the alignment start point. For more details see
956 * <PRE>
957 * Michael Cameron, Hugh E. Williams, Adam Cannane, "Improved
958 * Gapped Alignment in BLAST". IEEE/ACM Transactions on Computational
959 * Biology and Bioinformatics, vol 1 #3 (2004) pp 116-129
960 * </PRE>
961 * @param A The query sequence [in]
962 * @param B The subject sequence [in]
963 * @param M Maximal extension length in query [in]
964 * @param N Maximal extension length in subject [in]
965 * @param a_offset Resulting starting offset in query [out]
966 * @param b_offset Resulting starting offset in subject [out]
967 * @param gap_align Structure holding various information and allocated
968 * memory for the gapped alignment [in]
969 * @param score_params Parameters related to scoring [in]
970 * @param query_offset The starting offset in query [in]
971 * @param reverse_sequence Do reverse the sequence [in]
972 * @return The best alignment score found.
973 */
974 static Int4
s_RestrictedGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reverse_sequence)975 s_RestrictedGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
976 Int4* a_offset, Int4* b_offset,
977 BlastGapAlignStruct* gap_align,
978 const BlastScoringParameters* score_params,
979 Int4 query_offset, Boolean reverse_sequence)
980 {
981 /* see Blast_SemiGappedAlign for general details */
982
983 Int4 i;
984 Int4 a_index;
985 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
986 const Uint1* b_ptr;
987 Int4 b_gap;
988
989 BlastGapDP* score_array;
990
991 Int4 gap_open;
992 Int4 gap_extend;
993 Int4 gap_open_extend;
994 Int4 x_dropoff;
995
996 Int4** matrix = NULL;
997 Int4** pssm = NULL;
998 Int4* matrix_row = NULL;
999
1000 Int4 score;
1001 Int4 score_gap_row;
1002 Int4 score_gap_col;
1003 Int4 next_score;
1004 Int4 best_score;
1005 Int4 num_extra_cells;
1006
1007 matrix = gap_align->sbp->matrix->data;
1008 if (gap_align->positionBased) {
1009 pssm = gap_align->sbp->psi_matrix->pssm->data;
1010 }
1011 *a_offset = 0;
1012 *b_offset = 0;
1013 gap_open = score_params->gap_open;
1014 gap_extend = score_params->gap_extend;
1015 gap_open_extend = gap_open + gap_extend;
1016 x_dropoff = gap_align->gap_x_dropoff;
1017
1018 if (x_dropoff < gap_open_extend)
1019 x_dropoff = gap_open_extend;
1020
1021 if(N <= 0 || M <= 0)
1022 return 0;
1023
1024 if (gap_extend > 0)
1025 num_extra_cells = x_dropoff / gap_extend + 3;
1026 else
1027 num_extra_cells = N + 3;
1028
1029 if (num_extra_cells > gap_align->dp_mem_alloc) {
1030 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1031 2 * gap_align->dp_mem_alloc);
1032 sfree(gap_align->dp_mem);
1033 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1034 sizeof(BlastGapDP));
1035 }
1036
1037 score_array = gap_align->dp_mem;
1038 score = -gap_open_extend;
1039 score_array[0].best = 0;
1040 score_array[0].best_gap = -gap_open_extend;
1041
1042 for (i = 1; i <= N; i++) {
1043 if (score < -x_dropoff)
1044 break;
1045
1046 score_array[i].best = score;
1047 score_array[i].best_gap = score - gap_open_extend;
1048 score -= gap_extend;
1049 }
1050
1051 b_size = i;
1052 best_score = 0;
1053 first_b_index = 0;
1054 b_gap = 0;
1055 if (reverse_sequence)
1056 b_increment = -1;
1057 else
1058 b_increment = 1;
1059
1060 for (a_index = 1; a_index <= M; a_index++) {
1061
1062 if (!(gap_align->positionBased)) {
1063 if(reverse_sequence)
1064 matrix_row = matrix[ A[ M - a_index ] ];
1065 else
1066 matrix_row = matrix[ A[ a_index ] ];
1067 }
1068 else {
1069 if(reverse_sequence)
1070 matrix_row = pssm[M - a_index];
1071 else
1072 matrix_row = pssm[a_index + query_offset];
1073 }
1074
1075 if(reverse_sequence)
1076 b_ptr = &B[N - first_b_index];
1077 else
1078 b_ptr = &B[first_b_index];
1079
1080 score = MININT;
1081 score_gap_row = MININT;
1082 last_b_index = first_b_index;
1083
1084 /* The double loop that computes the alignment is essentially
1085 unchanged from that of Blast_SemiGappedAlign. The only
1086 real difference is that a gap in A or B is not allowed to
1087 start unless the offset of the gap is divisible by
1088 RESTRICT_SIZE. This allows the ordinary dynamic programming
1089 recurrence relations to be simplified */
1090
1091 if (a_index % RESTRICT_SIZE != 0) {
1092
1093 /* a gap will never start in A; do not bother checking
1094 or updating score_gap_row */
1095
1096 for (b_index = first_b_index; b_index < b_size; b_index++) {
1097
1098 b_ptr += b_increment;
1099 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1100
1101 if (b_index != b_gap) {
1102
1103 /* the majority of cases fall here; a gap
1104 may not start in either A or B */
1105
1106 if (best_score - score > x_dropoff) {
1107 score_array[b_index].best = MININT;
1108 if (b_index == first_b_index)
1109 first_b_index++;
1110 }
1111 else {
1112 last_b_index = b_index;
1113 if (score > best_score) {
1114 best_score = score;
1115 *a_offset = a_index;
1116 *b_offset = b_index;
1117 }
1118 score_array[b_index].best = score;
1119 }
1120 }
1121 else {
1122
1123 /* a gap may start in B. Update b_gap, the
1124 offset when this will next happen, and
1125 compute the two-term recurrence */
1126
1127 b_gap += RESTRICT_SIZE;
1128 score_gap_col = score_array[b_index].best_gap;
1129
1130 if (score < score_gap_col)
1131 score = score_gap_col;
1132
1133 if (best_score - score > x_dropoff) {
1134 score_array[b_index].best = MININT;
1135 if (b_index == first_b_index)
1136 first_b_index++;
1137 }
1138 else {
1139 last_b_index = b_index;
1140 if (score > best_score) {
1141 best_score = score;
1142 *a_offset = a_index;
1143 *b_offset = b_index;
1144 }
1145
1146 score_gap_col -= gap_extend;
1147 score_array[b_index].best_gap =
1148 MAX(score - gap_open_extend, score_gap_col);
1149 score_array[b_index].best = score;
1150 }
1151 }
1152 score = next_score;
1153 }
1154
1155 score_gap_row = score;
1156 }
1157 else {
1158
1159 /* gap may start in A */
1160
1161 for (b_index = first_b_index; b_index < b_size; b_index++) {
1162
1163 b_ptr += b_increment;
1164 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
1165
1166 if (b_index != b_gap) {
1167
1168 /* gap may not start in B. Compute
1169 the resulting two-term recurrence */
1170
1171 if (score < score_gap_row)
1172 score = score_gap_row;
1173
1174 if (best_score - score > x_dropoff) {
1175 score_array[b_index].best = MININT;
1176 if (b_index == first_b_index)
1177 first_b_index++;
1178 }
1179 else {
1180 last_b_index = b_index;
1181 if (score > best_score) {
1182 best_score = score;
1183 *a_offset = a_index;
1184 *b_offset = b_index;
1185 }
1186
1187 score_gap_row -= gap_extend;
1188 score_gap_row = MAX(score - gap_open_extend,
1189 score_gap_row);
1190 score_array[b_index].best = score;
1191 }
1192 }
1193 else {
1194
1195 /* the ordinary case: a gap may start in A
1196 or B and the full three-term recurrence
1197 must be computed. This happens once every
1198 RESTRICT_SIZE*RESTRICT_SIZE cells */
1199
1200 b_gap += RESTRICT_SIZE;
1201 score_gap_col = score_array[b_index].best_gap;
1202
1203 if (score < score_gap_col)
1204 score = score_gap_col;
1205 if (score < score_gap_row)
1206 score = score_gap_row;
1207
1208 if (best_score - score > x_dropoff) {
1209 score_array[b_index].best = MININT;
1210 if (b_index == first_b_index)
1211 first_b_index++;
1212 }
1213 else {
1214 last_b_index = b_index;
1215 if (score > best_score) {
1216 best_score = score;
1217 *a_offset = a_index;
1218 *b_offset = b_index;
1219 }
1220
1221 score_gap_row -= gap_extend;
1222 score_gap_col -= gap_extend;
1223 score_array[b_index].best_gap =
1224 MAX(score - gap_open_extend, score_gap_col);
1225 score_gap_row = MAX(score - gap_open_extend,
1226 score_gap_row);
1227 score_array[b_index].best = score;
1228 }
1229 }
1230 score = next_score;
1231 }
1232 }
1233
1234 if (first_b_index == b_size)
1235 break;
1236
1237 /* compute the first offset of the next row where a
1238 gap in B may start */
1239
1240 b_index = first_b_index % RESTRICT_SIZE;
1241 b_gap = first_b_index;
1242 if (b_index > 0)
1243 b_gap += RESTRICT_SIZE - b_index;
1244
1245 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
1246
1247 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1248 2 * gap_align->dp_mem_alloc);
1249 score_array = (BlastGapDP *)realloc(score_array,
1250 gap_align->dp_mem_alloc *
1251 sizeof(BlastGapDP));
1252 gap_align->dp_mem = score_array;
1253 }
1254
1255 if (last_b_index < b_size - 1) {
1256 b_size = last_b_index + 1;
1257 }
1258 else {
1259 /* The inner loop finished without failing the X-dropoff
1260 test; initialize extra bookkeeping structures until
1261 the X dropoff test fails or we run out of letters in B.
1262 The next inner loop will have larger bounds.
1263
1264 Note that if gaps in A are not allowed for this row, then
1265 score_gap_row is -infinity and the loop below will not
1266 extend the region to explore */
1267
1268 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
1269 score_array[b_size].best = score_gap_row;
1270 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
1271 score_gap_row -= gap_extend;
1272 b_size++;
1273 }
1274 }
1275
1276 if (b_size <= N) {
1277 score_array[b_size].best = MININT;
1278 score_array[b_size].best_gap = MININT;
1279 b_size++;
1280 }
1281 }
1282
1283 return best_score;
1284 }
1285
1286 /** Editing script operations for out-of-frame traceback */
1287 enum {
1288 SCRIPT_AHEAD_ONE_FRAME = 1, /**< Shift 1 frame in sequence A
1289 (gap 2 nucleotides) */
1290 SCRIPT_AHEAD_TWO_FRAMES = 2, /**< Shift 2 frames in sequence A
1291 (gap 1 nucleotide) */
1292 SCRIPT_NEXT_IN_FRAME = 3, /**< Shift to next base (substitution) */
1293 SCRIPT_NEXT_PLUS_ONE_FRAME = 4, /**< Shift to next base plus 1 frame
1294 (gap 1 nucleotide in sequence B) */
1295 SCRIPT_NEXT_PLUS_TWO_FRAMES = 5, /**< Shift to next base plus 2 frames
1296 (gap 2 nucleotides in sequence B) */
1297 SCRIPT_OOF_OPEN_GAP = 0x08 /**< Opening a gap */
1298 };
1299
1300 /** Low level function to perform gapped extension with out-of-frame
1301 * gapping with traceback.
1302 * @param A The query sequence [in]
1303 * @param B The subject sequence [in]
1304 * @param M Maximal extension length in query [in]
1305 * @param N Maximal extension length in subject [in]
1306 * @param a_offset Resulting starting offset in query [out]
1307 * @param b_offset Resulting starting offset in subject [out]
1308 * @param edit_block Structure to hold generated traceback [out]
1309 * @param gap_align Structure holding various information and allocated
1310 * memory for the gapped alignment [in]
1311 * @param score_params Parameters related to scoring [in]
1312 * @param query_offset The starting offset in query [in]
1313 * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1314 * @return The best alignment score found.
1315 */
1316 static Int4
s_OutOfFrameAlignWithTraceback(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1317 s_OutOfFrameAlignWithTraceback(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1318 Int4* a_offset, Int4* b_offset, GapPrelimEditBlock *edit_block,
1319 BlastGapAlignStruct* gap_align, const BlastScoringParameters* score_params,
1320 Int4 query_offset, Boolean reversed)
1321 {
1322 Int4 i, increment; /* sequence pointers and indices */
1323 Int4 a_index;
1324 Int4 b_index, b_size, first_b_index, last_b_index;
1325
1326 BlastGapDP* score_array;
1327
1328 Int4 gap_open; /* alignment penalty variables */
1329 Int4 gap_extend;
1330 Int4 gap_open_extend;
1331 Int4 shift_penalty;
1332 Int4 x_dropoff;
1333
1334 Int4** matrix = NULL; /* pointers to the score matrix */
1335 Int4** pssm = NULL;
1336 Int4* matrix_row = NULL;
1337
1338 Int4 score; /* score tracking variables */
1339 Int4 score_row1;
1340 Int4 score_row2;
1341 Int4 score_row3;
1342 Int4 score_gap_col;
1343 Int4 score_col1;
1344 Int4 score_col2;
1345 Int4 score_col3;
1346 Int4 score_other_frame1;
1347 Int4 score_other_frame2;
1348 Int4 best_score;
1349
1350 GapStateArrayStruct* state_struct;
1351 Uint1** edit_script;
1352 Uint1* edit_script_row;
1353 Int4 *edit_start_offset;
1354 Int4 edit_script_num_rows;
1355 Int4 orig_b_index;
1356 Int1 script, next_script;
1357 Int4 num_extra_cells;
1358
1359 /* do initialization and sanity-checking */
1360
1361 matrix = gap_align->sbp->matrix->data;
1362 if (gap_align->positionBased) {
1363 pssm = gap_align->sbp->psi_matrix->pssm->data;
1364 }
1365 *a_offset = 0;
1366 *b_offset = -2;
1367 gap_open = score_params->gap_open;
1368 gap_extend = score_params->gap_extend;
1369 gap_open_extend = gap_open + gap_extend;
1370 shift_penalty = score_params->shift_pen;
1371 x_dropoff = gap_align->gap_x_dropoff;
1372
1373 if (x_dropoff < gap_open_extend)
1374 x_dropoff = gap_open_extend;
1375
1376 if(N <= 0 || M <= 0)
1377 return 0;
1378
1379 /* Initialize traceback information. edit_script[] is
1380 a 2-D array which is filled in row by row as the
1381 dynamic programming takes place */
1382
1383 s_GapPurgeState(gap_align->state_struct);
1384
1385 /* Conceptually, traceback requires a byte array of size
1386 MxN. To save on memory, each row of the array is allocated
1387 separately. edit_script[i] points to the storage reserved
1388 for row i, and edit_start_offset[i] gives the offset into
1389 the B sequence corresponding to element 0 of edit_script[i]
1390
1391 Also make the number of edit script rows grow dynamically */
1392
1393 edit_script_num_rows = 100;
1394 edit_script = (Uint1**) malloc(sizeof(Uint1*) * edit_script_num_rows);
1395 edit_start_offset = (Int4*) malloc(sizeof(Int4) * edit_script_num_rows);
1396
1397 /* allocate storage for the first row of the traceback
1398 array. Because row elements correspond to gaps in A,
1399 the alignment can only go at most x_dropoff/gap_extend
1400 positions, in all three frames, before failing the
1401 X dropoff criterion */
1402
1403 if (gap_extend > 0)
1404 num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
1405 else
1406 num_extra_cells = N + 5;
1407
1408 if (num_extra_cells > gap_align->dp_mem_alloc) {
1409 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
1410 2 * gap_align->dp_mem_alloc);
1411 sfree(gap_align->dp_mem);
1412 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
1413 sizeof(BlastGapDP));
1414 }
1415
1416 state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
1417
1418 edit_script[0] = state_struct->state_array;
1419 edit_start_offset[0] = 0;
1420 edit_script_row = state_struct->state_array;
1421
1422 score_array = gap_align->dp_mem;
1423 score = -gap_open_extend;
1424 score_array[0].best = 0;
1425 score_array[0].best_gap = -gap_open_extend;
1426
1427 for (i = 3; i <= N + 2; i += 3) {
1428 score_array[i].best = score;
1429 score_array[i].best_gap = score - gap_open_extend;
1430 edit_script_row[i] = SCRIPT_GAP_IN_B;
1431
1432 score_array[i-1].best = MININT;
1433 score_array[i-1].best_gap = MININT;
1434 edit_script_row[i-1] = SCRIPT_GAP_IN_B;
1435
1436 score_array[i-2].best = MININT;
1437 score_array[i-2].best_gap = MININT;
1438 edit_script_row[i-2] = SCRIPT_GAP_IN_B;
1439
1440 if (score < -x_dropoff)
1441 break;
1442 score -= gap_extend;
1443 }
1444
1445 /* The inner loop below examines letters of B from
1446 index 'first_b_index' to 'b_size' */
1447
1448 score_array[i].best = MININT;
1449 score_array[i].best_gap = MININT;
1450 b_size = i - 2;
1451 state_struct->used = b_size + 1;
1452
1453 best_score = 0;
1454 first_b_index = 0;
1455 if (reversed) {
1456 increment = -1;
1457 }
1458 else {
1459 /* Allow for a backwards frame shift */
1460 B -= 2;
1461 increment = 1;
1462 }
1463
1464 for (a_index = 1; a_index <= M; a_index++) {
1465
1466 /* Set up the next row of the edit script; this involves
1467 allocating memory for the row, then pointing to it.
1468 It is not necessary to allocate space for offsets less
1469 than first_b_index (since the inner loop will never
1470 look at them).
1471
1472 It is unknown at this point how far to the right the
1473 current traceback row will extend; all that's known for
1474 sure is that the previous row fails the X-dropoff test
1475 after b_size cells, and that the current row can go at
1476 most num_extra_cells beyond that before failing the test */
1477
1478 if (gap_extend > 0)
1479 state_struct = s_GapGetState(&gap_align->state_struct,
1480 b_size - first_b_index + num_extra_cells);
1481 else
1482 state_struct = s_GapGetState(&gap_align->state_struct,
1483 N + 5 - first_b_index);
1484
1485 if (a_index == edit_script_num_rows) {
1486 edit_script_num_rows = edit_script_num_rows * 2;
1487 edit_script = (Uint1 **)realloc(edit_script,
1488 edit_script_num_rows *
1489 sizeof(Uint1 *));
1490 edit_start_offset = (Int4 *)realloc(edit_start_offset,
1491 edit_script_num_rows *
1492 sizeof(Int4));
1493 }
1494
1495 edit_script[a_index] = state_struct->state_array +
1496 state_struct->used + 1;
1497 edit_start_offset[a_index] = first_b_index;
1498
1499 /* the inner loop assumes the current traceback
1500 row begins at offset zero of B */
1501
1502 edit_script_row = edit_script[a_index] - first_b_index;
1503 orig_b_index = first_b_index;
1504
1505 if (!(gap_align->positionBased)) {
1506 matrix_row = matrix[ A[ a_index * increment ] ];
1507 }
1508 else {
1509 if(reversed)
1510 matrix_row = pssm[M - a_index];
1511 else
1512 matrix_row = pssm[a_index + query_offset];
1513 }
1514
1515 score = MININT;
1516 score_row1 = MININT;
1517 score_row2 = MININT;
1518 score_row3 = MININT;
1519 score_gap_col = MININT;
1520 score_col1 = MININT;
1521 score_col2 = MININT;
1522 score_col3 = MININT;
1523 score_other_frame1 = MININT;
1524 score_other_frame2 = MININT;
1525 last_b_index = first_b_index;
1526 b_index = first_b_index;
1527
1528 /* The inner loop is identical to that of OOF_SEMI_G_ALIGN,
1529 except that traceback operations are sprinkled throughout. */
1530
1531 while (b_index < b_size) {
1532
1533 /* FRAME 0 */
1534
1535 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1536 score = MAX(score, score_col1);
1537 if (score == score_col1) {
1538 script = SCRIPT_NEXT_IN_FRAME;
1539 }
1540 else if (score + shift_penalty == score_other_frame1) {
1541 if (score_other_frame1 == score_col2)
1542 script = SCRIPT_AHEAD_TWO_FRAMES;
1543 else
1544 script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1545 }
1546 else {
1547 if (score_other_frame2 == score_col3)
1548 script = SCRIPT_AHEAD_ONE_FRAME;
1549 else
1550 script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1551 }
1552 score += matrix_row[ B[ b_index * increment ] ];
1553
1554 score_other_frame1 = MAX(score_col1, score_array[b_index].best);
1555 score_col1 = score_array[b_index].best;
1556 score_gap_col = score_array[b_index].best_gap;
1557
1558 if (score < MAX(score_gap_col, score_row1)) {
1559 if (score_gap_col > score_row1) {
1560 score = score_gap_col;
1561 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1562 }
1563 else {
1564 score = score_row1;
1565 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1566 }
1567
1568 if (best_score - score > x_dropoff) {
1569 if (first_b_index == b_index)
1570 first_b_index = b_index + 1;
1571 else
1572 score_array[b_index].best = MININT;
1573 }
1574 else {
1575 last_b_index = b_index;
1576 score_array[b_index].best = score;
1577 score_array[b_index].best_gap = score_gap_col - gap_extend;
1578 score_row1 -= gap_extend;
1579 }
1580 }
1581 else {
1582 if (best_score - score > x_dropoff) {
1583 if (first_b_index == b_index)
1584 first_b_index = b_index + 1;
1585 else
1586 score_array[b_index].best = MININT;
1587 }
1588 else {
1589 last_b_index = b_index;
1590 score_array[b_index].best = score;
1591 if (score > best_score) {
1592 best_score = score;
1593 *a_offset = a_index;
1594 *b_offset = b_index;
1595 }
1596
1597 score -= gap_open_extend;
1598 score_row1 -= gap_extend;
1599 if (score > score_row1)
1600 score_row1 = score;
1601 else
1602 script |= SCRIPT_EXTEND_GAP_B;
1603
1604 score_gap_col -= gap_extend;
1605 if (score < score_gap_col) {
1606 score_array[b_index].best_gap = score_gap_col;
1607 script |= SCRIPT_EXTEND_GAP_A;
1608 }
1609 else {
1610 score_array[b_index].best_gap = score;
1611 }
1612 }
1613 }
1614
1615 edit_script_row[b_index++] = script;
1616 if (b_index >= b_size) {
1617 score = score_row1;
1618 score_row1 = score_row2;
1619 score_row2 = score_row3;
1620 score_row3 = score;
1621 break;
1622 }
1623
1624 /* FRAME 1 */
1625
1626 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1627 score = MAX(score, score_col2);
1628 if (score == score_col2) {
1629 script = SCRIPT_NEXT_IN_FRAME;
1630 }
1631 else if (score + shift_penalty == score_other_frame1) {
1632 if (score_other_frame1 == score_col1)
1633 script = SCRIPT_AHEAD_ONE_FRAME;
1634 else
1635 script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1636 }
1637 else {
1638 if (score_other_frame2 == score_col3)
1639 script = SCRIPT_AHEAD_TWO_FRAMES;
1640 else
1641 script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1642 }
1643 score += matrix_row[ B[ b_index * increment ] ];
1644 score_other_frame2 = MAX(score_col2, score_array[b_index].best);
1645 score_col2 = score_array[b_index].best;
1646 score_gap_col = score_array[b_index].best_gap;
1647
1648 if (score < MAX(score_gap_col, score_row2)) {
1649 score = MAX(score_gap_col, score_row2);
1650 if (best_score - score > x_dropoff) {
1651 if (first_b_index == b_index)
1652 first_b_index = b_index + 1;
1653 else
1654 score_array[b_index].best = MININT;
1655 }
1656 else {
1657 if (score == score_gap_col)
1658 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1659 else
1660 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1661
1662 last_b_index = b_index;
1663 score_array[b_index].best = score;
1664 score_array[b_index].best_gap = score_gap_col - gap_extend;
1665 score_row2 -= gap_extend;
1666 }
1667 }
1668 else {
1669 if (best_score - score > x_dropoff) {
1670 if (first_b_index == b_index)
1671 first_b_index = b_index + 1;
1672 else
1673 score_array[b_index].best = MININT;
1674 }
1675 else {
1676 last_b_index = b_index;
1677 score_array[b_index].best = score;
1678 if (score > best_score) {
1679 best_score = score;
1680 *a_offset = a_index;
1681 *b_offset = b_index;
1682 }
1683 score -= gap_open_extend;
1684 score_row2 -= gap_extend;
1685 if (score > score_row2)
1686 score_row2 = score;
1687 else
1688 script |= SCRIPT_EXTEND_GAP_B;
1689
1690 score_gap_col -= gap_extend;
1691 if (score < score_gap_col) {
1692 score_array[b_index].best_gap = score_gap_col;
1693 script |= SCRIPT_EXTEND_GAP_A;
1694 }
1695 else {
1696 score_array[b_index].best_gap = score;
1697 }
1698 }
1699 }
1700
1701 edit_script_row[b_index++] = script;
1702 if (b_index >= b_size) {
1703 score = score_row2;
1704 score_row2 = score_row1;
1705 score_row1 = score_row3;
1706 score_row3 = score;
1707 break;
1708 }
1709
1710 /* FRAME 2 */
1711
1712 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
1713 score = MAX(score, score_col3);
1714 if (score == score_col3) {
1715 script = SCRIPT_NEXT_IN_FRAME;
1716 }
1717 else if (score + shift_penalty == score_other_frame1) {
1718 if (score_other_frame1 == score_col1)
1719 script = SCRIPT_AHEAD_TWO_FRAMES;
1720 else
1721 script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
1722 }
1723 else {
1724 if (score_other_frame2 == score_col2)
1725 script = SCRIPT_AHEAD_ONE_FRAME;
1726 else
1727 script = SCRIPT_NEXT_PLUS_ONE_FRAME;
1728 }
1729 score += matrix_row[ B[ b_index * increment ] ];
1730 score_other_frame1 = score_other_frame2;
1731 score_other_frame2 = MAX(score_col3, score_array[b_index].best);
1732 score_col3 = score_array[b_index].best;
1733 score_gap_col = score_array[b_index].best_gap;
1734
1735 if (score < MAX(score_gap_col, score_row3)) {
1736 score = MAX(score_gap_col, score_row3);
1737 if (best_score - score > x_dropoff) {
1738 if (first_b_index == b_index)
1739 first_b_index = b_index + 1;
1740 else
1741 score_array[b_index].best = MININT;
1742 }
1743 else {
1744 if (score == score_gap_col)
1745 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_A;
1746 else
1747 script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
1748
1749 last_b_index = b_index;
1750 score_array[b_index].best = score;
1751 score_array[b_index].best_gap = score_gap_col - gap_extend;
1752 score_row3 -= gap_extend;
1753 }
1754 }
1755 else {
1756 if (best_score - score > x_dropoff) {
1757 if (first_b_index == b_index)
1758 first_b_index = b_index + 1;
1759 else
1760 score_array[b_index].best = MININT;
1761 }
1762 else {
1763 last_b_index = b_index;
1764 score_array[b_index].best = score;
1765 if (score > best_score) {
1766 best_score = score;
1767 *a_offset = a_index;
1768 *b_offset = b_index;
1769 }
1770 score -= gap_open_extend;
1771 score_row3 -= gap_extend;
1772 if (score > score_row3)
1773 score_row3 = score;
1774 else
1775 script |= SCRIPT_EXTEND_GAP_B;
1776
1777 score_gap_col -= gap_extend;
1778 if (score < score_gap_col) {
1779 score_array[b_index].best_gap = score_gap_col;
1780 script |= SCRIPT_EXTEND_GAP_A;
1781 }
1782 else {
1783 score_array[b_index].best_gap = score;
1784 }
1785 }
1786 }
1787 edit_script_row[b_index++] = script;
1788 }
1789
1790 /* Finish aligning if the best scores for all positions
1791 of B will fail the X-dropoff test, i.e. the inner loop
1792 bounds have converged to each other */
1793
1794 if (first_b_index == b_size)
1795 break;
1796
1797 /* Enlarge the window for score data if necessary */
1798
1799 if (last_b_index + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
1800
1801 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
1802 2 * gap_align->dp_mem_alloc);
1803 score_array = (BlastGapDP *)realloc(score_array,
1804 gap_align->dp_mem_alloc *
1805 sizeof(BlastGapDP));
1806 gap_align->dp_mem = score_array;
1807 }
1808
1809 if (last_b_index < b_size - 1) {
1810 /* This row failed the X-dropoff test earlier than
1811 the last row did; just shorten the loop bounds
1812 before doing the next row */
1813
1814 b_size = last_b_index + 1;
1815 }
1816 else {
1817 /* The inner loop finished without failing the X-dropoff
1818 test; initialize extra bookkeeping structures until
1819 the X dropoff test fails or we run out of letters in B.
1820 The next inner loop will have larger bounds.
1821
1822 Keep initializing extra structures until the X dropoff
1823 test fails in all frames for this row */
1824
1825 score = MAX(score_row1, score_row2);
1826 score = MAX(score, score_row3);
1827 while (score >= (best_score - x_dropoff) && b_size < N + 1) {
1828
1829 score_array[b_size].best = score_row1;
1830 score_array[b_size].best_gap = score_row1 - gap_open_extend;
1831 score_row1 -= gap_extend;
1832 edit_script_row[b_size] = SCRIPT_OOF_OPEN_GAP |
1833 SCRIPT_GAP_IN_B;
1834
1835 score_array[b_size+1].best = score_row2;
1836 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
1837 score_row2 -= gap_extend;
1838 edit_script_row[b_size+1] = SCRIPT_OOF_OPEN_GAP |
1839 SCRIPT_GAP_IN_B;
1840
1841 score_array[b_size+2].best = score_row3;
1842 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
1843 score_row3 -= gap_extend;
1844 edit_script_row[b_size+2] = SCRIPT_OOF_OPEN_GAP |
1845 SCRIPT_GAP_IN_B;
1846 b_size += 3;
1847 score -= gap_extend;
1848 }
1849 }
1850
1851 /* update the memory allocator to reflect the exact number
1852 of traceback cells this row needed */
1853
1854 b_size = MIN(b_size, N + 1);
1855 state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
1856
1857 /* chop off the best score in each frame */
1858
1859 last_b_index = MIN(b_size + 4, N + 3);
1860 while (b_size < last_b_index) {
1861 score_array[b_size].best = MININT;
1862 score_array[b_size].best_gap = MININT;
1863 b_size++;
1864 }
1865 }
1866
1867 a_index = *a_offset;
1868 b_index = *b_offset;
1869 script = SCRIPT_AHEAD_ONE_FRAME;
1870
1871 while (a_index > 0 || b_index > 0) {
1872 /* Retrieve the next action to perform. Rows of
1873 the traceback array do not necessarily start
1874 at offset zero of B, so a correction is needed
1875 to point to the correct position */
1876
1877 next_script =
1878 edit_script[a_index][b_index - edit_start_offset[a_index]];
1879
1880 switch (script) {
1881 case SCRIPT_GAP_IN_A:
1882 script = next_script & SCRIPT_OP_MASK;
1883 if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_A))
1884 script = SCRIPT_GAP_IN_A;
1885 break;
1886 case SCRIPT_GAP_IN_B:
1887 script = next_script & SCRIPT_OP_MASK;
1888 if (next_script & (SCRIPT_OOF_OPEN_GAP | SCRIPT_EXTEND_GAP_B))
1889 script = SCRIPT_GAP_IN_B;
1890 break;
1891 default:
1892 script = next_script & SCRIPT_OP_MASK;
1893 break;
1894 }
1895
1896 if (script == SCRIPT_GAP_IN_B) {
1897 b_index -= 3;
1898 }
1899 else {
1900 b_index -= script;
1901 a_index--;
1902 }
1903
1904 GapPrelimEditBlockAdd(edit_block, (EGapAlignOpType)script, 1);
1905 }
1906
1907 sfree(edit_start_offset);
1908 sfree(edit_script);
1909
1910 if (!reversed)
1911 *b_offset -= 2;
1912 return best_score;
1913 }
1914
1915 /** Low level function to perform gapped extension with out-of-frame
1916 * gapping with or without traceback.
1917 * @param A The query sequence [in]
1918 * @param B The subject sequence [in]
1919 * @param M Maximal extension length in query [in]
1920 * @param N Maximal extension length in subject [in]
1921 * @param a_offset Resulting starting offset in query [out]
1922 * @param b_offset Resulting starting offset in subject [out]
1923 * @param score_only Only find the score, without saving traceback [in]
1924 * @param edit_block Structure to hold generated traceback [out]
1925 * @param gap_align Structure holding various information and allocated
1926 * memory for the gapped alignment [in]
1927 * @param score_params Parameters related to scoring [in]
1928 * @param query_offset The starting offset in query [in]
1929 * @param reversed Has the sequence been reversed? Used for psi-blast [in]
1930 * @return The best alignment score found.
1931 */
1932 static Int4
s_OutOfFrameGappedAlign(const Uint1 * A,const Uint1 * B,Int4 M,Int4 N,Int4 * a_offset,Int4 * b_offset,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 query_offset,Boolean reversed)1933 s_OutOfFrameGappedAlign(const Uint1* A, const Uint1* B, Int4 M, Int4 N,
1934 Int4* a_offset, Int4* b_offset, Boolean score_only,
1935 GapPrelimEditBlock *edit_block, BlastGapAlignStruct* gap_align,
1936 const BlastScoringParameters* score_params,
1937 Int4 query_offset, Boolean reversed)
1938 {
1939 Int4 i, increment; /* sequence pointers and indices */
1940 Int4 a_index;
1941 Int4 b_index, b_size, first_b_index, last_b_index;
1942
1943 Int4 gap_open; /* alignment penalty variables */
1944 Int4 gap_extend;
1945 Int4 gap_open_extend;
1946 Int4 shift_penalty;
1947 Int4 x_dropoff;
1948
1949 BlastGapDP* score_array;
1950 Int4 num_extra_cells;
1951
1952 Int4** matrix = NULL; /* pointers to the score matrix */
1953 Int4** pssm = NULL;
1954 Int4* matrix_row = NULL;
1955
1956 Int4 score; /* score tracking variables */
1957 Int4 score_row1;
1958 Int4 score_row2;
1959 Int4 score_row3;
1960 Int4 score_gap_col;
1961 Int4 score_col1;
1962 Int4 score_col2;
1963 Int4 score_col3;
1964 Int4 score_other_frame1;
1965 Int4 score_other_frame2;
1966 Int4 best_score;
1967
1968 if (!score_only) {
1969 return s_OutOfFrameAlignWithTraceback(A, B, M, N, a_offset, b_offset,
1970 edit_block, gap_align,
1971 score_params, query_offset,
1972 reversed);
1973 }
1974
1975 /* do initialization and sanity-checking */
1976
1977 matrix = gap_align->sbp->matrix->data;
1978 if (gap_align->positionBased) {
1979 pssm = gap_align->sbp->psi_matrix->pssm->data;
1980 }
1981 *a_offset = 0;
1982 *b_offset = -2;
1983 gap_open = score_params->gap_open;
1984 gap_extend = score_params->gap_extend;
1985 gap_open_extend = gap_open + gap_extend;
1986 shift_penalty = score_params->shift_pen;
1987 x_dropoff = gap_align->gap_x_dropoff;
1988
1989 if (x_dropoff < gap_open_extend)
1990 x_dropoff = gap_open_extend;
1991
1992 if(N <= 0 || M <= 0)
1993 return 0;
1994
1995 /* Allocate and fill in the auxiliary bookeeping structures.
1996 Since A and B could be very large, maintain a window
1997 of auxiliary structures only large enough to contain to current
1998 set of DP computations. The initial window size is determined
1999 by the number of cells needed to fail the x-dropoff test */
2000
2001 if (gap_extend > 0)
2002 num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
2003 else
2004 num_extra_cells = N + 5;
2005
2006 if (num_extra_cells > gap_align->dp_mem_alloc) {
2007 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
2008 2 * gap_align->dp_mem_alloc);
2009 sfree(gap_align->dp_mem);
2010 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
2011 sizeof(BlastGapDP));
2012 }
2013
2014 score_array = gap_align->dp_mem;
2015 score = -gap_open_extend;
2016 score_array[0].best = 0;
2017 score_array[0].best_gap = -gap_open_extend;
2018
2019 for (i = 3; i <= N + 2; i += 3) {
2020 score_array[i].best = score;
2021 score_array[i].best_gap = score - gap_open_extend;
2022 score_array[i-1].best = MININT;
2023 score_array[i-1].best_gap = MININT;
2024 score_array[i-2].best = MININT;
2025 score_array[i-2].best_gap = MININT;
2026 if (score < -x_dropoff)
2027 break;
2028 score -= gap_extend;
2029 }
2030
2031 /* The inner loop below examines letters of B from
2032 index 'first_b_index' to 'b_size' */
2033
2034 b_size = i - 2;
2035 score_array[i].best = MININT;
2036 score_array[i].best_gap = MININT;
2037
2038 best_score = 0;
2039 first_b_index = 0;
2040 if (reversed) {
2041 increment = -1;
2042 }
2043 else {
2044 /* Allow for a backwards frame shift */
2045 B -= 2;
2046 increment = 1;
2047 }
2048
2049 for (a_index = 1; a_index <= M; a_index++) {
2050
2051 /* pick out the row of the score matrix
2052 appropriate for A[a_index] */
2053
2054 if (!(gap_align->positionBased)) {
2055 matrix_row = matrix[ A[ a_index * increment ] ];
2056 }
2057 else {
2058 if(reversed)
2059 matrix_row = pssm[M - a_index];
2060 else
2061 matrix_row = pssm[a_index + query_offset];
2062 }
2063
2064 /* initialize running-score variables */
2065 score = MININT;
2066 score_row1 = MININT;
2067 score_row2 = MININT;
2068 score_row3 = MININT;
2069 score_gap_col = MININT;
2070 score_col1 = MININT;
2071 score_col2 = MININT;
2072 score_col3 = MININT;
2073 score_other_frame1 = MININT;
2074 score_other_frame2 = MININT;
2075 last_b_index = first_b_index;
2076 b_index = first_b_index;
2077
2078 while (b_index < b_size) {
2079
2080 /* FRAME 0 */
2081
2082 /* Pick the best score among all frames */
2083 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2084 score = MAX(score, score_col1) +
2085 matrix_row[ B[ b_index * increment ] ];
2086 score_other_frame1 = MAX(score_col1, score_array[b_index].best);
2087 score_col1 = score_array[b_index].best;
2088 score_gap_col = score_array[b_index].best_gap;
2089
2090 /* Use the row and column scores if they improve
2091 the score overall */
2092
2093 if (score < MAX(score_gap_col, score_row1)) {
2094 score = MAX(score_gap_col, score_row1);
2095 if (best_score - score > x_dropoff) {
2096
2097 /* the current best score failed the X-dropoff
2098 criterion. Note that this does not stop the
2099 inner loop, only forces future iterations to
2100 skip this column of B.
2101
2102 Also, if the very first letter of B that was
2103 tested failed the X dropoff criterion, make
2104 sure future inner loops start one letter to
2105 the right */
2106
2107 if (first_b_index == b_index)
2108 first_b_index = b_index + 1;
2109 else
2110 score_array[b_index].best = MININT;
2111 }
2112 else {
2113 /* update the row and column running scores */
2114 last_b_index = b_index;
2115 score_array[b_index].best = score;
2116 score_array[b_index].best_gap = score_gap_col - gap_extend;
2117 score_row1 -= gap_extend;
2118 }
2119 }
2120 else {
2121 if (best_score - score > x_dropoff) {
2122
2123 /* the current best score failed the X-dropoff
2124 criterion. */
2125
2126 if (first_b_index == b_index)
2127 first_b_index = b_index + 1;
2128 else
2129 score_array[b_index].best = MININT;
2130 }
2131 else {
2132 /* The current best score exceeds the
2133 row and column scores, and thus may
2134 improve on the current optimal score */
2135
2136 last_b_index = b_index;
2137 score_array[b_index].best = score;
2138 if (score > best_score) {
2139 best_score = score;
2140 *a_offset = a_index;
2141 *b_offset = b_index;
2142 }
2143
2144 /* compute the best scores that include gaps
2145 or gap extensions */
2146
2147 score -= gap_open_extend;
2148 score_row1 -= gap_extend;
2149 score_row1 = MAX(score, score_row1);
2150 score_array[b_index].best_gap = MAX(score,
2151 score_gap_col - gap_extend);
2152 }
2153 }
2154
2155 /* If this was the last letter of B checked, rotate
2156 the row scores so that code beyond the inner loop
2157 works correctly */
2158
2159 if (++b_index >= b_size) {
2160 score = score_row1;
2161 score_row1 = score_row2;
2162 score_row2 = score_row3;
2163 score_row3 = score;
2164 break;
2165 }
2166
2167 /* FRAME 1 */
2168
2169 /* This code, and that for Frame 2, are essentially the
2170 same as the preceeding code. The only real difference
2171 is the updating of the other_frame best scores */
2172
2173 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2174 score = MAX(score, score_col2) +
2175 matrix_row[ B[ b_index * increment ] ];
2176 score_other_frame2 = MAX(score_col2, score_array[b_index].best);
2177 score_col2 = score_array[b_index].best;
2178 score_gap_col = score_array[b_index].best_gap;
2179
2180 if (score < MAX(score_gap_col, score_row2)) {
2181 score = MAX(score_gap_col, score_row2);
2182 if (best_score - score > x_dropoff) {
2183 if (first_b_index == b_index)
2184 first_b_index = b_index + 1;
2185 else
2186 score_array[b_index].best = MININT;
2187 }
2188 else {
2189 last_b_index = b_index;
2190 score_array[b_index].best = score;
2191 score_array[b_index].best_gap = score_gap_col - gap_extend;
2192 score_row2 -= gap_extend;
2193 }
2194 }
2195 else {
2196 if (best_score - score > x_dropoff) {
2197 if (first_b_index == b_index)
2198 first_b_index = b_index + 1;
2199 else
2200 score_array[b_index].best = MININT;
2201 }
2202 else {
2203 last_b_index = b_index;
2204 score_array[b_index].best = score;
2205 if (score > best_score) {
2206 best_score = score;
2207 *a_offset = a_index;
2208 *b_offset = b_index;
2209 }
2210 score -= gap_open_extend;
2211 score_row2 -= gap_extend;
2212 score_row2 = MAX(score, score_row2);
2213 score_array[b_index].best_gap = MAX(score,
2214 score_gap_col - gap_extend);
2215 }
2216 }
2217
2218 if (++b_index >= b_size) {
2219 score = score_row2;
2220 score_row2 = score_row1;
2221 score_row1 = score_row3;
2222 score_row3 = score;
2223 break;
2224 }
2225
2226 /* FRAME 2 */
2227
2228 score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
2229 score = MAX(score, score_col3) +
2230 matrix_row[ B[ b_index * increment ] ];
2231 score_other_frame1 = score_other_frame2;
2232 score_other_frame2 = MAX(score_col3, score_array[b_index].best);
2233 score_col3 = score_array[b_index].best;
2234 score_gap_col = score_array[b_index].best_gap;
2235
2236 if (score < MAX(score_gap_col, score_row3)) {
2237 score = MAX(score_gap_col, score_row3);
2238 if (best_score - score > x_dropoff) {
2239 if (first_b_index == b_index)
2240 first_b_index = b_index + 1;
2241 else
2242 score_array[b_index].best = MININT;
2243 }
2244 else {
2245 last_b_index = b_index;
2246 score_array[b_index].best = score;
2247 score_array[b_index].best_gap = score_gap_col - gap_extend;
2248 score_row3 -= gap_extend;
2249 }
2250 }
2251 else {
2252 if (best_score - score > x_dropoff) {
2253 if (first_b_index == b_index)
2254 first_b_index = b_index + 1;
2255 else
2256 score_array[b_index].best = MININT;
2257 }
2258 else {
2259 last_b_index = b_index;
2260 score_array[b_index].best = score;
2261 if (score > best_score) {
2262 best_score = score;
2263 *a_offset = a_index;
2264 *b_offset = b_index;
2265 }
2266 score -= gap_open_extend;
2267 score_row3 -= gap_extend;
2268 score_row3 = MAX(score, score_row3);
2269 score_array[b_index].best_gap = MAX(score,
2270 score_gap_col - gap_extend);
2271 }
2272 }
2273 b_index++;
2274 }
2275
2276 /* Finish aligning if the best scores for all positions
2277 of B will fail the X-dropoff test, i.e. the inner loop
2278 bounds have converged to each other */
2279
2280 if (first_b_index == b_size)
2281 break;
2282
2283 /* Enlarge the window for score data, if necessary */
2284
2285 if (b_size + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
2286
2287 gap_align->dp_mem_alloc = MAX(b_size + num_extra_cells + 100,
2288 2 * gap_align->dp_mem_alloc);
2289 score_array = (BlastGapDP *)realloc(score_array,
2290 gap_align->dp_mem_alloc *
2291 sizeof(BlastGapDP));
2292 gap_align->dp_mem = score_array;
2293 }
2294
2295 if (last_b_index < b_size - 1) {
2296 /* This row failed the X-dropoff test earlier than
2297 the last row did; just shorten the loop bounds
2298 before doing the next row */
2299
2300 b_size = last_b_index + 1;
2301 }
2302 else {
2303 /* The inner loop finished without failing the X-dropoff
2304 test; initialize extra bookkeeping structures until
2305 the X dropoff test fails or we run out of letters in B.
2306 The next inner loop will have larger bounds.
2307
2308 Keep initializing extra structures until the X dropoff
2309 test fails in all frames for this row */
2310
2311 score = MAX(score_row1, score_row2);
2312 score = MAX(score, score_row3);
2313 while (score >= (best_score - x_dropoff) && b_size < N + 1) {
2314
2315 score_array[b_size].best = score_row1;
2316 score_array[b_size].best_gap = score_row1 - gap_open_extend;
2317 score_row1 -= gap_extend;
2318
2319 score_array[b_size+1].best = score_row2;
2320 score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
2321 score_row2 -= gap_extend;
2322
2323 score_array[b_size+2].best = score_row3;
2324 score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
2325 score_row3 -= gap_extend;
2326
2327 b_size += 3;
2328 score -= gap_extend;
2329 }
2330 }
2331
2332 /* chop off the best score in each frame */
2333 b_size = MIN(b_size, N + 1);
2334 last_b_index = MIN(b_size + 4, N + 3);
2335 while (b_size < last_b_index) {
2336 score_array[b_size].best = MININT;
2337 score_array[b_size].best_gap = MININT;
2338 b_size++;
2339 }
2340 }
2341
2342 if (!reversed) {
2343 /* The sequence was shifted, so length should be adjusted as well */
2344 *b_offset -= 2;
2345 }
2346 return best_score;
2347 }
2348
2349 /** Simple wrapper around binary search function for BlastQueryInfo structure
2350 * to obtain the context in which this initial ungapped HSP is found
2351 * @param query_info Query information structure [in]
2352 * @param init_hsp Initial HSP [in]
2353 */
2354 static NCBI_INLINE Int4
s_GetUngappedHSPContext(const BlastQueryInfo * query_info,const BlastInitHSP * init_hsp)2355 s_GetUngappedHSPContext(const BlastQueryInfo* query_info,
2356 const BlastInitHSP* init_hsp)
2357 {
2358 return BSearchContextInfo(init_hsp->offsets.qs_offsets.q_off, query_info);
2359 }
2360
2361 /** Adjust the HSP offsets in the initial ungapped HSP structure given the
2362 * query start
2363 * @param init_hsp Initial HSP [in|out]
2364 * @param query_start Starting offset of the query sequence in the query info
2365 * structure [in]
2366 */
2367 static NCBI_INLINE void
s_AdjustInitialHSPOffsets(BlastInitHSP * init_hsp,Int4 query_start)2368 s_AdjustInitialHSPOffsets(BlastInitHSP* init_hsp, Int4 query_start)
2369 {
2370 init_hsp->offsets.qs_offsets.q_off -= query_start;
2371 if (init_hsp->ungapped_data) {
2372 init_hsp->ungapped_data->q_start -= query_start;
2373 }
2374 ASSERT(init_hsp->ungapped_data == NULL ||
2375 init_hsp->ungapped_data->q_start >= 0);
2376 }
2377
2378 /** Set up a BLAST_SequenceBlk structure for a single query sequence
2379 * @param concatenated_query Query sequence block for concatenated query [in]
2380 * @param query_info Query information structure for concatenated query [in]
2381 * @param context Context in which the HSP ocurred [in]
2382 * @param single_query query Output sequence block which will contain adjusted
2383 * sequence pointer and sequence length [out]
2384 * @param query_start Starting offset of the single query sequence in the
2385 * concatenated query [out]
2386 */
2387 static void
s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk * concatenated_query,const BlastQueryInfo * query_info,Int4 context,BLAST_SequenceBlk * single_query,Int4 * query_start)2388 s_SetUpLocalBlastSequenceBlk(const BLAST_SequenceBlk* concatenated_query,
2389 const BlastQueryInfo* query_info, Int4 context,
2390 BLAST_SequenceBlk* single_query,
2391 Int4* query_start)
2392 {
2393 Int4 query_length = 0;
2394 if (concatenated_query->oof_sequence) {
2395 /* Out-of-frame blastx case: all frames of the same parity are mixed
2396 together in a special sequence. */
2397 Int4 query_end_pt = 0;
2398 Int4 mixed_frame_context = context - context % CODON_LENGTH;
2399
2400 *query_start = query_info->contexts[mixed_frame_context].query_offset;
2401 query_end_pt =
2402 query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_offset +
2403 query_info->contexts[mixed_frame_context+CODON_LENGTH-1].query_length;
2404 query_length = query_end_pt - *query_start;
2405 single_query->sequence = NULL;
2406 single_query->oof_sequence = concatenated_query->oof_sequence + *query_start;
2407 } else {
2408 *query_start = query_info->contexts[context].query_offset;
2409 query_length = query_info->contexts[context].query_length;
2410 single_query->sequence = concatenated_query->sequence + *query_start;
2411 single_query->oof_sequence = NULL;
2412 }
2413 single_query->length = query_length;
2414 }
2415
2416
2417 /** Find the HSP offsets relative to the individual query sequence instead of
2418 * the concatenated sequence and retrieve relevant portions of the query
2419 * sequence data.
2420 * @param query Query sequence block [in]
2421 * @param query_info Query information structure, including context offsets
2422 * array [in]
2423 * @param init_hsp Initial HSP, this will be modified to hold the adjusted
2424 * offsets [in] [out]
2425 * @param query_out query sequence block with modified sequence
2426 * pointer and sequence length [out]
2427 * @param context Which context this HSP belongs to? [out]
2428 */
2429 static void
s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk * query,const BlastQueryInfo * query_info,BlastInitHSP * init_hsp,BLAST_SequenceBlk * query_out,Int4 * context)2430 s_AdjustHspOffsetsAndGetQueryData(const BLAST_SequenceBlk* query,
2431 const BlastQueryInfo* query_info,
2432 BlastInitHSP* init_hsp,
2433 BLAST_SequenceBlk* query_out,
2434 Int4* context)
2435 {
2436 Int4 query_start = 0;
2437
2438 ASSERT(query);
2439 ASSERT(query_info);
2440 ASSERT(query_out);
2441 ASSERT(init_hsp);
2442 ASSERT(context);
2443
2444 ASSERT(init_hsp->ungapped_data == NULL ||
2445 init_hsp->ungapped_data->q_start >= 0);
2446 *context = s_GetUngappedHSPContext(query_info, init_hsp);
2447 s_SetUpLocalBlastSequenceBlk(query, query_info, *context,
2448 query_out, &query_start);
2449 ASSERT(init_hsp->ungapped_data == NULL ||
2450 (init_hsp->ungapped_data->q_start-query_start) >= 0);
2451 s_AdjustInitialHSPOffsets(init_hsp, query_start);
2452 }
2453
2454
2455 /** Convert the initial list of traceback actions from a non-OOF
2456 * gapped alignment into a blast edit script. Note that this routine
2457 * assumes the input edit blocks have not been reversed or rearranged
2458 * by calling code
2459 * @param rev_prelim_tback Traceback from extension to the left [in]
2460 * @param fwd_prelim_tback Traceback from extension to the right [in]
2461 * @return Pointer to the resulting edit script, or NULL if there
2462 * are no traceback actions specified
2463 */
2464 GapEditScript*
Blast_PrelimEditBlockToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback)2465 Blast_PrelimEditBlockToGapEditScript (GapPrelimEditBlock* rev_prelim_tback,
2466 GapPrelimEditBlock* fwd_prelim_tback)
2467 {
2468 Boolean merge_ops = FALSE;
2469 GapEditScript* esp;
2470 GapPrelimEditScript *op;
2471 Int4 i;
2472 Int4 index=0;
2473 Int4 size = 0;
2474
2475 if (rev_prelim_tback == NULL || fwd_prelim_tback == NULL)
2476 return NULL;
2477
2478 /* The fwd_prelim_tback script will get reversed here as the traceback started from the highest scoring point
2479 and worked backwards. The rev_prelim_tback script does NOT get reversed. Since it was reversed when the
2480 traceback was produced it's already "forward" */
2481
2482 if (fwd_prelim_tback->num_ops > 0 && rev_prelim_tback->num_ops > 0 &&
2483 fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].op_type ==
2484 rev_prelim_tback->edit_ops[(rev_prelim_tback->num_ops)-1].op_type)
2485 merge_ops = TRUE;
2486
2487 size = fwd_prelim_tback->num_ops+rev_prelim_tback->num_ops;
2488 if (merge_ops)
2489 size--;
2490
2491 esp = GapEditScriptNew(size);
2492
2493 index = 0;
2494 for (i=0; i < rev_prelim_tback->num_ops; i++) {
2495 op = rev_prelim_tback->edit_ops + i;
2496 esp->op_type[index] = op->op_type;
2497 esp->num[index] = op->num;
2498 index++;
2499 }
2500
2501 if (fwd_prelim_tback->num_ops == 0)
2502 return esp;
2503
2504 if (merge_ops)
2505 esp->num[index-1] += fwd_prelim_tback->edit_ops[(fwd_prelim_tback->num_ops)-1].num;
2506
2507 /* If we merge, then we skip the first one. */
2508 if (merge_ops)
2509 i = fwd_prelim_tback->num_ops - 2;
2510 else
2511 i = fwd_prelim_tback->num_ops - 1;
2512
2513 for (; i >= 0; i--) {
2514 op = fwd_prelim_tback->edit_ops + i;
2515 esp->op_type[index] = op->op_type;
2516 esp->num[index] = op->num;
2517 index++;
2518 }
2519
2520 return esp;
2521 }
2522
2523 /** Fills the BlastGapAlignStruct structure with the results
2524 * of a greedy gapped extension.
2525 * @param gap_align the initialized gapped alignment structure [in] [out]
2526 * @param q_start The starting offset in query [in]
2527 * @param s_start The starting offset in subject [in]
2528 * @param q_end The ending offset in query [in]
2529 * @param s_end The ending offset in subject [in]
2530 * @param q_seed_start The query offset of the alignment start point [in]
2531 * @param s_seed_start The subject offset of the alignment start point [in]
2532 * @param score The alignment score [in]
2533 * @param esp The edit script containing the traceback information [in]
2534 */
2535 static Int2
s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct * gap_align,Int4 q_start,Int4 s_start,Int4 q_end,Int4 s_end,Int4 q_seed_start,Int4 s_seed_start,Int4 score,GapEditScript * esp)2536 s_BlastGreedyGapAlignStructFill(BlastGapAlignStruct* gap_align,
2537 Int4 q_start, Int4 s_start,
2538 Int4 q_end, Int4 s_end,
2539 Int4 q_seed_start, Int4 s_seed_start,
2540 Int4 score, GapEditScript* esp)
2541 {
2542 gap_align->query_start = q_start;
2543 gap_align->query_stop = q_end;
2544 gap_align->subject_start = s_start;
2545 gap_align->subject_stop = s_end;
2546 gap_align->greedy_query_seed_start = q_seed_start;
2547 gap_align->greedy_subject_seed_start = s_seed_start;
2548 gap_align->score = score;
2549
2550 if (esp)
2551 gap_align->edit_script = esp;
2552
2553 return 0;
2554 }
2555
s_UpdateEditScript(GapEditScript * esp,int pos,int bf,int af)2556 static void s_UpdateEditScript(GapEditScript* esp, int pos, int bf, int af) {
2557 int op, qd, sd;
2558
2559 if (bf > 0) {
2560 op = pos;
2561 qd = sd = bf;
2562 do {
2563 if (--op < 0) return;
2564 switch(esp->op_type[op]) {
2565 case eGapAlignSub:
2566 qd -= esp->num[op];
2567 sd -= esp->num[op];
2568 break;
2569 case eGapAlignIns:
2570 qd -= esp->num[op];
2571 break;
2572 case eGapAlignDel:
2573 sd -= esp->num[op];
2574 default:
2575 break;
2576 }
2577 } while (qd > 0 || sd > 0);
2578
2579 esp->num[op] = -MAX(qd, sd);
2580 esp->op_type[op++] = eGapAlignSub;
2581 for (; op < pos-1; op++) esp->num[op] = 0;
2582 esp->num[pos] += bf;
2583 qd -= sd;
2584 esp->op_type[pos-1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2585 esp->num[pos-1] = (qd>0) ? qd : -qd;
2586 }
2587
2588 if (af > 0) {
2589 op = pos;
2590 qd = sd = af;
2591 do {
2592 if (++op >= esp->size) return;
2593 switch(esp->op_type[op]) {
2594 case eGapAlignSub:
2595 qd -= esp->num[op];
2596 sd -= esp->num[op];
2597 break;
2598 case eGapAlignIns:
2599 qd -= esp->num[op];
2600 break;
2601 case eGapAlignDel:
2602 sd -= esp->num[op];
2603 default:
2604 break;
2605 }
2606 } while (qd > 0 || sd > 0);
2607
2608 esp->num[op] = -MAX(qd, sd);
2609 esp->op_type[op--] = eGapAlignSub;
2610 for (; op > pos+1; op--) esp->num[op] = 0;
2611 esp->num[pos] += af;
2612 qd -= sd;
2613 esp->op_type[pos+1] = (qd>0) ? eGapAlignDel: eGapAlignIns;
2614 esp->num[pos+1] = (qd>0) ? qd : -qd;
2615 }
2616 }
2617
s_RebuildEditScript(GapEditScript * esp)2618 static void s_RebuildEditScript(GapEditScript* esp) {
2619 int i, j;
2620 for (i=0, j=-1; i<esp->size; i++) {
2621 if (esp->num[i] == 0) continue;
2622 if (j>=0 && esp->op_type[i] == esp->op_type[j]) {
2623 esp->num[j] += esp->num[i];
2624 } else if (j==-1 || esp->op_type[i] == eGapAlignSub
2625 || esp->op_type[j] == eGapAlignSub) {
2626 esp->op_type[++j] = esp->op_type[i];
2627 esp->num[j] = esp->num[i];
2628 } else {
2629 int d = esp->num[j] - esp->num[i];
2630 if (d > 0) {
2631 esp->num[j-1] += esp->num[i];
2632 esp->num[j] = d;
2633 } else if (d < 0) {
2634 esp->num[j-1] += esp->num[j];
2635 esp->num[j] = -d;
2636 esp->op_type[j] = esp->op_type[i];
2637 } else {
2638 esp->num[j-1] += esp->num[j];
2639 --j;
2640 }
2641 }
2642 }
2643 esp->size = ++j;
2644 }
2645
s_ReduceGaps(GapEditScript * esp,const Uint1 * q,const Uint1 * s,const Uint1 * qf,const Uint1 * sf)2646 static void s_ReduceGaps(GapEditScript* esp, const Uint1 *q, const Uint1 *s,
2647 const Uint1 *qf,const Uint1 *sf){
2648 int i, j, nm1, nm2, d;
2649 const Uint1 *q1, *s1;
2650
2651 for (q1=q, s1=s, i=0; i<esp->size; i++) {
2652 if (esp->num[i] == 0) continue;
2653 if (esp->op_type[i] == eGapAlignSub) {
2654 if(esp->num[i] >= 12) {
2655 nm1 = 1;
2656 if (i > 0) {
2657 while (q1-nm1>=q && (*(q1-nm1) == *(s1-nm1))) ++nm1;
2658 }
2659 q1 += esp->num[i];
2660 s1 += esp->num[i];
2661 nm2 = 0;
2662 if (i < esp->size -1) {
2663 while ((q1+1<qf) && (s1+1<sf) && (*(q1++) == *(s1++))) ++nm2;
2664 }
2665 if (nm1>1 || nm2>0) s_UpdateEditScript(esp, i, nm1-1, nm2);
2666 q1--; s1--;
2667 } else {
2668 q1 += esp->num[i];
2669 s1 += esp->num[i];
2670 }
2671 } else if (esp->op_type[i] == eGapAlignIns) {
2672 q1 += esp->num[i];
2673 } else {
2674 s1 += esp->num[i];
2675 }
2676 }
2677 s_RebuildEditScript(esp);
2678
2679 for (i=0; i<esp->size; i++) {
2680 if (esp->op_type[i] == eGapAlignSub) {
2681 q += esp->num[i];
2682 s += esp->num[i];
2683 continue;
2684 }
2685 if (i>1 && esp->op_type[i] != esp->op_type[i-2]
2686 && esp->num[i-2] > 0) {
2687 d = esp->num[i] + esp->num[i-1] + esp->num[i-2];
2688 if (d == 3) {
2689 /* special case, no need to do further testing */
2690 (esp->num[i-2]) = 0;
2691 (esp->num[i-1]) = 2;
2692 (esp->num[i]) = 0;
2693 if (esp->op_type[i] == eGapAlignIns) {
2694 ++q;
2695 } else {
2696 ++s;
2697 }
2698 } else if (d < 12) {
2699 /* Try reducing this sub... */
2700 nm1 = 0;
2701 nm2 = 0;
2702 d = MIN(esp->num[i], esp->num[i-2]);
2703 q -= esp->num[i-1];
2704 s -= esp->num[i-1];
2705 q1 = q;
2706 s1 = s;
2707 if (esp->op_type[i] == eGapAlignIns) {
2708 s -= d;
2709 } else {
2710 q -= d;
2711 }
2712 for (j=0; j<esp->num[i-1]; ++j, ++q1, ++s1, ++q, ++s) {
2713 if (*q1 == *s1) nm1++;
2714 if (*q == *s) nm2++;
2715 }
2716 for (j=0; j<d; ++j, ++q, ++s) {
2717 if (*q == *s) nm2++;
2718 }
2719 if (nm2 >= nm1 - d) {
2720 (esp->num[i-2]) -= d;
2721 (esp->num[i-1]) += d;
2722 (esp->num[i]) -= d;
2723 } else {
2724 q = q1;
2725 s = s1;
2726 }
2727 }
2728 }
2729 if (esp->op_type[i] == eGapAlignIns) {
2730 q += esp->num[i];
2731 } else {
2732 s += esp->num[i];
2733 }
2734 }
2735 s_RebuildEditScript(esp);
2736 }
2737
2738 Int2
BLAST_GreedyGappedAlignment(const Uint1 * query,const Uint1 * subject,Int4 query_length,Int4 subject_length,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_off,Int4 s_off,Boolean compressed_subject,Boolean do_traceback,Boolean * fence_hit)2739 BLAST_GreedyGappedAlignment(const Uint1* query, const Uint1* subject,
2740 Int4 query_length, Int4 subject_length, BlastGapAlignStruct* gap_align,
2741 const BlastScoringParameters* score_params,
2742 Int4 q_off, Int4 s_off, Boolean compressed_subject, Boolean do_traceback,
2743 Boolean * fence_hit)
2744 {
2745 const Uint1* q;
2746 const Uint1* s;
2747 Int4 score;
2748 Int4 X;
2749 Int4 q_avail, s_avail;
2750 Int4 q_ext_l, q_ext_r, s_ext_l, s_ext_r;
2751 GapPrelimEditBlock *fwd_prelim_tback = NULL;
2752 GapPrelimEditBlock *rev_prelim_tback = NULL;
2753 SGreedySeed fwd_start_point;
2754 SGreedySeed rev_start_point;
2755 Uint1 rem;
2756 GapEditScript* esp = NULL;
2757 Int4 q_seed_start = q_off;
2758 Int4 s_seed_start = s_off;
2759
2760 q_avail = query_length - q_off;
2761 s_avail = subject_length - s_off;
2762
2763 q = query + q_off;
2764 if (!compressed_subject) {
2765 s = subject + s_off;
2766 rem = 4; /* Special value to indicate that sequence is already
2767 uncompressed */
2768 } else {
2769 s = subject + s_off/4;
2770 rem = s_off % 4;
2771 }
2772
2773 X = gap_align->gap_x_dropoff;
2774
2775 if (do_traceback) {
2776 fwd_prelim_tback = gap_align->fwd_prelim_tback;
2777 rev_prelim_tback = gap_align->rev_prelim_tback;
2778 GapPrelimEditBlockReset(fwd_prelim_tback);
2779 GapPrelimEditBlockReset(rev_prelim_tback);
2780 }
2781
2782 /* extend to the right */
2783 while (TRUE) {
2784 Int4 new_dist, xdrop;
2785 score = BLAST_AffineGreedyAlign(q, q_avail, s, s_avail, FALSE, X,
2786 score_params->reward, -score_params->penalty,
2787 score_params->gap_open, score_params->gap_extend,
2788 &q_ext_r, &s_ext_r, gap_align->greedy_align_mem,
2789 fwd_prelim_tback, rem, fence_hit, &fwd_start_point);
2790 if (score >=0) break;
2791
2792 /* double the max distance */
2793 new_dist = gap_align->greedy_align_mem->max_dist * 2;
2794 xdrop = gap_align->greedy_align_mem->xdrop;
2795 s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2796 gap_align->greedy_align_mem =
2797 s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2798
2799 if (!gap_align->greedy_align_mem) {
2800 gap_align = BLAST_GapAlignStructFree(gap_align);
2801 return -1;
2802 }
2803 }
2804
2805 if (compressed_subject)
2806 rem = 0;
2807
2808 /* extend to the left */
2809 while (TRUE) {
2810 Int4 new_dist, xdrop, score1;
2811 score1 = BLAST_AffineGreedyAlign(query, q_off,
2812 subject, s_off, TRUE, X,
2813 score_params->reward, -score_params->penalty,
2814 score_params->gap_open, score_params->gap_extend,
2815 &q_ext_l, &s_ext_l, gap_align->greedy_align_mem,
2816 rev_prelim_tback, rem, fence_hit, &rev_start_point);
2817 if (score1 >=0) {
2818 score += score1;
2819 break;
2820 }
2821
2822 /* double the max distance */
2823 new_dist = gap_align->greedy_align_mem->max_dist * 2;
2824 xdrop = gap_align->greedy_align_mem->xdrop;
2825 s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
2826 gap_align->greedy_align_mem =
2827 s_BlastGreedyAlignMemAlloc(score_params, NULL, new_dist, xdrop);
2828
2829 if (!gap_align->greedy_align_mem) {
2830 gap_align = BLAST_GapAlignStructFree(gap_align);
2831 return -1;
2832 }
2833 }
2834
2835 /* In basic case the greedy algorithm returns number of
2836 differences, hence we need to convert it to score */
2837 if (score_params->gap_open==0 && score_params->gap_extend==0) {
2838 score =
2839 (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*score_params->reward/2 -
2840 score*(score_params->reward - score_params->penalty);
2841 } else if (score_params->reward % 2 == 1) {
2842 score /= 2;
2843 }
2844
2845 if (do_traceback) {
2846 esp = Blast_PrelimEditBlockToGapEditScript(rev_prelim_tback,
2847 fwd_prelim_tback);
2848 /* check for possible gap elimination */
2849 ASSERT(!compressed_subject);
2850 if (esp) s_ReduceGaps(esp, query+q_off-q_ext_l, subject+s_off-s_ext_l,
2851 query+q_off+q_ext_r, subject+s_off+s_ext_r);
2852 }
2853 else {
2854 /* estimate the best alignment start point. This is the middle
2855 of the longest run of exact matches found by the aligner,
2856 subject to the constraint that the start point lies in the
2857 box formed by the optimal alignment */
2858
2859 Int4 q_box_l = q_off - q_ext_l;
2860 Int4 s_box_l = s_off - s_ext_l;
2861 Int4 q_box_r = q_off + q_ext_r;
2862 Int4 s_box_r = s_off + s_ext_r;
2863 Int4 q_seed_start_l = q_off - rev_start_point.start_q;
2864 Int4 s_seed_start_l = s_off - rev_start_point.start_s;
2865 Int4 q_seed_start_r = q_off + fwd_start_point.start_q;
2866 Int4 s_seed_start_r = s_off + fwd_start_point.start_s;
2867 Int4 valid_seed_len_l = 0;
2868 Int4 valid_seed_len_r = 0;
2869
2870 if (q_seed_start_r < q_box_r && s_seed_start_r < s_box_r) {
2871 valid_seed_len_r = MIN(q_box_r - q_seed_start_r,
2872 s_box_r - s_seed_start_r);
2873 valid_seed_len_r = MIN(valid_seed_len_r,
2874 fwd_start_point.match_length) / 2;
2875 } else {
2876 q_seed_start_r = q_off;
2877 s_seed_start_r = s_off;
2878 }
2879
2880 if (q_seed_start_l > q_box_l && s_seed_start_l > s_box_l) {
2881 valid_seed_len_l = MIN(q_seed_start_l - q_box_l,
2882 s_seed_start_l - s_box_l);
2883 valid_seed_len_l = MIN(valid_seed_len_l,
2884 rev_start_point.match_length) / 2;
2885 } else {
2886 q_seed_start_l = q_off;
2887 s_seed_start_l = s_off;
2888 }
2889
2890 if (valid_seed_len_r > valid_seed_len_l) {
2891 q_seed_start = q_seed_start_r + valid_seed_len_r;
2892 s_seed_start = s_seed_start_r + valid_seed_len_r;
2893 }
2894 else {
2895 q_seed_start = q_seed_start_l - valid_seed_len_l;
2896 s_seed_start = s_seed_start_l - valid_seed_len_l;
2897 }
2898 }
2899
2900 s_BlastGreedyGapAlignStructFill(gap_align,
2901 q_off-q_ext_l, s_off-s_ext_l,
2902 q_off+q_ext_r, s_off+s_ext_r,
2903 q_seed_start, s_seed_start,
2904 score, esp);
2905 return 0;
2906 }
2907
2908 /** Performs dynamic programming style gapped extension, given an initial
2909 * HSP (offset pair), two sequence blocks and scoring and extension options.
2910 * Note: traceback is not done in this function.
2911 * @param query_blk The query sequence [in]
2912 * @param subject_blk The subject sequence [in]
2913 * @param gap_align The auxiliary structure for gapped alignment [in]
2914 * @param score_params Parameters related to scoring [in]
2915 * @param init_hsp The initial HSP that needs to be extended [in]
2916 */
2917 static Int2
s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp)2918 s_BlastDynProgNtGappedAlignment(BLAST_SequenceBlk* query_blk,
2919 BLAST_SequenceBlk* subject_blk, BlastGapAlignStruct* gap_align,
2920 const BlastScoringParameters* score_params, BlastInitHSP* init_hsp)
2921 {
2922 Int4 q_length, s_length;
2923 Int4 private_q_start, private_s_start;
2924 Int4 score_right = 0, score_left = 0;
2925 Uint1 offset_adjustment;
2926 Uint1* query = query_blk->sequence;
2927 Uint1* subject = subject_blk->sequence;
2928
2929 /* If subject offset is not at the start of a full byte,
2930 s_BlastAlignPackedNucl won't work, so shift the alignment start
2931 to the next multiple of 4 subject letters. Note that the
2932 shift amount is always nonzero, so a left extension always happens
2933 (and has a few presumed exact matches to start with). In the case
2934 of the smallest ungapped alignment (4 nucleotides), this can
2935 conceivably put the start point at the end of one sequence */
2936
2937 offset_adjustment = COMPRESSION_RATIO -
2938 (init_hsp->offsets.qs_offsets.s_off % COMPRESSION_RATIO);
2939 q_length = init_hsp->offsets.qs_offsets.q_off + offset_adjustment;
2940 s_length = init_hsp->offsets.qs_offsets.s_off + offset_adjustment;
2941
2942 /* This prevents the starting point from being at the end of a sequence
2943 as mentioned in the comment above. */
2944 if (q_length > query_blk->length || s_length > subject_blk->length)
2945 {
2946 q_length -= COMPRESSION_RATIO;
2947 s_length -= COMPRESSION_RATIO;
2948 }
2949
2950 /* perform extension to left */
2951 score_left = s_BlastAlignPackedNucl(query, subject, q_length, s_length,
2952 &private_q_start, &private_s_start, gap_align,
2953 score_params, TRUE);
2954 if (score_left < 0)
2955 return -1;
2956 gap_align->query_start = q_length - private_q_start;
2957 gap_align->subject_start = s_length - private_s_start;
2958
2959 /* perform extension to right */
2960 if (q_length < query_blk->length &&
2961 s_length < subject_blk->length)
2962 {
2963 score_right = s_BlastAlignPackedNucl(query+q_length-1,
2964 subject+(s_length+3)/COMPRESSION_RATIO - 1,
2965 query_blk->length-q_length,
2966 subject_blk->length-s_length, &(gap_align->query_stop),
2967 &(gap_align->subject_stop), gap_align, score_params, FALSE);
2968 if (score_right < 0)
2969 return -1;
2970 gap_align->query_stop += q_length;
2971 gap_align->subject_stop += s_length;
2972 }
2973 else {
2974 gap_align->query_stop = q_length;
2975 gap_align->subject_stop = s_length;
2976 }
2977
2978 gap_align->score = score_right+score_left;
2979
2980 return 0;
2981 }
2982
2983 /** Aligns two nucleotide sequences, one (A) should be packed in the
2984 * same way as the BLAST databases, the other (B) should contain one
2985 * basepair/byte. Traceback is not done in this function.
2986 * @param B The query sequence [in]
2987 * @param A The subject sequence [in]
2988 * @param N Maximal extension length in query [in]
2989 * @param M Maximal extension length in subject [in]
2990 * @param b_offset Resulting starting offset in query [out]
2991 * @param a_offset Resulting starting offset in subject [out]
2992 * @param gap_align The auxiliary structure for gapped alignment [in]
2993 * @param score_params Parameters related to scoring [in]
2994 * @param reverse_sequence Reverse the sequence.
2995 * @return The best alignment score found.
2996 */
2997 static Int4
s_BlastAlignPackedNucl(Uint1 * B,Uint1 * A,Int4 N,Int4 M,Int4 * b_offset,Int4 * a_offset,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Boolean reverse_sequence)2998 s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
2999 Int4* b_offset, Int4* a_offset,
3000 BlastGapAlignStruct* gap_align,
3001 const BlastScoringParameters* score_params,
3002 Boolean reverse_sequence)
3003 {
3004 Int4 i; /* sequence pointers and indices */
3005 Int4 a_index, a_base_pair;
3006 Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
3007 Uint1* b_ptr;
3008
3009 BlastGapDP* score_array;
3010 Int4 num_extra_cells;
3011
3012 Int4 gap_open; /* alignment penalty variables */
3013 Int4 gap_extend;
3014 Int4 gap_open_extend;
3015 Int4 x_dropoff;
3016
3017 Int4* *matrix; /* pointers to the score matrix */
3018 Int4* matrix_row;
3019
3020 Int4 score; /* score tracking variables */
3021 Int4 score_gap_row;
3022 Int4 score_gap_col;
3023 Int4 next_score;
3024 Int4 best_score;
3025
3026 /* do initialization and sanity-checking */
3027
3028 matrix = gap_align->sbp->matrix->data;
3029 *a_offset = 0;
3030 *b_offset = 0;
3031 gap_open = score_params->gap_open;
3032 gap_extend = score_params->gap_extend;
3033 gap_open_extend = gap_open + gap_extend;
3034 x_dropoff = gap_align->gap_x_dropoff;
3035
3036 if (x_dropoff < gap_open_extend)
3037 x_dropoff = gap_open_extend;
3038
3039 if(N <= 0 || M <= 0)
3040 return 0;
3041
3042 /* Allocate and fill in the auxiliary bookeeping structures.
3043 Since A and B could be very large, maintain a window
3044 of auxiliary structures only large enough to contain the current
3045 set of DP computations. The initial window size is determined
3046 by the number of cells needed to fail the x-dropoff test */
3047
3048 if (gap_extend > 0)
3049 num_extra_cells = x_dropoff / gap_extend + 3;
3050 else
3051 num_extra_cells = N + 3;
3052
3053 if (num_extra_cells > gap_align->dp_mem_alloc) {
3054 gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
3055 2 * gap_align->dp_mem_alloc);
3056 sfree(gap_align->dp_mem);
3057 gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
3058 sizeof(BlastGapDP));
3059 }
3060
3061 score_array = gap_align->dp_mem;
3062 score = -gap_open_extend;
3063 score_array[0].best = 0;
3064 score_array[0].best_gap = -gap_open_extend;
3065
3066 for (i = 1; i <= N; i++) {
3067 if (score < -x_dropoff)
3068 break;
3069
3070 score_array[i].best = score;
3071 score_array[i].best_gap = score - gap_open_extend;
3072 score -= gap_extend;
3073 }
3074
3075 /* The inner loop below examines letters of B from
3076 index 'first_b_index' to 'b_size' */
3077
3078 b_size = i;
3079 best_score = 0;
3080 first_b_index = 0;
3081 if (reverse_sequence)
3082 b_increment = -1;
3083 else
3084 b_increment = 1;
3085
3086 for (a_index = 1; a_index <= M; a_index++) {
3087
3088 /* pick out the row of the score matrix
3089 appropriate for A[a_index] */
3090
3091 if(reverse_sequence) {
3092 a_base_pair = NCBI2NA_UNPACK_BASE(A[(M-a_index)/4],
3093 ((a_index-1)%4));
3094 matrix_row = matrix[a_base_pair];
3095 }
3096 else {
3097 a_base_pair = NCBI2NA_UNPACK_BASE(A[1+((a_index-1)/4)],
3098 (3-((a_index-1)%4)));
3099 matrix_row = matrix[a_base_pair];
3100 }
3101
3102 if(reverse_sequence)
3103 b_ptr = &B[N - first_b_index];
3104 else
3105 b_ptr = &B[first_b_index];
3106
3107 /* initialize running-score variables */
3108 score = MININT;
3109 score_gap_row = MININT;
3110 last_b_index = first_b_index;
3111
3112 for (b_index = first_b_index; b_index < b_size; b_index++) {
3113
3114 b_ptr += b_increment;
3115 score_gap_col = score_array[b_index].best_gap;
3116 next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
3117
3118 if (score < score_gap_col)
3119 score = score_gap_col;
3120
3121 if (score < score_gap_row)
3122 score = score_gap_row;
3123
3124 if (best_score - score > x_dropoff) {
3125
3126 /* the current best score failed the X-dropoff
3127 criterion. Note that this does not stop the
3128 inner loop, only forces future iterations to
3129 skip this column of B.
3130
3131 Also, if the very first letter of B that was
3132 tested failed the X dropoff criterion, make
3133 sure future inner loops start one letter to
3134 the right */
3135
3136 if (b_index == first_b_index)
3137 first_b_index++;
3138 else
3139 score_array[b_index].best = MININT;
3140 }
3141 else {
3142 last_b_index = b_index;
3143 if (score > best_score) {
3144 best_score = score;
3145 *a_offset = a_index;
3146 *b_offset = b_index;
3147 }
3148
3149 /* If starting a gap at this position will improve
3150 the best row or column score, update them to
3151 reflect that. */
3152
3153 score_gap_row -= gap_extend;
3154 score_gap_col -= gap_extend;
3155 score_array[b_index].best_gap = MAX(score - gap_open_extend,
3156 score_gap_col);
3157 score_gap_row = MAX(score - gap_open_extend, score_gap_row);
3158
3159 score_array[b_index].best = score;
3160 }
3161
3162 score = next_score;
3163 }
3164
3165 /* Finish aligning if the best scores for all positions
3166 of B will fail the X-dropoff test, i.e. the inner loop
3167 bounds have converged to each other */
3168
3169 if (first_b_index == b_size)
3170 break;
3171
3172 if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
3173
3174 gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
3175 2 * gap_align->dp_mem_alloc);
3176 score_array = (BlastGapDP *)realloc(score_array,
3177 gap_align->dp_mem_alloc *
3178 sizeof(BlastGapDP));
3179 gap_align->dp_mem = score_array;
3180 }
3181
3182 if (last_b_index < b_size - 1) {
3183 /* This row failed the X-dropoff test earlier than
3184 the last row did; just shorten the loop bounds
3185 before doing the next row */
3186
3187 b_size = last_b_index + 1;
3188 }
3189 else {
3190 /* The inner loop finished without failing the X-dropoff
3191 test; initialize extra bookkeeping structures until
3192 the X dropoff test fails or we run out of letters in B.
3193 The next inner loop will have larger bounds */
3194
3195 while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
3196 score_array[b_size].best = score_gap_row;
3197 score_array[b_size].best_gap = score_gap_row - gap_open_extend;
3198 score_gap_row -= gap_extend;
3199 b_size++;
3200 }
3201 }
3202
3203 if (b_size <= N) {
3204 score_array[b_size].best = MININT;
3205 score_array[b_size].best_gap = MININT;
3206 b_size++;
3207 }
3208 }
3209
3210 return best_score;
3211 }
3212
3213 Boolean
BlastGetOffsetsForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,BlastHSP * hsp,Int4 * q_retval,Int4 * s_retval)3214 BlastGetOffsetsForGappedAlignment (const Uint1* query, const Uint1* subject,
3215 const BlastScoreBlk* sbp, BlastHSP* hsp, Int4* q_retval, Int4* s_retval)
3216 {
3217 Int4 index1, max_offset, score, max_score, hsp_end;
3218 const Uint1* query_var,* subject_var;
3219 Boolean positionBased = (sbp->psi_matrix != NULL);
3220 Int4 q_length = hsp->query.end - hsp->query.offset;
3221 Int4 s_length = hsp->subject.end - hsp->subject.offset;
3222 int q_start = hsp->query.offset;
3223 int s_start = hsp->subject.offset;
3224
3225 if (q_length <= HSP_MAX_WINDOW) {
3226 *q_retval = q_start + q_length/2;
3227 *s_retval = s_start + q_length/2;
3228 return TRUE;
3229 }
3230
3231 hsp_end = q_start + HSP_MAX_WINDOW;
3232 query_var = query + q_start;
3233 subject_var = subject + s_start;
3234 score=0;
3235 for (index1=q_start; index1<hsp_end; index1++) {
3236 if (!(positionBased))
3237 score += sbp->matrix->data[*query_var][*subject_var];
3238 else
3239 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3240 query_var++; subject_var++;
3241 }
3242 max_score = score;
3243 max_offset = hsp_end - 1;
3244 hsp_end = q_start + MIN(q_length, s_length);
3245 for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3246 if (!(positionBased)) {
3247 score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3248 score += sbp->matrix->data[*query_var][*subject_var];
3249 } else {
3250 score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3251 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3252 }
3253 if (score > max_score) {
3254 max_score = score;
3255 max_offset = index1;
3256 }
3257 query_var++; subject_var++;
3258 }
3259
3260 if (max_score > 0)
3261 {
3262 *q_retval = max_offset;
3263 *s_retval = (max_offset - q_start) + s_start;
3264 return TRUE;
3265 }
3266 else /* Test the window around the ends of the HSP. */
3267 {
3268 score=0;
3269 query_var = query + q_start + q_length - HSP_MAX_WINDOW;
3270 subject_var = subject + s_start + s_length - HSP_MAX_WINDOW;
3271 for (index1=hsp->query.end-HSP_MAX_WINDOW; index1<hsp->query.end; index1++) {
3272 if (!(positionBased))
3273 score += sbp->matrix->data[*query_var][*subject_var];
3274 else
3275 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3276 query_var++; subject_var++;
3277 }
3278 if (score > 0)
3279 {
3280 *q_retval = hsp->query.end - HSP_MAX_WINDOW/2;
3281 *s_retval = hsp->subject.end - HSP_MAX_WINDOW/2;
3282 return TRUE;
3283 }
3284 }
3285 return FALSE;
3286 }
3287
3288 void
BlastGetStartForGappedAlignmentNucl(const Uint1 * query,const Uint1 * subject,BlastHSP * hsp)3289 BlastGetStartForGappedAlignmentNucl (const Uint1* query, const Uint1* subject,
3290 BlastHSP* hsp)
3291 {
3292 /* We will stop when the identity count reaches to this number */
3293 int hspMaxIdentRun = 10;
3294 const Uint1 *q, *s;
3295 Int4 index, max_offset, score, max_score, q_start, s_start, q_len;
3296 Boolean match, prev_match;
3297 Int4 offset = MIN(hsp->subject.gapped_start - hsp->subject.offset,
3298 hsp->query.gapped_start - hsp->query.offset);
3299 /* first check if the old value is ok */
3300 q_start = hsp->query.gapped_start;
3301 s_start = hsp->subject.gapped_start;
3302 score = -1;
3303 q = query + q_start;
3304 s = subject + s_start;
3305 q_len = hsp->query.end;
3306 while ((q-query < q_len) && (*q++ == *s++)) {
3307 score++;
3308 if (score > hspMaxIdentRun) return;
3309 }
3310 q = query + q_start;
3311 s = subject + s_start;
3312 while ((q-query >= 0) && (*q-- == *s--)) {
3313 score++;
3314 if (score > hspMaxIdentRun) return;
3315 }
3316 hspMaxIdentRun *= 1.5; /* Demand larger value if we move */
3317 /* if the old value is not ok, try to find a better point */
3318 q_start = hsp->query.gapped_start - offset;
3319 s_start = hsp->subject.gapped_start - offset;
3320 q_len = MIN(hsp->subject.end - s_start, hsp->query.end - q_start);
3321 q = query + q_start;
3322 s = subject + s_start;
3323 max_score = 0;
3324 max_offset = q_start;
3325 score = 0;
3326 match = FALSE;
3327 prev_match = FALSE;
3328 for (index = q_start; index < q_start + q_len; index++) {
3329 match = (*q++ == *s++);
3330 if (match != prev_match) {
3331 prev_match = match;
3332 if (match) {
3333 score = 1;
3334 } else if (score > max_score) {
3335 max_score = score;
3336 max_offset = index - score/2;
3337 }
3338 } else if (match) {
3339 ++score;
3340 if (score > hspMaxIdentRun) {
3341 max_offset = index - hspMaxIdentRun/2;
3342 hsp->query.gapped_start = max_offset;
3343 hsp->subject.gapped_start = max_offset + s_start - q_start;
3344 return;
3345 }
3346 }
3347 }
3348 if (match && score > max_score) {
3349 max_score = score;
3350 max_offset = index - score/2;
3351 }
3352 if (max_score > 0) {
3353 hsp->query.gapped_start = max_offset;
3354 hsp->subject.gapped_start = max_offset + s_start - q_start;
3355 }
3356 }
3357
3358 Int4
BlastGetStartForGappedAlignment(const Uint1 * query,const Uint1 * subject,const BlastScoreBlk * sbp,Uint4 q_start,Uint4 q_length,Uint4 s_start,Uint4 s_length)3359 BlastGetStartForGappedAlignment (const Uint1* query, const Uint1* subject,
3360 const BlastScoreBlk* sbp, Uint4 q_start, Uint4 q_length,
3361 Uint4 s_start, Uint4 s_length)
3362 {
3363 Int4 index1, max_offset, score, max_score, hsp_end;
3364 const Uint1* query_var,* subject_var;
3365 Boolean positionBased = (sbp->psi_matrix != NULL);
3366
3367 if (q_length <= HSP_MAX_WINDOW) {
3368 max_offset = q_start + q_length/2;
3369 return max_offset;
3370 }
3371
3372 hsp_end = q_start + HSP_MAX_WINDOW;
3373 query_var = query + q_start;
3374 subject_var = subject + s_start;
3375 score=0;
3376 for (index1=q_start; index1<hsp_end; index1++) {
3377 if (!(positionBased))
3378 score += sbp->matrix->data[*query_var][*subject_var];
3379 else
3380 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3381 query_var++; subject_var++;
3382 }
3383 max_score = score;
3384 max_offset = hsp_end - 1;
3385 hsp_end = q_start + MIN(q_length, s_length);
3386 for (index1=q_start + HSP_MAX_WINDOW; index1<hsp_end; index1++) {
3387 if (!(positionBased)) {
3388 score -= sbp->matrix->data[*(query_var-HSP_MAX_WINDOW)][*(subject_var-HSP_MAX_WINDOW)];
3389 score += sbp->matrix->data[*query_var][*subject_var];
3390 } else {
3391 score -= sbp->psi_matrix->pssm->data[index1-HSP_MAX_WINDOW][*(subject_var-HSP_MAX_WINDOW)];
3392 score += sbp->psi_matrix->pssm->data[index1][*subject_var];
3393 }
3394 if (score > max_score) {
3395 max_score = score;
3396 max_offset = index1;
3397 }
3398 query_var++; subject_var++;
3399 }
3400 if (max_score > 0)
3401 max_offset -= HSP_MAX_WINDOW/2;
3402 else
3403 max_offset = q_start;
3404
3405 return max_offset;
3406 }
3407
BLAST_GetGappedScore(EBlastProgramType program_number,BLAST_SequenceBlk * query,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,const BlastExtensionParameters * ext_params,const BlastHitSavingParameters * hit_params,BlastInitHitList * init_hitlist,BlastHSPList ** hsp_list_ptr,BlastGappedStats * gapped_stats,Boolean * fence_hit)3408 Int2 BLAST_GetGappedScore (EBlastProgramType program_number,
3409 BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
3410 BLAST_SequenceBlk* subject,
3411 BlastGapAlignStruct* gap_align,
3412 const BlastScoringParameters* score_params,
3413 const BlastExtensionParameters* ext_params,
3414 const BlastHitSavingParameters* hit_params,
3415 BlastInitHitList* init_hitlist,
3416 BlastHSPList** hsp_list_ptr, BlastGappedStats* gapped_stats,
3417 Boolean * fence_hit)
3418 {
3419 Int4 index;
3420 BlastInitHSP* init_hsp = NULL;
3421 BlastInitHSP* init_hsp_array;
3422 Int4 q_start, s_start, q_end, s_end;
3423 Boolean is_prot;
3424 Boolean* restricted_align_array = NULL; /* if allocated, restricted
3425 alignment option per query */
3426 Boolean is_greedy;
3427 Boolean is_rpsblast = Blast_ProgramIsRpsBlast(program_number);
3428 Int4 max_offset;
3429 Int4 rps_cutoff_index = 0;
3430 Int2 status = 0;
3431 BlastHSPList* hsp_list = NULL;
3432 const BlastHitSavingOptions* hit_options = hit_params->options;
3433 BLAST_SequenceBlk query_tmp;
3434 Int4 context;
3435 BlastIntervalTree *tree;
3436 Int4 score;
3437 Int4* found_high_score = NULL;
3438 Int4 **rpsblast_pssms = NULL; /* Pointer to concatenated PSSMs in
3439 RPS-BLAST database */
3440 const int kHspNumMax = BlastHspNumMax(TRUE, hit_options);
3441 const double kRestrictedMult = 0.68;/* fraction of the ordinary cutoff score
3442 used for approximate gapped alignment */
3443 Int4 redo_index = -1; /* these are used for recomputing alignmnets if the */
3444 Int4 redo_query = -1; /* approximate alignment score is inconclusive */
3445
3446 if (!query || !subject || !gap_align || !score_params || !ext_params ||
3447 !hit_params || !init_hitlist || !hsp_list_ptr)
3448 return 1;
3449
3450 if (init_hitlist->total == 0)
3451 return 0;
3452
3453 is_prot = (program_number != eBlastTypeBlastn &&
3454 program_number != eBlastTypePhiBlastn &&
3455 program_number != eBlastTypeMapping);
3456 is_greedy = (ext_params->options->ePrelimGapExt == eGreedyScoreOnly);
3457
3458 /* turn on approximate gapped alignment if 1) the search
3459 specifies it, and 2) the first, highest-scoring ungapped
3460 alignment in init_hitlist scores below a reduced cutoff.
3461 The second condition is used to avoid computing approximate
3462 alignments that experiment shows are likely to be thrown
3463 away and recomputed optimally */
3464
3465 if (hit_params->restricted_align && !score_params->options->is_ooframe) {
3466 int i;
3467 Boolean* found = calloc(query_info->last_context + 1, sizeof(Boolean));
3468
3469 restricted_align_array =
3470 (Boolean*)calloc(Blast_GetQueryIndexFromContext(
3471 query_info->last_context,
3472 program_number) + 1,
3473 sizeof(Boolean));
3474
3475 /* find the first ungapped alignment for each query and determine
3476 whetherto use approximate gapped alignment for the query */
3477 for (i = 0;i < init_hitlist->total;i++) {
3478 int contxt; /* context */
3479 int query_index = -1;
3480 Int4 q_off;
3481
3482 init_hsp = &init_hitlist->init_hsp_array[i];
3483 /* find query index */
3484 q_off = init_hsp->offsets.qs_offsets.q_off;
3485 for (contxt = query_info->first_context;
3486 contxt <= query_info->last_context; contxt++) {
3487
3488 if (q_off >= query_info->contexts[contxt].query_offset &&
3489 q_off < query_info->contexts[contxt].query_offset +
3490 query_info->contexts[contxt].query_length) {
3491
3492 query_index = Blast_GetQueryIndexFromContext(contxt,
3493 program_number);
3494 break;
3495 }
3496 }
3497 ASSERT(query_index >= 0);
3498
3499 /* if this is not the first initial hit for the query, disregard it */
3500 if (found[query_index]) {
3501 continue;
3502 }
3503
3504 found[query_index] = TRUE;
3505
3506 /* use approximate gapped alignment if the highest scoring ungapped
3507 alignment scores below the reduced cutoff */
3508 if (init_hsp->ungapped_data && init_hsp->ungapped_data->score <
3509 (Int4)(kRestrictedMult *
3510 hit_params->cutoffs[contxt].cutoff_score)) {
3511
3512 restricted_align_array[query_index] = TRUE;
3513 }
3514 else {
3515 restricted_align_array[query_index] = FALSE;
3516 }
3517 }
3518 if (found) {
3519 free(found);
3520 }
3521 }
3522
3523 if (is_rpsblast) {
3524 rpsblast_pssms = gap_align->sbp->psi_matrix->pssm->data;
3525 if (program_number == eBlastTypeRpsBlast)
3526 rps_cutoff_index = subject->oid;
3527 else
3528 rps_cutoff_index = subject->oid * NUM_FRAMES +
3529 BLAST_FrameToContext(subject->frame,
3530 program_number);
3531 }
3532
3533 ASSERT(Blast_InitHitListIsSortedByScore(init_hitlist));
3534
3535 if (*hsp_list_ptr == NULL)
3536 *hsp_list_ptr = hsp_list = Blast_HSPListNew(kHspNumMax);
3537 else
3538 hsp_list = *hsp_list_ptr;
3539
3540 /* Initialize the interval tree with the maximum possible
3541 query and subject offsets. For query sequences this is always
3542 query->length, and for subject sequences it is subject->length
3543 except for out-of-frame tblastn. In that case, subject->length
3544 is the length of one strand of the subject sequence, but the
3545 subject offsets of each ungapped alignment are wrt all six subject
3546 frames concatenated together. Finally, add 1 to the maximum,
3547 to account for HSPs that include the end of a sequence */
3548
3549 if (Blast_SubjectIsTranslated(program_number) &&
3550 score_params->options->is_ooframe) {
3551 /* OOF search only supported for tblastn and blastx. (blastx
3552 does not have a translated subject, so can't get here.) */
3553 ASSERT(program_number == eBlastTypeTblastn);
3554
3555 tree = Blast_IntervalTreeInit(0, query->length+1,
3556 0, 2*(subject->length + CODON_LENGTH)+1);
3557 }
3558 else {
3559 tree = Blast_IntervalTreeInit(0, query->length+1,
3560 0, subject->length+1);
3561 }
3562 if (!tree)
3563 return BLASTERR_MEMORY;
3564
3565
3566 init_hsp_array = init_hitlist->init_hsp_array;
3567 found_high_score = (Int4*) calloc(query_info->num_queries, sizeof(Int4));
3568 if (hit_params->low_score)
3569 {
3570 for (index=0; index<init_hitlist->total; index++)
3571 {
3572 int query_index =
3573 Blast_GetQueryIndexFromQueryOffset(init_hsp_array[index].offsets.qs_offsets.q_off, program_number, query_info);
3574 if (init_hsp_array[index].ungapped_data->score > hit_params->low_score[query_index])
3575 found_high_score[query_index] = 1;
3576 }
3577 }
3578
3579 for (index=0; index<init_hitlist->total; index++)
3580 {
3581 BlastHSP tmp_hsp;
3582 BlastInitHSP tmp_init_hsp;
3583 BlastUngappedData tmp_ungapped_data;
3584 int query_index=0;
3585 Boolean restricted_alignment = FALSE;
3586
3587 /* make a local copy of the initial HSP */
3588
3589 tmp_init_hsp = init_hsp_array[index];
3590 if (tmp_init_hsp.ungapped_data) {
3591 tmp_ungapped_data = *(init_hsp_array[index].ungapped_data);
3592 tmp_init_hsp.ungapped_data = &tmp_ungapped_data;
3593 }
3594 init_hsp = &tmp_init_hsp;
3595
3596 s_AdjustHspOffsetsAndGetQueryData(query, query_info, init_hsp,
3597 &query_tmp, &context);
3598
3599 query_index = Blast_GetQueryIndexFromContext(context, program_number);
3600
3601 /* If redo_index > -1 and redo_query > -1, the main loop is recomputing
3602 gappaed alignments for a single query, becuase the approximate
3603 alignment score was inconclusive. This recomputing was triggered when
3604 index was equal to redo_index. Until index reaches redo_index again,
3605 skip all concatenated queries with index different from redo_query */
3606
3607 if (index < redo_index && query_index != redo_query) {
3608 continue;
3609 }
3610
3611 if (restricted_align_array && restricted_align_array[query_index]) {
3612 restricted_alignment = TRUE;
3613 }
3614
3615 if (hit_params->low_score)
3616 {
3617 if (!found_high_score[query_index])
3618 continue;
3619 }
3620
3621 if (rpsblast_pssms)
3622 gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms +
3623 query_info->contexts[context].query_offset;
3624
3625 if (!init_hsp->ungapped_data) {
3626 q_start = q_end = init_hsp->offsets.qs_offsets.q_off;
3627 s_start = s_end = init_hsp->offsets.qs_offsets.s_off;
3628 score = INT4_MIN;
3629 } else {
3630 q_start = init_hsp->ungapped_data->q_start;
3631 q_end = q_start + init_hsp->ungapped_data->length;
3632 s_start = init_hsp->ungapped_data->s_start;
3633 s_end = s_start + init_hsp->ungapped_data->length;
3634 score = init_hsp->ungapped_data->score;
3635 }
3636
3637 tmp_hsp.score = score;
3638 tmp_hsp.context = context;
3639 tmp_hsp.query.offset = q_start;
3640 tmp_hsp.query.end = q_end;
3641 tmp_hsp.query.frame = query_info->contexts[context].frame;
3642 tmp_hsp.subject.offset = s_start;
3643 tmp_hsp.subject.end = s_end;
3644 tmp_hsp.subject.frame = subject->frame;
3645
3646 /* use priate interval tree when recomputing alignments */
3647 if (!BlastIntervalTreeContainsHSP(tree, &tmp_hsp, query_info,
3648 hit_options->min_diag_separation))
3649 {
3650 BlastHSP* new_hsp;
3651 Int4 cutoff, restricted_cutoff = 0;
3652
3653 if (is_rpsblast)
3654 cutoff = hit_params->cutoffs[rps_cutoff_index].cutoff_score;
3655 else
3656 cutoff = hit_params->cutoffs[context].cutoff_score;
3657
3658 if (restricted_alignment)
3659 restricted_cutoff = (Int4)(kRestrictedMult * cutoff);
3660
3661 if (gapped_stats)
3662 ++gapped_stats->extensions;
3663
3664 if(is_prot && !score_params->options->is_ooframe) {
3665 max_offset =
3666 BlastGetStartForGappedAlignment(query_tmp.sequence,
3667 subject->sequence, gap_align->sbp,
3668 init_hsp->ungapped_data->q_start,
3669 init_hsp->ungapped_data->length,
3670 init_hsp->ungapped_data->s_start,
3671 init_hsp->ungapped_data->length);
3672 init_hsp->offsets.qs_offsets.s_off +=
3673 max_offset - init_hsp->offsets.qs_offsets.q_off;
3674 init_hsp->offsets.qs_offsets.q_off = max_offset;
3675 }
3676
3677 if (is_prot) {
3678 status = s_BlastProtGappedAlignment(program_number, &query_tmp,
3679 subject, gap_align,
3680 score_params, init_hsp,
3681 restricted_alignment,
3682 fence_hit);
3683
3684 if (restricted_alignment &&
3685 gap_align->score < cutoff &&
3686 gap_align->score >= restricted_cutoff) {
3687
3688 /* the alignment score is inconclusive; it may really be
3689 low-scoring, or the approximate gapped alignment
3690 could be highly suboptimal. Recomputing the alignment
3691 would resolve the ambiguity, but when the final list of
3692 HSPs contains a mix of approximate and exact gapped
3693 alignments it's possible for a highly suboptimal
3694 alignment to make it to the traceback phase, where
3695 its start point is reused. We sidestep this problem
3696 by throwing away all the previously computed gapped
3697 alignments and recomputing them exactly. */
3698
3699 Int4 index2;
3700 BlastHSPList* new_hsp_list = Blast_HSPListNew(kHspNumMax);
3701 ASSERT(restricted_align_array);
3702 restricted_align_array[query_index] = FALSE;
3703
3704 Blast_IntervalTreeReset(tree);
3705
3706 /* remove all HSPs computed for the current query */
3707 for (index2 = 0; index2 < hsp_list->hspcnt; index2++) {
3708 Int4 context2 = hsp_list->hsp_array[index2]->context;
3709 Int4 q_index2 = Blast_GetQueryIndexFromContext(context2,
3710 program_number);
3711
3712 if (q_index2 != query_index) {
3713
3714 Blast_HSPListSaveHSP(new_hsp_list,
3715 hsp_list->hsp_array[index2]);
3716
3717 status = BlastIntervalTreeAddHSP(
3718 hsp_list->hsp_array[index2],
3719 tree,
3720 query_info,
3721 eQueryAndSubject);
3722
3723 hsp_list->hsp_array[index2] = NULL;
3724
3725 }
3726 else {
3727 hsp_list->hsp_array[index2] =
3728 Blast_HSPFree(hsp_list->hsp_array[index2]);
3729 }
3730 }
3731 Blast_HSPListFree(hsp_list);
3732 hsp_list = new_hsp_list;
3733
3734 /* restart the loop over initial hits to recompute
3735 alignments for the current query */
3736 redo_index = index;
3737 redo_query = query_index;
3738 index = -1;
3739 continue;
3740 }
3741 } else if (is_greedy) {
3742 if (init_hsp->ungapped_data) {
3743 init_hsp->offsets.qs_offsets.q_off =
3744 init_hsp->ungapped_data->q_start + init_hsp->ungapped_data->length/2;
3745 init_hsp->offsets.qs_offsets.s_off =
3746 init_hsp->ungapped_data->s_start + init_hsp->ungapped_data->length/2;
3747 }
3748 status = BLAST_GreedyGappedAlignment(
3749 query_tmp.sequence, subject->sequence,
3750 query_tmp.length, subject->length,
3751 gap_align, score_params,
3752 init_hsp->offsets.qs_offsets.q_off,
3753 init_hsp->offsets.qs_offsets.s_off,
3754 (Boolean) TRUE, FALSE, fence_hit);
3755 /* override the alignment start point with the estimate
3756 by the aligner of the best start point */
3757 init_hsp->offsets.qs_offsets.q_off =
3758 gap_align->greedy_query_seed_start;
3759 init_hsp->offsets.qs_offsets.s_off =
3760 gap_align->greedy_subject_seed_start;
3761 } else {
3762 /* Assuming the ungapped alignment is long enough to
3763 contain an 8-letter seed of exact matches, start
3764 the gapped alignment inside the first byte of the
3765 seed that contains all matches. Note that even if
3766 the ungapped alignment is not long enough,
3767 s_BlastDynProgNtGappedAlignment will still round
3768 the start point up to the first 4-base boundary
3769 on the subject sequence */
3770 if (s_end >= (Int4)init_hsp->offsets.qs_offsets.s_off + 8) {
3771 init_hsp->offsets.qs_offsets.s_off += 3;
3772 init_hsp->offsets.qs_offsets.q_off += 3;
3773 }
3774 status = s_BlastDynProgNtGappedAlignment(&query_tmp, subject,
3775 gap_align, score_params, init_hsp);
3776 }
3777
3778 if (status) {
3779 if (rpsblast_pssms) {
3780 gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3781 }
3782 sfree(found_high_score);
3783 tree = Blast_IntervalTreeFree(tree);
3784 return status;
3785 }
3786
3787 if (gap_align->score >= cutoff) {
3788 Int2 query_frame = 0;
3789 /* For mixed-frame search, the query frame is determined
3790 from the offset, not only from context. */
3791 if (score_params->options->is_ooframe &&
3792 program_number == eBlastTypeBlastx) {
3793 query_frame = gap_align->query_start % CODON_LENGTH + 1;
3794 if ((context % NUM_FRAMES) >= CODON_LENGTH)
3795 query_frame = -query_frame;
3796 } else {
3797 query_frame = query_info->contexts[context].frame;
3798 }
3799
3800 status = Blast_HSPInit(gap_align->query_start,
3801 gap_align->query_stop, gap_align->subject_start,
3802 gap_align->subject_stop,
3803 init_hsp->offsets.qs_offsets.q_off,
3804 init_hsp->offsets.qs_offsets.s_off, context,
3805 query_frame, subject->frame, gap_align->score,
3806 &(gap_align->edit_script), &new_hsp);
3807 if (status)
3808 {
3809 sfree(found_high_score);
3810 tree = Blast_IntervalTreeFree(tree);
3811 return status;
3812 }
3813 status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
3814 if (status)
3815 break;
3816 status = BlastIntervalTreeAddHSP(new_hsp, tree, query_info,
3817 eQueryAndSubject);
3818 if (status)
3819 break;
3820 }
3821 else {
3822 /* a greedy alignment may have traceback associated with it;
3823 free that traceback if the alignment will not be used */
3824 if (is_greedy) {
3825 gap_align->edit_script = GapEditScriptDelete(
3826 gap_align->edit_script);
3827 }
3828 }
3829 }
3830 }
3831
3832 sfree(found_high_score);
3833 if (restricted_align_array) {
3834 sfree(restricted_align_array);
3835 }
3836 tree = Blast_IntervalTreeFree(tree);
3837 if (rpsblast_pssms) {
3838 gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
3839 }
3840
3841 *hsp_list_ptr = hsp_list;
3842 return status;
3843 }
3844
3845 /** Out-of-frame gapped alignment wrapper function.
3846 * @param query Query sequence [in]
3847 * @param subject Subject sequence [in]
3848 * @param q_off Offset in query [in]
3849 * @param s_off Offset in subject [in]
3850 * @param private_q_start Extent of alignment in query [out]
3851 * @param private_s_start Extent of alignment in subject [out]
3852 * @param score_only Return score only, without traceback [in]
3853 * @param edit_block Structure to hold generated traceback [out]
3854 * @param gap_align Gapped alignment information and preallocated
3855 * memory [in] [out]
3856 * @param score_params Scoring parameters [in]
3857 * @param psi_offset Starting position in PSI-BLAST matrix [in]
3858 * @param reversed Direction of the extension [in]
3859 * @param switch_seq Sequences need to be switched for blastx,
3860 * but not for tblastn [in]
3861 */
3862 static Int4
s_OutOfFrameSemiGappedAlignWrap(const Uint1 * query,const Uint1 * subject,Int4 q_off,Int4 s_off,Int4 * private_q_start,Int4 * private_s_start,Boolean score_only,GapPrelimEditBlock * edit_block,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 psi_offset,Boolean reversed,Boolean switch_seq)3863 s_OutOfFrameSemiGappedAlignWrap(const Uint1* query, const Uint1* subject, Int4 q_off,
3864 Int4 s_off, Int4* private_q_start, Int4* private_s_start,
3865 Boolean score_only, GapPrelimEditBlock *edit_block,
3866 BlastGapAlignStruct* gap_align,
3867 const BlastScoringParameters* score_params, Int4 psi_offset,
3868 Boolean reversed, Boolean switch_seq)
3869 {
3870 if (switch_seq) {
3871 return s_OutOfFrameGappedAlign(subject, query, s_off, q_off,
3872 private_s_start, private_q_start, score_only, edit_block,
3873 gap_align, score_params, psi_offset, reversed);
3874 } else {
3875 return s_OutOfFrameGappedAlign(query, subject, q_off, s_off,
3876 private_q_start, private_s_start, score_only, edit_block,
3877 gap_align, score_params, psi_offset, reversed);
3878 }
3879 }
3880
3881 /** Maximal subject length after which the offsets are adjusted to a
3882 * subsequence.
3883 */
3884 #define MAX_SUBJECT_OFFSET 90000
3885 /** Approximate upper bound on a number of gaps in an HSP, needed to determine
3886 * the length of the subject subsequence to be retrieved for alignment with
3887 * traceback.
3888 */
3889 #define MAX_TOTAL_GAPS 3000
3890
3891 void
AdjustSubjectRange(Int4 * subject_offset_ptr,Int4 * subject_length_ptr,Int4 query_offset,Int4 query_length,Int4 * start_shift)3892 AdjustSubjectRange(Int4* subject_offset_ptr, Int4* subject_length_ptr,
3893 Int4 query_offset, Int4 query_length, Int4* start_shift)
3894 {
3895 Int4 s_offset;
3896 Int4 subject_length = *subject_length_ptr;
3897 Int4 max_extension_left, max_extension_right;
3898
3899 /* If subject sequence is not too long, leave everything as is */
3900 if (subject_length < MAX_SUBJECT_OFFSET) {
3901 *start_shift = 0;
3902 return;
3903 }
3904
3905 s_offset = *subject_offset_ptr;
3906 /* Maximal extension length is the remaining length in the query, plus
3907 an estimate of a maximal total number of gaps. */
3908 max_extension_left = query_offset + MAX_TOTAL_GAPS;
3909 max_extension_right = query_length - query_offset + MAX_TOTAL_GAPS;
3910
3911 if (s_offset <= max_extension_left) {
3912 *start_shift = 0;
3913 } else {
3914 *start_shift = s_offset - max_extension_left;
3915 *subject_offset_ptr = max_extension_left;
3916 }
3917
3918 *subject_length_ptr =
3919 MIN(subject_length, s_offset + max_extension_right) - *start_shift;
3920 }
3921
3922 /** Performs gapped extension for protein sequences, given two
3923 * sequence blocks, scoring and extension options, and an initial HSP
3924 * with information from the previously performed ungapped extension
3925 * @param program BLAST program [in]
3926 * @param query_blk The query sequence block [in]
3927 * @param subject_blk The subject sequence block [in]
3928 * @param gap_align The auxiliary structure for gapped alignment [in]
3929 * @param score_params Parameters related to scoring [in]
3930 * @param init_hsp The initial HSP information [in]
3931 * @param restricted_alignment If true and search is not out-of-frame,
3932 * use a faster approximate gapped alignment algorithm [in]
3933 * @param fence_hit If not NULL, pointer to a boolean that is set to
3934 * TRUE if gapped extension reaches a neighborhood
3935 * of the subject sequence that is not initialized [in][out]
3936 */
3937 static Int2
s_BlastProtGappedAlignment(EBlastProgramType program,BLAST_SequenceBlk * query_blk,BLAST_SequenceBlk * subject_blk,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,BlastInitHSP * init_hsp,Boolean restricted_alignment,Boolean * fence_hit)3938 s_BlastProtGappedAlignment(EBlastProgramType program,
3939 BLAST_SequenceBlk* query_blk, BLAST_SequenceBlk* subject_blk,
3940 BlastGapAlignStruct* gap_align,
3941 const BlastScoringParameters* score_params, BlastInitHSP* init_hsp,
3942 Boolean restricted_alignment,
3943 Boolean * fence_hit)
3944 {
3945 Boolean found_start, found_end;
3946 Int4 q_length=0, s_length=0, score_right, score_left;
3947 Int4 private_q_start, private_s_start;
3948 Uint1* query=NULL,* subject=NULL;
3949 Boolean switch_seq = FALSE;
3950 Int4 query_length = query_blk->length;
3951 Int4 subject_length = subject_blk->length;
3952 Int4 subject_shift = 0;
3953 BlastScoringOptions *score_options = score_params->options;
3954
3955 if (gap_align == NULL)
3956 return -1;
3957
3958 if (score_options->is_ooframe) {
3959 ASSERT(program == eBlastTypeTblastn || program == eBlastTypeBlastx);
3960
3961 q_length = init_hsp->offsets.qs_offsets.q_off;
3962 /* For negative subject frames, make subject offset relative to the part
3963 of the mixed-frame sequence corresponding to the reverse strand. */
3964 if (program == eBlastTypeTblastn && subject_blk->frame < 0)
3965 init_hsp->offsets.qs_offsets.s_off -= subject_length + 1;
3966
3967 s_length = init_hsp->offsets.qs_offsets.s_off;
3968
3969 if (program == eBlastTypeBlastx) {
3970 subject = subject_blk->sequence + s_length;
3971 query = query_blk->oof_sequence + CODON_LENGTH + q_length;
3972 query_length -= CODON_LENGTH - 1;
3973 switch_seq = TRUE;
3974 } else if (program == eBlastTypeTblastn) {
3975 subject = subject_blk->oof_sequence + CODON_LENGTH + s_length;
3976 query = query_blk->sequence + q_length;
3977 subject_length -= CODON_LENGTH - 1;
3978 }
3979 } else {
3980 q_length = init_hsp->offsets.qs_offsets.q_off + 1;
3981 s_length = init_hsp->offsets.qs_offsets.s_off + 1;
3982 query = query_blk->sequence;
3983 subject = subject_blk->sequence;
3984 }
3985
3986 AdjustSubjectRange(&s_length, &subject_length, q_length, query_length,
3987 &subject_shift);
3988
3989 found_start = FALSE;
3990 found_end = FALSE;
3991
3992 /* Looking for "left" score */
3993 score_left = 0;
3994 if (q_length != 0 && s_length != 0) {
3995 found_start = TRUE;
3996 if(score_options->is_ooframe) {
3997 score_left =
3998 s_OutOfFrameSemiGappedAlignWrap(query, subject, q_length, s_length,
3999 &private_q_start, &private_s_start, TRUE, NULL,
4000 gap_align, score_params, q_length, TRUE, switch_seq);
4001 } else {
4002 if (restricted_alignment) {
4003 score_left = s_RestrictedGappedAlign(query, subject+subject_shift,
4004 q_length, s_length,
4005 &private_q_start, &private_s_start,
4006 gap_align, score_params,
4007 init_hsp->offsets.qs_offsets.q_off, TRUE);
4008 }
4009 else {
4010 score_left = Blast_SemiGappedAlign(query, subject+subject_shift,
4011 q_length, s_length,
4012 &private_q_start, &private_s_start, TRUE,
4013 NULL, gap_align, score_params,
4014 init_hsp->offsets.qs_offsets.q_off,
4015 FALSE, TRUE,
4016 fence_hit);
4017 }
4018 }
4019 gap_align->query_start = q_length - private_q_start;
4020 gap_align->subject_start = s_length - private_s_start + subject_shift;
4021 }
4022
4023 score_right = 0;
4024 if (q_length < query_length && s_length < subject_length) {
4025 found_end = TRUE;
4026 if(score_options->is_ooframe) {
4027 score_right =
4028 s_OutOfFrameSemiGappedAlignWrap(query-1, subject-1,
4029 query_length-q_length+1, subject_length-s_length+1,
4030 &(gap_align->query_stop), &(gap_align->subject_stop),
4031 TRUE, NULL, gap_align, score_params, q_length, FALSE,
4032 switch_seq);
4033 gap_align->query_stop += q_length;
4034 gap_align->subject_stop += s_length + subject_shift;
4035 } else {
4036 if (restricted_alignment) {
4037 score_right = s_RestrictedGappedAlign(
4038 query+init_hsp->offsets.qs_offsets.q_off,
4039 subject+init_hsp->offsets.qs_offsets.s_off,
4040 query_length-q_length,
4041 subject_length-s_length,
4042 &(gap_align->query_stop),
4043 &(gap_align->subject_stop),
4044 gap_align, score_params,
4045 init_hsp->offsets.qs_offsets.q_off, FALSE);
4046 }
4047 else {
4048 score_right = Blast_SemiGappedAlign(
4049 query+init_hsp->offsets.qs_offsets.q_off,
4050 subject+init_hsp->offsets.qs_offsets.s_off,
4051 query_length-q_length,
4052 subject_length-s_length,
4053 &(gap_align->query_stop),
4054 &(gap_align->subject_stop),
4055 TRUE, NULL, gap_align, score_params,
4056 init_hsp->offsets.qs_offsets.q_off, FALSE,
4057 FALSE,
4058 fence_hit);
4059 }
4060 /* Make end offsets point to the byte after the end of the
4061 alignment */
4062 gap_align->query_stop += init_hsp->offsets.qs_offsets.q_off + 1;
4063 gap_align->subject_stop += init_hsp->offsets.qs_offsets.s_off + 1;
4064 }
4065 }
4066
4067 if (found_start == FALSE) { /* impossible for in-frame */
4068 gap_align->query_start = init_hsp->offsets.qs_offsets.q_off;
4069 gap_align->subject_start = init_hsp->offsets.qs_offsets.s_off;
4070 }
4071 if (found_end == FALSE) { /* impossible for out-of-frame */
4072 gap_align->query_stop = init_hsp->offsets.qs_offsets.q_off;
4073 gap_align->subject_stop = init_hsp->offsets.qs_offsets.s_off;
4074 }
4075
4076 gap_align->score = score_right+score_left;
4077
4078 return 0;
4079 }
4080
4081 /** Converts OOF traceback from a gapped alignment to a GapEditScript.
4082 * This function is for out-of-frame gapping.
4083 * @param rev_prelim_tback Traceback operations for left extension [in]
4084 * @param fwd_prelim_tback Traceback operations for right extension [in]
4085 * @param nucl_align_length Length of alignment on the translated
4086 * nucleotide sequence [in]
4087 * @param edit_script_ptr The resulting edit block [out]
4088 * @return Always zero
4089 */
4090 static Int2
s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock * rev_prelim_tback,GapPrelimEditBlock * fwd_prelim_tback,Int4 nucl_align_length,GapEditScript ** edit_script_ptr)4091 s_BlastOOFTracebackToGapEditScript(GapPrelimEditBlock *rev_prelim_tback,
4092 GapPrelimEditBlock *fwd_prelim_tback,
4093 Int4 nucl_align_length,
4094 GapEditScript** edit_script_ptr)
4095 {
4096 GapEditScript* e_script;
4097 EGapAlignOpType last_op;
4098 Int4 last_num;
4099 GapPrelimEditBlock *tmp_prelim_tback;
4100 Int4 i, num_nuc;
4101 int extra_needed=0;
4102
4103 /* prepend a substitution, since the input sequences were
4104 shifted prior to the OOF alignment */
4105
4106 tmp_prelim_tback = GapPrelimEditBlockNew();
4107 last_op = eGapAlignSub;
4108 last_num = 1;
4109
4110 /* Propagate the extra substitution through the edit script. */
4111
4112 for (i = 0; i < rev_prelim_tback->num_ops; i++) {
4113
4114 EGapAlignOpType next_op = rev_prelim_tback->edit_ops[i].op_type;
4115 Int4 next_num = rev_prelim_tback->edit_ops[i].num;
4116
4117 if (next_op == last_op) {
4118
4119 /* merge consecutive operations that are identical */
4120
4121 last_num += next_num;
4122 }
4123 else if (next_op == eGapAlignIns || next_op == eGapAlignDel) {
4124
4125 /* Propagating the initial extra substitution requires
4126 that edit operations which are not in-frame gaps must
4127 'flow through' an in-frame gap; the last non-gap
4128 operation before the gap will appear after the gap.
4129 This could mean either one or two edit blocks
4130 getting created */
4131
4132 if (last_num > 1) {
4133 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num - 1);
4134 }
4135 GapPrelimEditBlockAdd(tmp_prelim_tback, next_op, next_num);
4136 last_num = 1; /* last_op unchanged; one 'orphan' now */
4137 }
4138 else {
4139
4140 /* write out the last edit operation and
4141 move forward in the script */
4142
4143 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4144 last_op = next_op;
4145 last_num = next_num;
4146 }
4147 }
4148
4149 /* Handle the crossing over from the left extension to
4150 the right extension. Begin by writing out the final
4151 group of ops from the left extension, except for the
4152 one op that must be merged */
4153
4154 last_num--;
4155 if (last_num > 0) {
4156 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, last_num);
4157 }
4158
4159 if (last_op != eGapAlignSub) {
4160
4161 /* the last traceback letter from the left extension must
4162 be folded into the right extension. */
4163
4164 for (i = fwd_prelim_tback->num_ops - 1; i >= 0; i--) {
4165
4166 GapPrelimEditScript *new_script = fwd_prelim_tback->edit_ops + i;
4167
4168 if (new_script->op_type == eGapAlignIns ||
4169 new_script->op_type == eGapAlignDel) {
4170
4171 /* do not merge with in-frame gaps; just write out
4172 the gaps and 'flow through' the last op */
4173
4174 GapPrelimEditBlockAdd(tmp_prelim_tback, new_script->op_type,
4175 new_script->num);
4176 }
4177 else {
4178
4179 /* merge with the first letter of new_script */
4180
4181 last_op += new_script->op_type - eGapAlignSub;
4182 GapPrelimEditBlockAdd(tmp_prelim_tback, last_op, 1);
4183
4184 /* borrow one letter from new_script to complete the
4185 merge, and skip new_script if it only had one letter */
4186 new_script->num--;
4187 if (new_script->num == 0)
4188 i--;
4189 break;
4190 }
4191 }
4192 fwd_prelim_tback->num_ops = i + 1;
4193 }
4194
4195 /* form the final edit block */
4196 e_script = Blast_PrelimEditBlockToGapEditScript(tmp_prelim_tback, fwd_prelim_tback);
4197 GapPrelimEditBlockFree(tmp_prelim_tback);
4198
4199 /* postprocess the edit script */
4200 num_nuc = 0;
4201 for (i=0; i<e_script->size; i++)
4202 {
4203 int total_actions=0;
4204 /* Count the number of nucleotides in the next
4205 traceback operation and delete any traceback operations
4206 that would make the alignment too long. This check is
4207 not needed for in-frame alignment because the
4208 traceback is never changed after it is computed */
4209
4210 last_op = e_script->op_type[i];
4211
4212 if (last_op == eGapAlignIns)
4213 last_op = eGapAlignSub;
4214
4215 total_actions = last_op * e_script->num[i];
4216
4217 if (num_nuc + total_actions >= nucl_align_length) {
4218 e_script->num[i] = (nucl_align_length - num_nuc +
4219 last_op - 1) / last_op;
4220 break; /* We delete the rest of the script. */
4221 }
4222 else {
4223 num_nuc += total_actions;;
4224 }
4225 }
4226 e_script->size = MIN(i+1, e_script->size); /* If we broke out early then we truncate the edit script. */
4227
4228 extra_needed = 0;
4229 for (i=0; i<e_script->size; i++)
4230 {
4231 if (e_script->op_type[i] % 3 != 0 && e_script->num[i] > 1) {
4232 extra_needed += e_script->num[i] - 1;
4233 }
4234 }
4235
4236 if (extra_needed)
4237 {
4238 GapEditScript* new_esp = GapEditScriptNew(extra_needed+e_script->size);
4239 int new_esp_i=0;
4240 for (i=0; i<e_script->size; i++)
4241 {
4242 /* Only one frame shift operation at a time is
4243 allowed in a single edit script element. */
4244 new_esp->num[new_esp_i] = e_script->num[i];
4245 new_esp->op_type[new_esp_i] = e_script->op_type[i];
4246 new_esp_i++;
4247 last_op = e_script->op_type[i];
4248 if (last_op % 3 != 0 && e_script->num[i] > 1) {
4249 Int4 num_ops = e_script->num[i];
4250 int esp_index=0;
4251 new_esp->num[new_esp_i-1] = 1;
4252 for (esp_index = 1; esp_index < num_ops; esp_index++) {
4253 new_esp->num[new_esp_i] = 1;
4254 new_esp->op_type[new_esp_i] = last_op;
4255 new_esp_i++;
4256 }
4257 }
4258 }
4259 e_script = GapEditScriptDelete(e_script);
4260 e_script = new_esp;
4261 }
4262 *edit_script_ptr = e_script;
4263
4264 /* finally, add one to the size of any block of substitutions
4265 that follows a frame shift op */
4266 last_op = e_script->op_type[0];
4267 for (i=1; i<e_script->size; i++)
4268 {
4269 if (e_script->op_type[i] == eGapAlignSub && (last_op % 3) != 0)
4270 (e_script->num[i])++;
4271
4272 last_op = e_script->op_type[i];
4273 }
4274
4275 return 0;
4276 }
4277
BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,const Uint1 * query,const Uint1 * subject,BlastGapAlignStruct * gap_align,const BlastScoringParameters * score_params,Int4 q_start,Int4 s_start,Int4 query_length,Int4 subject_length,Boolean * fence_hit)4278 Int2 BLAST_GappedAlignmentWithTraceback(EBlastProgramType program,
4279 const Uint1* query, const Uint1* subject, BlastGapAlignStruct* gap_align,
4280 const BlastScoringParameters* score_params,
4281 Int4 q_start, Int4 s_start, Int4 query_length, Int4 subject_length,
4282 Boolean * fence_hit)
4283 {
4284 Boolean found_end;
4285 Int4 score_right, score_left, private_q_length, private_s_length;
4286 Int4 q_length, s_length;
4287 Boolean is_ooframe = score_params->options->is_ooframe;
4288 Int2 status = 0;
4289 Boolean switch_seq = FALSE;
4290 GapPrelimEditBlock *fwd_prelim_tback;
4291 GapPrelimEditBlock *rev_prelim_tback;
4292
4293 if (gap_align == NULL)
4294 return -1;
4295
4296 fwd_prelim_tback = gap_align->fwd_prelim_tback;
4297 rev_prelim_tback = gap_align->rev_prelim_tback;
4298 GapPrelimEditBlockReset(fwd_prelim_tback);
4299 GapPrelimEditBlockReset(rev_prelim_tback);
4300
4301 found_end = FALSE;
4302
4303 q_length = query_length;
4304 s_length = subject_length;
4305 if (is_ooframe) {
4306 /* The mixed frame sequence is shifted to the 3rd position, so its
4307 maximal available length for extension is less by 2 than its
4308 total length. */
4309 switch_seq = (program == eBlastTypeBlastx);
4310 if (switch_seq) {
4311 q_length -= CODON_LENGTH - 1;
4312 } else {
4313 s_length -= CODON_LENGTH - 1;
4314 }
4315 }
4316
4317 score_left = 0;
4318
4319 if(is_ooframe) {
4320 /* NB: Left extension does not include starting point corresponding
4321 to the offset pair; the right extension does. */
4322 score_left =
4323 s_OutOfFrameSemiGappedAlignWrap(query+q_start, subject+s_start,
4324 q_start, s_start, &private_q_length, &private_s_length,
4325 FALSE, rev_prelim_tback, gap_align, score_params, q_start,
4326 TRUE, switch_seq);
4327 gap_align->query_start = q_start - private_q_length;
4328 gap_align->subject_start = s_start - private_s_length;
4329 } else {
4330 /* NB: The left extension includes the starting point
4331 [q_start,s_start]; the right extension does not. */
4332 // <AG> score_left =
4333 // Blast_SemiGappedAlign(query, subject, q_start+1, s_start+1,
4334 // &private_q_length, &private_s_length, FALSE, rev_prelim_tback,
4335 // gap_align, score_params, q_start, FALSE, TRUE,
4336 // fence_hit);
4337
4338 score_left = ALIGN_EX(query, subject, q_start+1, s_start+1, &private_q_length, &private_s_length, rev_prelim_tback,
4339 gap_align,
4340 score_params, q_start, FALSE /*reversed*/, TRUE /*reverse_sequence*/,
4341 fence_hit);
4342
4343 gap_align->query_start = q_start - private_q_length + 1;
4344 gap_align->subject_start = s_start - private_s_length + 1;
4345 }
4346
4347 score_right = 0;
4348
4349 if ((! (fence_hit && *fence_hit)) &&
4350 (q_start < q_length) &&
4351 (s_start < s_length)) {
4352
4353 found_end = TRUE;
4354 if(is_ooframe) {
4355 score_right =
4356 s_OutOfFrameSemiGappedAlignWrap(query+q_start-1, subject+s_start-1,
4357 q_length-q_start, s_length-s_start,
4358 &private_q_length, &private_s_length, FALSE, fwd_prelim_tback,
4359 gap_align, score_params, q_start, FALSE, switch_seq);
4360 } else {
4361 // <AG> score_right =
4362 // Blast_SemiGappedAlign(query+q_start, subject+s_start,
4363 // q_length-q_start-1, s_length-s_start-1, &private_q_length,
4364 // &private_s_length, FALSE, fwd_prelim_tback, gap_align,
4365 // score_params, q_start, FALSE, FALSE,
4366 // fence_hit);
4367 score_right =
4368 ALIGN_EX(query+q_start, subject+s_start,
4369 q_length-q_start-1, s_length-s_start-1, &private_q_length,
4370 &private_s_length, fwd_prelim_tback, gap_align,
4371 score_params, q_start, FALSE, FALSE,
4372 fence_hit);
4373 }
4374
4375 gap_align->query_stop = q_start + private_q_length + 1;
4376 gap_align->subject_stop = s_start + private_s_length + 1;
4377 }
4378
4379 if (found_end == FALSE) {
4380 gap_align->query_stop = q_start - 1;
4381 gap_align->subject_stop = s_start - 1;
4382 }
4383
4384 if(is_ooframe) {
4385 Int4 nucl_align_length;
4386 if (program == eBlastTypeBlastx) {
4387 nucl_align_length = gap_align->query_stop -
4388 gap_align->query_start;
4389 }
4390 else {
4391 nucl_align_length = gap_align->subject_stop -
4392 gap_align->subject_start;
4393 }
4394 status = s_BlastOOFTracebackToGapEditScript(rev_prelim_tback,
4395 fwd_prelim_tback, nucl_align_length,
4396 &gap_align->edit_script);
4397 } else {
4398 Int4 i;
4399 GapEditScript *esp = Blast_PrelimEditBlockToGapEditScript(
4400 rev_prelim_tback,
4401 fwd_prelim_tback);
4402
4403 /* rarely (typically when the scoring system changes between
4404 the score-only and traceback stages, as happens with
4405 composition-based statistics) it is possible to compute an
4406 optimal alignment with a leading or trailing gap. Prune
4407 these unneeded gaps here and update the score and
4408 alignment boundaries */
4409
4410 gap_align->edit_script = esp;
4411 if (esp) {
4412 while (esp->size && esp->op_type[0] != eGapAlignSub) {
4413 score_left += score_params->gap_open +
4414 esp->num[0] * score_params->gap_extend;
4415
4416 if (esp->op_type[0] == eGapAlignDel)
4417 gap_align->subject_start += esp->num[0];
4418 else
4419 gap_align->query_start += esp->num[0];
4420
4421 for (i = 1; i < esp->size; i++) {
4422 esp->op_type[i-1] = esp->op_type[i];
4423 esp->num[i-1] = esp->num[i];
4424 }
4425 esp->size--;
4426 }
4427 i = esp->size;
4428 while (esp->size && esp->op_type[i-1] != eGapAlignSub) {
4429 score_right += score_params->gap_open +
4430 esp->num[i-1] * score_params->gap_extend;
4431
4432 if (esp->op_type[i-1] == eGapAlignDel)
4433 gap_align->subject_stop -= esp->num[i-1];
4434 else
4435 gap_align->query_stop -= esp->num[i-1];
4436
4437 esp->size--;
4438 i--;
4439 ASSERT(esp->size == i);
4440 }
4441 }
4442 }
4443
4444 gap_align->score = score_right + score_left;
4445 return status;
4446 }
4447
BLAST_GetUngappedHSPList(BlastInitHitList * init_hitlist,BlastQueryInfo * query_info,BLAST_SequenceBlk * subject,const BlastHitSavingOptions * hit_options,BlastHSPList ** hsp_list_ptr)4448 Int2 BLAST_GetUngappedHSPList(BlastInitHitList* init_hitlist,
4449 BlastQueryInfo* query_info,
4450 BLAST_SequenceBlk* subject,
4451 const BlastHitSavingOptions* hit_options,
4452 BlastHSPList** hsp_list_ptr)
4453 {
4454 BlastHSPList* hsp_list = NULL;
4455 Int4 index;
4456 BlastInitHSP* init_hsp;
4457 const int kHspNumMax = BlastHspNumMax(FALSE, hit_options);
4458
4459 /* The BlastHSPList structure can be allocated and passed from outside */
4460 if (*hsp_list_ptr != NULL)
4461 hsp_list = *hsp_list_ptr;
4462
4463 if (!init_hitlist) {
4464 if (!hsp_list)
4465 *hsp_list_ptr = NULL;
4466 else
4467 hsp_list->hspcnt = 0;
4468 return 0;
4469 }
4470
4471 for (index = 0; index < init_hitlist->total; ++index) {
4472 BlastHSP* new_hsp;
4473 BlastUngappedData* ungapped_data=NULL;
4474 Int4 context = 0;
4475 init_hsp = &init_hitlist->init_hsp_array[index];
4476 if (!init_hsp->ungapped_data)
4477 continue;
4478
4479 if (!hsp_list) {
4480 hsp_list = Blast_HSPListNew(kHspNumMax);
4481 *hsp_list_ptr = hsp_list;
4482 }
4483 /* Adjust the initial HSP's coordinates in case of concatenated
4484 multiple queries/strands/frames */
4485 context = s_GetUngappedHSPContext(query_info, init_hsp);
4486 s_AdjustInitialHSPOffsets(init_hsp,
4487 query_info->contexts[context].query_offset);
4488 ungapped_data = init_hsp->ungapped_data;
4489 Blast_HSPInit(ungapped_data->q_start,
4490 ungapped_data->length+ungapped_data->q_start,
4491 ungapped_data->s_start,
4492 ungapped_data->length+ungapped_data->s_start,
4493 init_hsp->offsets.qs_offsets.q_off,
4494 init_hsp->offsets.qs_offsets.s_off,
4495 context, query_info->contexts[context].frame,
4496 subject->frame, ungapped_data->score, NULL, &new_hsp);
4497 Blast_HSPListSaveHSP(hsp_list, new_hsp);
4498 }
4499
4500 /* Sort the HSP array by score */
4501 Blast_HSPListSortByScore(hsp_list);
4502
4503 return 0;
4504 }
4505
4506