1 static char const rcsid[] = "$Id: mbalign.c,v 6.43 2006/02/02 17:57:34 papadopo Exp $";
2
3 /* $Id: mbalign.c,v 6.43 2006/02/02 17:57:34 papadopo Exp $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: $RCSfile: mbalign.c,v $
29 *
30 * Author: Webb Miller and Co.
31 * Adopted for NCBI standard libraries by Sergey Shavirin
32 *
33 * Initial Creation Date: 10/27/1999
34 *
35 * $Revision: 6.43 $
36 *
37 * File Description:
38 * Alignment functions for Mega Blast program
39 *
40 * $Log: mbalign.c,v $
41 * Revision 6.43 2006/02/02 17:57:34 papadopo
42 * fix a rare bug computing greedy traceback
43 *
44 * Revision 6.42 2005/05/03 20:29:17 papadopo
45 * remove use of ICEIL macro; use either a true integer ceiling or (quotient+1) as circumstances warrant
46 *
47 * Revision 6.41 2003/10/29 17:46:59 dondosha
48 * Allow 2-stage greedy extension in megablast
49 *
50 * Revision 6.40 2003/05/30 17:25:36 coulouri
51 * add rcsid
52 *
53 * Revision 6.39 2003/05/13 16:02:53 coulouri
54 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
55 *
56 * Revision 6.38 2003/01/30 19:40:13 dondosha
57 * No need to add gap extension penalty to gap open in greedy alignment functions
58 *
59 * Revision 6.37 2002/08/01 21:02:12 dondosha
60 * Added a sanity check in GreedyAlignMemFree
61 *
62 * Revision 6.36 2002/03/20 19:56:34 dondosha
63 * Check entire list of readdb structures for maximal length when allocating memory for greedy algorithm
64 *
65 * Revision 6.35 2001/12/28 20:38:41 dondosha
66 * Moved Mega BLAST related parameters into a separate structure
67 *
68 * Revision 6.34 2001/11/14 23:43:40 dondosha
69 * Subject sequence now already in blastna encoding, no need to convert
70 *
71 * Revision 6.33 2001/09/18 16:49:24 dondosha
72 * Removed unneeded functions, eliminated mbutils.h header
73 *
74 * Revision 6.32 2001/07/20 19:20:47 dondosha
75 * Allow packed or unpacked subject sequence
76 *
77 * Revision 6.31 2001/05/23 20:14:47 dondosha
78 * Corrected the row shift in non-affine greedy algorithm
79 *
80 * Revision 6.30 2001/05/07 22:05:24 dondosha
81 * Small correction in affine extension
82 *
83 * Revision 6.29 2001/05/03 21:48:28 dondosha
84 * Handle some cases when memory allocation fails
85 *
86 * Revision 6.28 2001/05/02 21:32:16 dondosha
87 * Return null from GreedyAlignMemFree
88 *
89 * Revision 6.27 2001/03/30 22:12:11 dondosha
90 * Improved protection from moving beyond sequence boundaries
91 *
92 * Revision 6.26 2001/02/12 19:47:49 dondosha
93 * Make space for the greedy algorithm a linked list, to allow its extention
94 *
95 * Revision 6.25 2001/01/25 18:50:53 dondosha
96 * Check for row going out of bounds in greedy extension routines
97 *
98 * Revision 6.24 2001/01/11 20:22:16 dondosha
99 * Allow for expansion of the space array for gapped extensions
100 *
101 * Revision 6.23 2000/12/07 17:45:14 dondosha
102 * Use actual subject sequence length in GreedyAlignMemAlloc for 2 Sequences engine
103 *
104 * Revision 6.22 2000/11/29 16:31:20 dondosha
105 * Database sequence length cannot be more than MAX_DBSEQ_LEN
106 *
107 * Revision 6.21 2000/11/03 20:58:35 dondosha
108 * Changed parameter one_line_results to no_traceback
109 *
110 * Revision 6.20 2000/10/13 16:57:16 dondosha
111 * Corrected behavior at the border between queries
112 *
113 * Revision 6.19 2000/10/02 14:48:49 dondosha
114 * Bug in previous change
115 *
116 * Revision 6.18 2000/09/29 22:16:48 dondosha
117 * Check if memory allocation failed in GreedyAlignMemAlloc
118 *
119 * Revision 6.17 2000/09/28 14:26:34 dondosha
120 * Moved all memory allocation away from the low level greedy algorithm
121 *
122 * Revision 6.16 2000/08/28 17:15:36 dondosha
123 * The sentinel value between queries changed to gap value
124 *
125 * Revision 6.15 2000/08/03 22:15:29 dondosha
126 * Use correct default gap extension parameter when allocating memory for affine case
127 *
128 * Revision 6.14 2000/05/24 19:51:28 dondosha
129 * Do not use readdb when megablast is used for two sequences
130 *
131 * Revision 6.13 2000/05/17 17:12:39 dondosha
132 * Removed unused variable
133 *
134 * Revision 6.12 2000/05/12 19:45:32 dondosha
135 * Subject sequence is now passed in ncbi4na encoding
136 *
137 * Revision 6.11 2000/05/01 21:23:40 dondosha
138 * Changed greedy_align_basic to MegaBlastGreedyAlign, added MegaBlastAffineGreedyAlign
139 *
140 * Revision 6.10 2000/04/19 18:53:31 dondosha
141 * Bug fix: allow to extend through gap and masked characters, but not end-points
142 *
143 * Revision 6.9 2000/04/07 18:52:32 dondosha
144 * Do not allocate new space for flast_d[0,1] every time
145 *
146 * Revision 6.8 2000/04/05 16:30:07 dondosha
147 * Fixed a bug that allowed crossing the end of a strand or query
148 *
149 * Revision 6.7 2000/03/30 20:38:33 dondosha
150 * Returned one modification from 6.5 that was lost in 6.6
151 *
152 * Revision 6.6 2000/03/29 22:03:14 dondosha
153 * Made edit_script_new and edit_script_append public for use in MegaBlastNtWordExtend
154 *
155 * Revision 6.5 2000/02/01 22:42:11 dondosha
156 * Moved some definitions to mbalign.h; added 3 new routines
157 *
158 * Revision 6.4 2000/01/26 16:39:18 beloslyu
159 * change space_t to space_type because HP-UX compiler got confused with ANSI C space_t
160 *
161 * Revision 6.3 1999/11/24 20:38:09 shavirin
162 * Added possibility to produce Traditional Blast Output.
163 *
164 * Revision 6.2 1999/11/03 19:53:57 shavirin
165 * Fixed problem with Realloc() function.
166 *
167 * Revision 6.1 1999/10/29 15:37:12 shavirin
168 * Initial revision.
169 *
170 *
171 * Initial revision.
172 *
173 * ==========================================================================
174 */
175
176 #include <mbalign.h>
177 #include <blastdef.h>
178
179 /* -------- From original file edit.c ------------- */
180
edit_val_get(edit_op_t op)181 static Uint4 edit_val_get(edit_op_t op)
182 {
183 return op >> 2;
184 }
edit_opc_get(edit_op_t op)185 static Uint4 edit_opc_get(edit_op_t op)
186 {
187 return op & EDIT_OP_MASK;
188 }
189
edit_op_cons(Uint4 op,Uint4 val)190 static edit_op_t edit_op_cons(Uint4 op, Uint4 val)
191 {
192 return (val << 2) | (op & EDIT_OP_MASK);
193 }
194
edit_op_inc(edit_op_t op,Uint4 n)195 static edit_op_t edit_op_inc(edit_op_t op, Uint4 n)
196 {
197 return edit_op_cons(edit_opc_get(op), edit_val_get(op) + n);
198 }
199
edit_op_inc_last(edit_script_t * es,Uint4 n)200 static edit_op_t edit_op_inc_last(edit_script_t *es, Uint4 n)
201 {
202 edit_op_t *last;
203 ASSERT (es->num > 0);
204 last = &(es->op[es->num-1]);
205 *last = edit_op_inc(*last, n);
206 return *last;
207 }
208
edit_script_ready(edit_script_t * es,Uint4 n)209 static Int4 edit_script_ready(edit_script_t *es, Uint4 n)
210 {
211 edit_op_t *p;
212 Uint4 m = n + n/2;
213
214 if (es->size <= n) {
215 p = Realloc(es->op, m*sizeof(edit_op_t));
216 if (p == 0) {
217 return 0;
218 } else {
219 es->op = p;
220 es->size = m;
221 }
222 }
223 return 1;
224 }
225
edit_script_readyplus(edit_script_t * es,Uint4 n)226 static Int4 edit_script_readyplus(edit_script_t *es, Uint4 n)
227 {
228 if (es->size - es->num <= n)
229 return edit_script_ready(es, n + es->num);
230 return 1;
231 }
232
edit_script_put(edit_script_t * es,Uint4 op,Uint4 n)233 static Int4 edit_script_put(edit_script_t *es, Uint4 op, Uint4 n)
234 {
235 if (!edit_script_readyplus(es, 2))
236 return 0;
237 es->last = op;
238 ASSERT(op != 0);
239 es->op[es->num] = edit_op_cons(op, n);
240 es->num += 1;
241 es->op[es->num] = 0; /* sentinal */
242
243 return 1;
244 }
245
edit_script_init(edit_script_t * es)246 static edit_script_t *edit_script_init(edit_script_t *es)
247 {
248 es->op = 0;
249 es->size = es->num = 0;
250 es->last = 0;
251 edit_script_ready(es, 8);
252 return es;
253 }
edit_script_first(edit_script_t * es)254 static edit_op_t *edit_script_first(edit_script_t *es)
255 {
256 return es->num > 0 ? &es->op[0] : 0;
257 }
258
edit_script_next(edit_script_t * es,edit_op_t * op)259 static edit_op_t *edit_script_next(edit_script_t *es, edit_op_t *op)
260 {
261 /* XXX - assumes flat address space */
262 if (&es->op[0] <= op && op < &es->op[es->num-1])
263 return op+1;
264 else
265 return 0;
266 }
edit_script_more(edit_script_t * data,Uint4 op,Uint4 k)267 static Int4 edit_script_more(edit_script_t *data, Uint4 op, Uint4 k)
268 {
269 if (op == EDIT_OP_ERR) {
270 ErrPostEx(SEV_FATAL, 1, 0,
271 "edit_script_more: bad opcode %d:%d", op, k);
272 return -1;
273 }
274
275 if (edit_opc_get(data->last) == op)
276 edit_op_inc_last(data, k);
277 else
278 edit_script_put(data, op, k);
279
280 return 0;
281 }
282 /* External */
edit_script_append(edit_script_t * es,edit_script_t * et)283 edit_script_t *edit_script_append(edit_script_t *es, edit_script_t *et)
284 {
285 edit_op_t *op;
286
287 for (op = edit_script_first(et); op; op = edit_script_next(et, op))
288 edit_script_more(es, edit_opc_get(*op), edit_val_get(*op));
289
290 return es;
291 }
292
293 /* External */
edit_script_new(void)294 edit_script_t *edit_script_new(void)
295 {
296 edit_script_t *es = MemNew(sizeof(*es));
297 if (!es)
298 return 0;
299
300 return edit_script_init(es);
301 }
302
303 /* External */
edit_script_free(edit_script_t * es)304 edit_script_t *edit_script_free(edit_script_t *es)
305 {
306 if (es) {
307 if (es->op)
308 MemFree(es->op);
309 MemSet(es, 0, sizeof(*es));
310 MemFree(es);
311 }
312 return 0;
313 }
314
edit_script_del(edit_script_t * data,Uint4 k)315 Int4 edit_script_del(edit_script_t *data, Uint4 k)
316 {
317 return edit_script_more(data, EDIT_OP_DEL, k);
318 }
319
edit_script_ins(edit_script_t * data,Uint4 k)320 Int4 edit_script_ins(edit_script_t *data, Uint4 k)
321 {
322 return edit_script_more(data, EDIT_OP_INS, k);
323 }
edit_script_rep(edit_script_t * data,Uint4 k)324 Int4 edit_script_rep(edit_script_t *data, Uint4 k)
325 {
326 return edit_script_more(data, EDIT_OP_REP, k);
327 }
328
edit_script_reverse_inplace(edit_script_t * es)329 edit_script_t *edit_script_reverse_inplace(edit_script_t *es)
330 {
331 Uint4 i;
332 const Uint4 num = es->num;
333 const Uint4 mid = num/2;
334 const Uint4 end = num-1;
335
336 for (i = 0; i < mid; ++i) {
337 const edit_op_t t = es->op[i];
338 es->op[i] = es->op[end-i];
339 es->op[end-i] = t;
340 }
341 return es;
342 }
343
new_mb_space()344 MBSpacePtr new_mb_space()
345 {
346 MBSpacePtr p;
347 Int4 amount;
348
349 p = Nlm_Malloc(sizeof(MBSpace));
350 amount = MAX_SPACE;
351 p->space_array = Nlm_Malloc(sizeof(ThreeVal)*amount);
352 if (p->space_array == NULL) {
353 p = MemFree(p);
354 return NULL;
355 }
356 p->used = 0;
357 p->size = amount;
358 p->next = NULL;
359
360 return p;
361 }
362
refresh_mb_space(MBSpacePtr sp)363 void refresh_mb_space(MBSpacePtr sp)
364 {
365 while (sp) {
366 sp->used = 0;
367 sp = sp->next;
368 }
369 }
370
free_mb_space(MBSpacePtr sp)371 void free_mb_space(MBSpacePtr sp)
372 {
373 MBSpacePtr next_sp;
374
375 while (sp) {
376 next_sp = sp->next;
377 sp->space_array = MemFree(sp->space_array);
378 sp = MemFree(sp);
379 sp = next_sp;
380 }
381 }
382
get_mb_space(MBSpacePtr S,Int4 amount)383 ThreeValPtr get_mb_space(MBSpacePtr S, Int4 amount)
384 {
385 ThreeValPtr s;
386 if (amount < 0)
387 return NULL;
388
389 while (S->used+amount > S->size) {
390 if (S->next == NULL)
391 if ((S->next = new_mb_space()) == NULL) {
392 ErrPostEx(SEV_WARNING, 0, 0, "Cannot get new space for greedy extension");
393 return NULL;
394 }
395 S = S->next;
396 }
397
398 s = S->space_array+S->used;
399 S->used += amount;
400
401 return s;
402 }
403 /* ----- */
404
gcd(Int4 a,Int4 b)405 static Int4 gcd(Int4 a, Int4 b)
406 {
407 Int4 c;
408 if (a < b) {
409 c = a;
410 a = b; b = c;
411 }
412 while ((c = a % b) != 0) {
413 a = b; b = c;
414 }
415
416 return b;
417 }
418
419
gdb3(Int4Ptr a,Int4Ptr b,Int4Ptr c)420 static Int4 gdb3(Int4Ptr a, Int4Ptr b, Int4Ptr c)
421 {
422 Int4 g;
423 if (*b == 0) g = gcd(*a, *c);
424 else g = gcd(*a, gcd(*b, *c));
425 if (g > 1) {
426 *a /= g;
427 *b /= g;
428 *c /= g;
429 }
430 return g;
431 }
432
get_lastC(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 Mis_cost,Int4Ptr row1)433 static Int4 get_lastC(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
434 Int4Ptr d, Int4 diag, Int4 Mis_cost, Int4Ptr row1)
435 {
436 Int4 row;
437
438 if (diag >= lower[(*d)-Mis_cost] && diag <= upper[(*d)-Mis_cost]) {
439 row = flast_d[(*d)-Mis_cost][diag].C;
440 if (row >= MAX(flast_d[*d][diag].I, flast_d[*d][diag].D)) {
441 *d = *d-Mis_cost;
442 *row1 = row;
443 return sC;
444 }
445 }
446 if (flast_d[*d][diag].I > flast_d[*d][diag].D) {
447 *row1 = flast_d[*d][diag].I;
448 return sI;
449 } else {
450 *row1 = flast_d[*d][diag].D;
451 return sD;
452 }
453 }
454
get_last_ID(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 GO_cost,Int4 GE_cost,Int4 IorD)455 static Int4 get_last_ID(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
456 Int4Ptr d, Int4 diag, Int4 GO_cost,
457 Int4 GE_cost, Int4 IorD)
458 {
459 Int4 ndiag;
460 Int4 row;
461
462 if (IorD == sI) { ndiag = diag -1;} else ndiag = diag+1;
463 if (ndiag >= lower[(*d)-GE_cost] && ndiag <=upper[(*d)-GE_cost])
464 row = (IorD == sI)? flast_d[(*d)-GE_cost][ndiag].I: flast_d[(*d)-GE_cost][ndiag].D;
465 else row = -100;
466 if (ndiag >= lower[(*d)-GO_cost-GE_cost] && ndiag <= upper[(*d)-GO_cost-GE_cost] && row < flast_d[(*d)-GO_cost-GE_cost][ndiag].C) {
467 *d = (*d)-GO_cost-GE_cost;
468 return sC;
469 }
470 *d = (*d)-GE_cost;
471 return IorD;
472 }
473
get_lastI(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 GO_cost,Int4 GE_cost)474 static Int4 get_lastI(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
475 Int4Ptr d, Int4 diag, Int4 GO_cost, Int4 GE_cost)
476 {
477 return get_last_ID(flast_d, lower, upper, d, diag, GO_cost, GE_cost, sI);
478 }
479
480
get_lastD(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 GO_cost,Int4 GE_cost)481 static int get_lastD(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
482 Int4Ptr d, Int4 diag, Int4 GO_cost, Int4 GE_cost)
483 {
484 return get_last_ID(flast_d, lower, upper, d, diag, GO_cost, GE_cost, sD);
485 }
486
487 /* --- From file align.c --- */
488 /* ----- */
489
get_last(Int4 ** flast_d,Int4 d,Int4 diag,Int4 * row1)490 static Int4 get_last(Int4 **flast_d, Int4 d, Int4 diag, Int4 *row1)
491 {
492 if (flast_d[d-1][diag-1] > MAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
493 *row1 = flast_d[d-1][diag-1];
494 return diag-1;
495 }
496 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
497 *row1 = flast_d[d-1][diag];
498 return diag;
499 }
500 *row1 = flast_d[d-1][diag+1];
501 return diag+1;
502 }
503
504
505 /* ------ Functions, that create SeqAlignPtr from gap_align_ptr */
GreedyAlignMemAlloc(BlastSearchBlkPtr search)506 BlastSearchBlkPtr GreedyAlignMemAlloc(BlastSearchBlkPtr search)
507 {
508 Int4 max_d, max_d_1, Xdrop, d_diff, max_len, max_cost, gd, i;
509 Int4 reward, penalty, gap_open, gap_extend;
510 Int4 Mis_cost, GE_cost;
511
512 if (search == NULL)
513 return search;
514
515 if (search->sbp->reward % 2 == 1) {
516 reward = 2*search->sbp->reward;
517 penalty = -2*search->sbp->penalty;
518 Xdrop = 2*search->pbp->gap_x_dropoff;
519 gap_open = 2*search->pbp->gap_open;
520 gap_extend = 2*search->pbp->gap_extend;
521 } else {
522 reward = search->sbp->reward;
523 penalty = -search->sbp->penalty;
524 Xdrop = search->pbp->gap_x_dropoff;
525 gap_open = search->pbp->gap_open;
526 gap_extend = search->pbp->gap_extend;
527 }
528
529 if (gap_open == 0 && gap_extend == 0)
530 gap_extend = reward / 2 + penalty;
531
532 if (search->rdfp) {
533 ReadDBFILEPtr rdfp;
534 for (rdfp = search->rdfp, max_len = 0; rdfp; rdfp = rdfp->next)
535 max_len = MAX(max_len, readdb_get_maxlen(rdfp));
536 } else
537 max_len = search->subject->length;
538
539 max_len = MIN(max_len, MAX_DBSEQ_LEN);
540
541 max_d = (Int4) (max_len / ERROR_FRACTION + 1);
542
543 search->abmp = (GreedyAlignMemPtr) MemNew(sizeof(GreedyAlignMem));
544
545 if (search->pbp->gap_open==0 && search->pbp->gap_extend==0) {
546 d_diff = (Xdrop+reward/2) / (penalty+reward) + 1;
547
548 search->abmp->flast_d = (Int4Ptr PNTR) Malloc((max_d + 2) * sizeof(Int4Ptr));
549 if (search->abmp->flast_d == NULL) {
550 search->abmp = MemFree(search->abmp);
551 return search;
552 }
553 search->abmp->flast_d[0] = Malloc((max_d + max_d + 6) * sizeof(Int4) * 2);
554 if (search->abmp->flast_d[0] == NULL) {
555 ErrPostEx(SEV_WARNING, 0, 0, "Failed to allocate %ld bytes for abmp", (max_d + max_d + 6) * sizeof(Int4) * 2);
556 search->abmp = GreedyAlignMemFree(search->abmp);
557 return search;
558 }
559
560 search->abmp->flast_d[1] = search->abmp->flast_d[0] + max_d + max_d + 6;
561 search->abmp->flast_d_affine = NULL;
562 search->abmp->uplow_free = NULL;
563 } else {
564 search->abmp->flast_d = NULL;
565 Mis_cost = reward + penalty;
566 GE_cost = gap_extend+reward/2;
567 max_d_1 = max_d;
568 max_d *= GE_cost;
569 max_cost = MAX(Mis_cost, gap_open+GE_cost);
570 gd = gdb3(&Mis_cost, &gap_open, &GE_cost);
571 d_diff = (Xdrop+reward/2) / gd + 1;
572 search->abmp->uplow_free = MemNew(sizeof(Int4)*2*(max_d+1+max_cost));
573 search->abmp->flast_d_affine = (ThreeValPtr PNTR)
574 Malloc((MAX(max_d, max_cost) + 2) * sizeof(ThreeValPtr));
575 if (!search->abmp->uplow_free || !search->abmp->flast_d_affine) {
576 search->abmp = GreedyAlignMemFree(search->abmp);
577 return search;
578 }
579 search->abmp->flast_d_affine[0] =
580 MemNew((2*max_d_1 + 6) * sizeof(ThreeVal) * (max_cost+1));
581 for (i = 1; i <= max_cost; i++)
582 search->abmp->flast_d_affine[i] =
583 search->abmp->flast_d_affine[i-1] + 2*max_d_1 + 6;
584 if (!search->abmp->flast_d_affine || !search->abmp->flast_d_affine[0])
585 search->abmp = GreedyAlignMemFree(search->abmp);
586 }
587 search->abmp->max_row_free = Malloc(sizeof(Int4) * (max_d + 1 + d_diff));
588 if (!search->pbp->mb_params->no_traceback)
589 search->abmp->space = new_mb_space();
590 if (!search->abmp->max_row_free ||
591 (!search->pbp->mb_params->no_traceback && !search->abmp->space))
592 /* Failure in one of the memory allocations */
593 search->abmp = GreedyAlignMemFree(search->abmp);
594
595 return search;
596 }
597
GreedyAlignMemFree(GreedyAlignMemPtr abmp)598 GreedyAlignMemPtr GreedyAlignMemFree(GreedyAlignMemPtr abmp)
599 {
600 if (abmp->flast_d) {
601 MemFree(abmp->flast_d[0]);
602 MemFree(abmp->flast_d);
603 } else {
604 if (abmp->flast_d_affine) {
605 MemFree(abmp->flast_d_affine[0]);
606 MemFree(abmp->flast_d_affine);
607 }
608 MemFree(abmp->uplow_free);
609 }
610 MemFree(abmp->max_row_free);
611 if (abmp->space)
612 free_mb_space(abmp->space);
613 abmp = MemFree(abmp);
614 return abmp;
615 }
616
617 /*
618 Version to search a (possibly) packed nucl. sequence against
619 an unpacked sequence. s2 is the packed nucl. sequence.
620 s1 can be packed or unpacked. If rem == 4, then s1 is unpacked.
621 len2 corresponds to the unpacked (true) length.
622
623 * Basic O(ND) time, O(N) space, alignment function.
624 * Parameters:
625 * s1, len1 - first sequence and its length
626 * s2, len2 - second sequence and its length
627 * reverse - direction of alignment
628 * xdrop_threshold -
629 * mismatch_cost -
630 * e1, e2 - endpoint of the computed alignment
631 * edit_script -
632 * rem - offset shift within a packed sequence
633 */
MegaBlastGreedyAlign(const UcharPtr s1,Int4 len1,const UcharPtr s2,Int4 len2,Boolean reverse,Int4 xdrop_threshold,Int4 match_cost,Int4 mismatch_cost,Int4Ptr e1,Int4Ptr e2,GreedyAlignMemPtr abmp,edit_script_t * S,Uint1 rem)634 Int4 MegaBlastGreedyAlign(const UcharPtr s1, Int4 len1,
635 const UcharPtr s2, Int4 len2,
636 Boolean reverse, Int4 xdrop_threshold,
637 Int4 match_cost, Int4 mismatch_cost,
638 Int4Ptr e1, Int4Ptr e2,
639 GreedyAlignMemPtr abmp, edit_script_t *S,
640 Uint1 rem)
641 {
642 Int4 col, /* column number */
643 d, /* current distance */
644 k, /* current diagonal */
645 flower, fupper, /* boundaries for searching diagonals */
646 row, /* row number */
647 MAX_D, /* maximum cost */
648 ORIGIN,
649 return_val = 0;
650 Int4Ptr PNTR flast_d = abmp->flast_d; /* rows containing the last d */
651 Int4Ptr max_row; /* reached for cost d=0, ... len1. */
652
653 Int4 X_pen = xdrop_threshold;
654 Int4 M_half = match_cost/2;
655 Int4 Op_cost = mismatch_cost + M_half*2;
656 Int4 D_diff = (X_pen+M_half) / Op_cost + 1;
657
658 Int4 x, cur_max, b_diag = 0, best_diag = INT_MAX/2;
659 Int4Ptr max_row_free = abmp->max_row_free;
660 Char nlower = 0, nupper = 0;
661 MBSpacePtr space = abmp->space;
662 Int4 max_len = len2;
663
664 MAX_D = (Int4) (len1/ERROR_FRACTION + 1);
665 ORIGIN = MAX_D + 2;
666 *e1 = *e2 = 0;
667
668 if (reverse) {
669 if (!(rem & 4)) {
670 for (row = 0; row < len2 && row < len1 &&
671 (s2[len2-1-row] ==
672 READDB_UNPACK_BASE_N(s1[(len1-1-row)/4],
673 3-(len1-1-row)%4));
674 row++)
675 /*empty*/ ;
676 } else {
677 for (row = 0; row < len2 && row < len1 && (s2[len2-1-row] == s1[len1-1-row]); row++)
678 /*empty*/ ;
679 }
680 } else {
681 if (!(rem & 4)) {
682 for (row = 0; row < len2 && row < len1 &&
683 (s2[row] ==
684 READDB_UNPACK_BASE_N(s1[(row+rem)/4],
685 3-(row+rem)%4));
686 row++)
687 /*empty*/ ;
688 } else {
689 for (row = 0; row < len2 && row < len1 && (s2[row] == s1[row]); row++)
690 /*empty*/ ;
691 }
692 }
693 *e1 = row;
694 *e2 = row;
695 if (row == len1) {
696 if (S != NULL)
697 edit_script_rep(S, row);
698 /* hit last row; stop search */
699 return 0;
700 }
701 if (S==NULL)
702 space = 0;
703 else if (!space)
704 abmp->space = space = new_mb_space();
705 else
706 refresh_mb_space(space);
707
708 max_row = max_row_free + D_diff;
709 for (k = 0; k < D_diff; k++)
710 max_row_free[k] = 0;
711
712 flast_d[0][ORIGIN] = row;
713 max_row[0] = (row + row)*M_half;
714
715 flower = ORIGIN - 1;
716 fupper = ORIGIN + 1;
717
718 d = 1;
719 while (d <= MAX_D) {
720 Int4 fl0, fu0;
721 flast_d[d - 1][flower - 1] = flast_d[d - 1][flower] = -1;
722 flast_d[d - 1][fupper] = flast_d[d - 1][fupper + 1] = -1;
723 x = max_row[d - D_diff] + Op_cost * d - X_pen;
724 x = (Int4)ceil(x/M_half);
725 cur_max = 0;
726 fl0 = flower;
727 fu0 = fupper;
728 for (k = fl0; k <= fu0; k++) {
729 row = MAX(flast_d[d - 1][k + 1], flast_d[d - 1][k]) + 1;
730 row = MAX(row, flast_d[d - 1][k - 1]);
731 col = row + k - ORIGIN;
732 if (row + col >= x)
733 fupper = k;
734 else {
735 if (k == flower)
736 flower++;
737 else
738 flast_d[d][k] = -1;
739 continue;
740 }
741
742 if (row > max_len || row < 0) {
743 flower = k+1; nlower = 1;
744 } else {
745 /* Slide down the diagonal. Don't do this if reached
746 the end point, which has value 0x0f */
747 if (reverse) {
748 if (s2[len2-row] != 0x0f) {
749 if (!(rem & 4))
750 while (row < len2 && col < len1 && s2[len2-1-row] ==
751 READDB_UNPACK_BASE_N(s1[(len1-1-col)/4],
752 3-(len1-1-col)%4)) {
753 ++row;
754 ++col;
755 }
756 else
757 while (row < len2 && col < len1 && s2[len2-1-row] == s1[len1-1-col]) {
758 ++row;
759 ++col;
760 }
761 } else {
762 max_len = row;
763 flower = k+1; nlower = 1;
764 }
765 } else if (s2[row-1] != 0x0f) {
766 if (!(rem & 4)) {
767 while (row < len2 && col < len1 && s2[row] ==
768 READDB_UNPACK_BASE_N(s1[(col+rem)/4],
769 3-(col+rem)%4)) {
770 ++row;
771 ++col;
772 }
773 } else {
774 while (row < len2 && col < len1 && s2[row] == s1[col]) {
775 ++row;
776 ++col;
777 }
778 }
779 } else {
780 max_len = row;
781 flower = k+1; nlower = 1;
782 }
783 }
784 flast_d[d][k] = row;
785 if (row + col > cur_max) {
786 cur_max = row + col;
787 b_diag = k;
788 }
789 if (row == len2) {
790 flower = k+1; nlower = 1;
791 }
792 if (col == len1) {
793 fupper = k-1; nupper = 1;
794 }
795 }
796 k = cur_max*M_half - d * Op_cost;
797 if (max_row[d - 1] < k) {
798 max_row[d] = k;
799 return_val = d;
800 best_diag = b_diag;
801 *e2 = flast_d[d][b_diag];
802 *e1 = (*e2)+b_diag-ORIGIN;
803 } else {
804 max_row[d] = max_row[d - 1];
805 }
806 if (flower > fupper)
807 break;
808 d++;
809 if (!nlower) flower--;
810 if (!nupper) fupper++;
811 if (S==NULL)
812 flast_d[d] = flast_d[d - 2];
813 else {
814 /* space array consists of ThreeVal structures which are
815 3 times larger than Int4, so divide requested amount by 3
816 */
817 flast_d[d] = (Int4Ptr) get_mb_space(space, (fupper-flower+7)/3);
818 if (flast_d[d] != NULL)
819 flast_d[d] = flast_d[d] - flower + 2;
820 else
821 return return_val;
822 }
823 }
824
825 if (S!=NULL) { /*trace back*/
826 Int4 row1, col1, diag1, diag;
827 d = return_val; diag = best_diag;
828 row = *e2; col = *e1;
829 while (d > 0) {
830 diag1 = get_last(flast_d, d, diag, &row1);
831 col1 = row1+diag1-ORIGIN;
832 if (diag1 == diag) {
833 if (row-row1 > 0) edit_script_rep(S, row-row1);
834 } else if (diag1 < diag) {
835 if (row-row1 > 0) edit_script_rep(S, row-row1);
836 edit_script_ins(S,1);
837 } else {
838 if (row-row1-1> 0) edit_script_rep(S, row-row1-1);
839 edit_script_del(S, 1);
840 }
841 d--; diag = diag1; col = col1; row = row1;
842 }
843 edit_script_rep(S, flast_d[0][ORIGIN]);
844 if (!reverse)
845 edit_script_reverse_inplace(S);
846 }
847 return return_val;
848 }
849
850
MegaBlastAffineGreedyAlign(const UcharPtr s1,Int4 len1,const UcharPtr s2,Int4 len2,Boolean reverse,Int4 xdrop_threshold,Int4 match_score,Int4 mismatch_score,Int4 gap_open,Int4 gap_extend,Int4Ptr e1,Int4Ptr e2,GreedyAlignMemPtr abmp,edit_script_t * S,Uint1 rem)851 Int4 MegaBlastAffineGreedyAlign (const UcharPtr s1, Int4 len1,
852 const UcharPtr s2, Int4 len2,
853 Boolean reverse, Int4 xdrop_threshold,
854 Int4 match_score, Int4 mismatch_score,
855 Int4 gap_open, Int4 gap_extend,
856 Int4Ptr e1, Int4Ptr e2,
857 GreedyAlignMemPtr abmp, edit_script_t *S,
858 Uint1 rem)
859 {
860 Int4 col, /* column number */
861 d, /* current distance */
862 k, /* current diagonal */
863 flower, fupper, /* boundaries for searching diagonals */
864 row, /* row number */
865 MAX_D, /* maximum cost */
866 ORIGIN,
867 return_val = 0;
868 ThreeValPtr PNTR flast_d; /* rows containing the last d */
869 Int4Ptr max_row_free = abmp->max_row_free;
870 Int4Ptr max_row; /* reached for cost d=0, ... len1. */
871 Int4 Mis_cost, GO_cost, GE_cost;
872 Int4 D_diff, gd;
873 Int4 M_half;
874 Int4 max_cost;
875 Int4 *lower, *upper;
876
877 Int4 x, cur_max, b_diag = 0, best_diag = INT_MAX/2;
878 Char nlower = 0, nupper = 0;
879 MBSpacePtr space = abmp->space;
880 Int4 stop_condition;
881 Int4 max_d;
882 Int4Ptr uplow_free;
883 Int4 max_len = len2;
884
885 if (match_score % 2 == 1) {
886 match_score *= 2;
887 mismatch_score *= 2;
888 xdrop_threshold *= 2;
889 gap_open *= 2;
890 gap_extend *= 2;
891 }
892 M_half = match_score/2;
893
894 if (gap_open == 0 && gap_extend == 0) {
895 return MegaBlastGreedyAlign(s1, len1, s2, len2, reverse,
896 xdrop_threshold, match_score,
897 mismatch_score, e1, e2, abmp, S, rem);
898 }
899
900 Mis_cost = mismatch_score + match_score;
901 GO_cost = gap_open;
902 GE_cost = gap_extend+M_half;
903 gd = gdb3(&Mis_cost, &GO_cost, &GE_cost);
904 D_diff = (xdrop_threshold+M_half) / gd + 1;
905
906
907 MAX_D = (Int4) (len1/ERROR_FRACTION + 1);
908 max_d = MAX_D*GE_cost;
909 ORIGIN = MAX_D + 2;
910 max_cost = MAX(Mis_cost, GO_cost+GE_cost);
911 *e1 = *e2 = 0;
912
913 if (reverse) {
914 if (!(rem & 4)) {
915 for (row = 0; row < len2 && row < len1 &&
916 (s2[len2-1-row] ==
917 READDB_UNPACK_BASE_N(s1[(len1-1-row)/4],
918 3-(len1-1-row)%4));
919 row++)
920 /*empty*/ ;
921 } else {
922 for (row = 0; row < len2 && row < len1 && (s2[len2-1-row] == s1[len1-1-row]); row++)
923 /*empty*/ ;
924 }
925 } else {
926 if (!(rem & 4)) {
927 for (row = 0; row < len2 && row < len1 &&
928 (s2[row] ==
929 READDB_UNPACK_BASE_N(s1[(row+rem)/4],
930 3-(row+rem)%4));
931 row++)
932 /*empty*/ ;
933 } else {
934 for (row = 0; row < len2 && row < len1 && (s2[row] == s1[row]); row++)
935 /*empty*/ ;
936 }
937 }
938 *e1 = row;
939 *e2 = row;
940 if (row == len1 || row == len2) {
941 if (S != NULL)
942 edit_script_rep(S, row);
943 /* hit last row; stop search */
944 return row*match_score;
945 }
946 flast_d = abmp->flast_d_affine;
947 if (S==NULL) {
948 space = 0;
949 } else if (!space) {
950 abmp->space = space = new_mb_space();
951 } else {
952 refresh_mb_space(space);
953 }
954 max_row = max_row_free + D_diff;
955 for (k = 0; k < D_diff; k++)
956 max_row_free[k] = 0;
957 uplow_free = abmp->uplow_free;
958 lower = uplow_free;
959 upper = uplow_free+max_d+1+max_cost;
960 /* next 3 lines set boundary for -1,-2,...,-max_cost+1*/
961 for (k = 0; k < max_cost; k++) {lower[k] =LARGE; upper[k] = -LARGE;}
962 lower += max_cost;
963 upper += max_cost;
964
965 flast_d[0][ORIGIN].C = row;
966 flast_d[0][ORIGIN].I = flast_d[0][ORIGIN].D = -2;
967 max_row[0] = (row + row)*M_half;
968 lower[0] = upper[0] = ORIGIN;
969
970 flower = ORIGIN - 1;
971 fupper = ORIGIN + 1;
972
973 d = 1;
974 stop_condition = 1;
975 while (d <= max_d) {
976 Int4 fl0, fu0;
977 x = max_row[d - D_diff] + gd * d - xdrop_threshold;
978 x = (Int4)ceil(x/M_half);
979 if (x < 0) x=0;
980 cur_max = 0;
981 fl0 = flower;
982 fu0 = fupper;
983 for (k = fl0; k <= fu0; k++) {
984 row = -2;
985 if (k+1 <= upper[d-GO_cost-GE_cost] && k+1 >= lower[d-GO_cost-GE_cost])
986 row = flast_d[d-GO_cost-GE_cost][k+1].C;
987 if (k+1 <= upper[d-GE_cost] && k+1 >= lower[d-GE_cost] &&
988 row < flast_d[d-GE_cost][k+1].D)
989 row = flast_d[d-GE_cost][k+1].D;
990 row++;
991 if (row+row+k-ORIGIN >= x)
992 flast_d[d][k].D = row;
993 else flast_d[d][k].D = -2;
994 row = -1;
995 if (k-1 <= upper[d-GO_cost-GE_cost] && k-1 >= lower[d-GO_cost-GE_cost])
996 row = flast_d[d-GO_cost-GE_cost][k-1].C;
997 if (k-1 <= upper[d-GE_cost] && k-1 >= lower[d-GE_cost] &&
998 row < flast_d[d-GE_cost][k-1].I)
999 row = flast_d[d-GE_cost][k-1].I;
1000 if (row+row+k-ORIGIN >= x)
1001 flast_d[d][k].I = row;
1002 else flast_d[d][k].I = -2;
1003
1004 row = MAX(flast_d[d][k].I, flast_d[d][k].D);
1005 if (k <= upper[d-Mis_cost] && k >= lower[d-Mis_cost])
1006 row = MAX(flast_d[d-Mis_cost][k].C+1,row);
1007
1008 col = row + k - ORIGIN;
1009 if (row + col >= x)
1010 fupper = k;
1011 else {
1012 if (k == flower)
1013 flower++;
1014 else
1015 flast_d[d][k].C = -2;
1016 continue;
1017 }
1018 if (row > max_len || row < -2) {
1019 flower = k; nlower = k+1;
1020 } else {
1021 /* slide down the diagonal */
1022 if (reverse) {
1023 if (s2[len2 - row] != 0x0f) {
1024 if (!(rem & 4)) {
1025 while (row < len2 && col < len1 && s2[len2-1-row] ==
1026 READDB_UNPACK_BASE_N(s1[(len1-1-col)/4],
1027 3-(len1-1-col)%4)) {
1028 ++row;
1029 ++col;
1030 }
1031 } else {
1032 while (row < len2 && col < len1 && s2[len2-1-row] ==
1033 s1[len1-1-col]) {
1034 ++row;
1035 ++col;
1036 }
1037 }
1038 } else {
1039 max_len = row;
1040 flower = k; nlower = k+1;
1041 }
1042 } else if (s2[row-1] != 0x0f) {
1043 if (!(rem & 4)) {
1044 while (row < len2 && col < len1 && s2[row] ==
1045 READDB_UNPACK_BASE_N(s1[(col+rem)/4],
1046 3-(col+rem)%4)) {
1047 ++row;
1048 ++col;
1049 }
1050 } else {
1051 while (row < len2 && col < len1 && s2[row] == s1[col]) {
1052 ++row;
1053 ++col;
1054 }
1055 }
1056 } else {
1057 max_len = row;
1058 flower = k; nlower = k+1;
1059 }
1060 }
1061 flast_d[d][k].C = row;
1062 if (row + col > cur_max) {
1063 cur_max = row + col;
1064 b_diag = k;
1065 }
1066 if (row == len2) {
1067 flower = k; nlower = k+1;
1068 }
1069 if (col == len1) {
1070 fupper = k; nupper = k-1;
1071 }
1072 }
1073 k = cur_max*M_half - d * gd;
1074 if (max_row[d - 1] < k) {
1075 max_row[d] = k;
1076 return_val = d;
1077 best_diag = b_diag;
1078 *e2 = flast_d[d][b_diag].C;
1079 *e1 = (*e2)+b_diag-ORIGIN;
1080 } else {
1081 max_row[d] = max_row[d - 1];
1082 }
1083 if (flower <= fupper) {
1084 stop_condition++;
1085 lower[d] = flower; upper[d] = fupper;
1086 } else {
1087 lower[d] = LARGE; upper[d] = -LARGE;
1088 }
1089 if (lower[d-max_cost] <= upper[d-max_cost]) stop_condition--;
1090 if (stop_condition == 0) break;
1091 d++;
1092 flower = MIN(lower[d-Mis_cost], MIN(lower[d-GO_cost-GE_cost], lower[d-GE_cost])-1);
1093 if (nlower) flower = MAX(flower, nlower);
1094 fupper = MAX(upper[d-Mis_cost], MAX(upper[d-GO_cost-GE_cost], upper[d-GE_cost])+1);
1095 if (nupper) fupper = MIN(fupper, nupper);
1096 if (d > max_cost) {
1097 if (S==NULL) {
1098 /*if (d > max_cost)*/
1099 flast_d[d] = flast_d[d - max_cost-1];
1100 } else {
1101 flast_d[d] = get_mb_space(space, fupper-flower+1)-flower;
1102 if (flast_d[d] == NULL)
1103 return return_val;
1104 }
1105 }
1106 }
1107
1108 if (S!=NULL) { /*trace back*/
1109 Int4 row1, diag, state;
1110 d = return_val; diag = best_diag;
1111 row = *e2; state = sC;
1112 while (d > 0) {
1113 if (state == sC) {
1114 /* diag will not be changed*/
1115 state = get_lastC(flast_d, lower, upper, &d, diag, Mis_cost, &row1);
1116 if (row-row1 > 0) {
1117 edit_script_rep(S, row-row1);
1118 row = row1;
1119 }
1120 } else {
1121 if (state == sI) {
1122 /*row unchanged */
1123 state = get_lastI(flast_d, lower, upper, &d, diag, GO_cost, GE_cost);
1124 diag--;
1125 edit_script_ins(S,1);
1126 } else {
1127 edit_script_del(S,1);
1128 state = get_lastD(flast_d, lower, upper, &d, diag, GO_cost, GE_cost);
1129 diag++;
1130 row--;
1131 }
1132 }
1133 }
1134 edit_script_rep(S, flast_d[0][ORIGIN].C);
1135 if (!reverse)
1136 edit_script_reverse_inplace(S);
1137 }
1138 return_val = max_row[return_val];
1139 return return_val;
1140 }
1141
1142
1143
1144 /* ------------------------------------------------------------ */
1145