1 static char const rcsid[] = "$Id: mbalign.c,v 6.43 2006/02/02 17:57:34 papadopo Exp $";
2 
3 /* $Id: mbalign.c,v 6.43 2006/02/02 17:57:34 papadopo Exp $
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name:  $RCSfile: mbalign.c,v $
29 *
30 * Author:  Webb Miller and Co.
31 * Adopted for NCBI standard libraries by Sergey Shavirin
32 *
33 * Initial Creation Date: 10/27/1999
34 *
35 * $Revision: 6.43 $
36 *
37 * File Description:
38 *        Alignment functions for Mega Blast program
39 *
40 * $Log: mbalign.c,v $
41 * Revision 6.43  2006/02/02 17:57:34  papadopo
42 * fix a rare bug computing greedy traceback
43 *
44 * Revision 6.42  2005/05/03 20:29:17  papadopo
45 * remove use of ICEIL macro; use either a true integer ceiling or (quotient+1) as circumstances warrant
46 *
47 * Revision 6.41  2003/10/29 17:46:59  dondosha
48 * Allow 2-stage greedy extension in megablast
49 *
50 * Revision 6.40  2003/05/30 17:25:36  coulouri
51 * add rcsid
52 *
53 * Revision 6.39  2003/05/13 16:02:53  coulouri
54 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
55 *
56 * Revision 6.38  2003/01/30 19:40:13  dondosha
57 * No need to add gap extension penalty to gap open in greedy alignment functions
58 *
59 * Revision 6.37  2002/08/01 21:02:12  dondosha
60 * Added a sanity check in GreedyAlignMemFree
61 *
62 * Revision 6.36  2002/03/20 19:56:34  dondosha
63 * Check entire list of readdb structures for maximal length when allocating memory for greedy algorithm
64 *
65 * Revision 6.35  2001/12/28 20:38:41  dondosha
66 * Moved Mega BLAST related parameters into a separate structure
67 *
68 * Revision 6.34  2001/11/14 23:43:40  dondosha
69 * Subject sequence now already in blastna encoding, no need to convert
70 *
71 * Revision 6.33  2001/09/18 16:49:24  dondosha
72 * Removed unneeded functions, eliminated mbutils.h header
73 *
74 * Revision 6.32  2001/07/20 19:20:47  dondosha
75 * Allow packed or unpacked subject sequence
76 *
77 * Revision 6.31  2001/05/23 20:14:47  dondosha
78 * Corrected the row shift in non-affine greedy algorithm
79 *
80 * Revision 6.30  2001/05/07 22:05:24  dondosha
81 * Small correction in affine extension
82 *
83 * Revision 6.29  2001/05/03 21:48:28  dondosha
84 * Handle some cases when memory allocation fails
85 *
86 * Revision 6.28  2001/05/02 21:32:16  dondosha
87 * Return null from GreedyAlignMemFree
88 *
89 * Revision 6.27  2001/03/30 22:12:11  dondosha
90 * Improved protection from moving beyond sequence boundaries
91 *
92 * Revision 6.26  2001/02/12 19:47:49  dondosha
93 * Make space for the greedy algorithm a linked list, to allow its extention
94 *
95 * Revision 6.25  2001/01/25 18:50:53  dondosha
96 * Check for row going out of bounds in greedy extension routines
97 *
98 * Revision 6.24  2001/01/11 20:22:16  dondosha
99 * Allow for expansion of the space array for gapped extensions
100 *
101 * Revision 6.23  2000/12/07 17:45:14  dondosha
102 * Use actual subject sequence length in GreedyAlignMemAlloc for 2 Sequences engine
103 *
104 * Revision 6.22  2000/11/29 16:31:20  dondosha
105 * Database sequence length cannot be more than MAX_DBSEQ_LEN
106 *
107 * Revision 6.21  2000/11/03 20:58:35  dondosha
108 * Changed parameter one_line_results to no_traceback
109 *
110 * Revision 6.20  2000/10/13 16:57:16  dondosha
111 * Corrected behavior at the border between queries
112 *
113 * Revision 6.19  2000/10/02 14:48:49  dondosha
114 * Bug in previous change
115 *
116 * Revision 6.18  2000/09/29 22:16:48  dondosha
117 * Check if memory allocation failed in GreedyAlignMemAlloc
118 *
119 * Revision 6.17  2000/09/28 14:26:34  dondosha
120 * Moved all memory allocation away from the low level greedy algorithm
121 *
122 * Revision 6.16  2000/08/28 17:15:36  dondosha
123 * The sentinel value between queries changed to gap value
124 *
125 * Revision 6.15  2000/08/03 22:15:29  dondosha
126 * Use correct default gap extension parameter when allocating memory for affine case
127 *
128 * Revision 6.14  2000/05/24 19:51:28  dondosha
129 * Do not use readdb when megablast is used for two sequences
130 *
131 * Revision 6.13  2000/05/17 17:12:39  dondosha
132 * Removed unused variable
133 *
134 * Revision 6.12  2000/05/12 19:45:32  dondosha
135 * Subject sequence is now passed in ncbi4na encoding
136 *
137 * Revision 6.11  2000/05/01 21:23:40  dondosha
138 * Changed greedy_align_basic to MegaBlastGreedyAlign, added MegaBlastAffineGreedyAlign
139 *
140 * Revision 6.10  2000/04/19 18:53:31  dondosha
141 * Bug fix: allow to extend through gap and masked characters, but not end-points
142 *
143 * Revision 6.9  2000/04/07 18:52:32  dondosha
144 * Do not allocate new space for flast_d[0,1] every time
145 *
146 * Revision 6.8  2000/04/05 16:30:07  dondosha
147 * Fixed a bug that allowed crossing the end of a strand or query
148 *
149 * Revision 6.7  2000/03/30 20:38:33  dondosha
150 * Returned one modification from 6.5 that was lost in 6.6
151 *
152 * Revision 6.6  2000/03/29 22:03:14  dondosha
153 * Made edit_script_new and edit_script_append public for use in MegaBlastNtWordExtend
154 *
155 * Revision 6.5  2000/02/01 22:42:11  dondosha
156 * Moved some definitions to mbalign.h; added 3 new routines
157 *
158 * Revision 6.4  2000/01/26 16:39:18  beloslyu
159 * change space_t to space_type because HP-UX compiler got confused with ANSI C space_t
160 *
161 * Revision 6.3  1999/11/24 20:38:09  shavirin
162 * Added possibility to produce Traditional Blast Output.
163 *
164 * Revision 6.2  1999/11/03 19:53:57  shavirin
165 * Fixed problem with Realloc() function.
166 *
167 * Revision 6.1  1999/10/29 15:37:12  shavirin
168 * Initial revision.
169 *
170 *
171 * Initial revision.
172 *
173 * ==========================================================================
174 */
175 
176 #include <mbalign.h>
177 #include <blastdef.h>
178 
179 /* -------- From original file edit.c ------------- */
180 
edit_val_get(edit_op_t op)181 static Uint4 edit_val_get(edit_op_t op)
182 {
183     return op >> 2;
184 }
edit_opc_get(edit_op_t op)185 static Uint4 edit_opc_get(edit_op_t op)
186 {
187     return op & EDIT_OP_MASK;
188 }
189 
edit_op_cons(Uint4 op,Uint4 val)190 static edit_op_t edit_op_cons(Uint4 op, Uint4 val)
191 {
192     return (val << 2) | (op & EDIT_OP_MASK);
193 }
194 
edit_op_inc(edit_op_t op,Uint4 n)195 static edit_op_t edit_op_inc(edit_op_t op, Uint4 n)
196 {
197     return edit_op_cons(edit_opc_get(op), edit_val_get(op) + n);
198 }
199 
edit_op_inc_last(edit_script_t * es,Uint4 n)200 static edit_op_t edit_op_inc_last(edit_script_t *es, Uint4 n)
201 {
202     edit_op_t *last;
203     ASSERT (es->num > 0);
204     last = &(es->op[es->num-1]);
205     *last = edit_op_inc(*last, n);
206     return *last;
207 }
208 
edit_script_ready(edit_script_t * es,Uint4 n)209 static Int4 edit_script_ready(edit_script_t *es, Uint4 n)
210 {
211     edit_op_t *p;
212     Uint4 m = n + n/2;
213 
214     if (es->size <= n) {
215         p = Realloc(es->op, m*sizeof(edit_op_t));
216         if (p == 0) {
217             return 0;
218         } else {
219             es->op = p;
220             es->size = m;
221         }
222     }
223     return 1;
224 }
225 
edit_script_readyplus(edit_script_t * es,Uint4 n)226 static Int4 edit_script_readyplus(edit_script_t *es, Uint4 n)
227 {
228     if (es->size - es->num <= n)
229         return edit_script_ready(es, n + es->num);
230     return 1;
231 }
232 
edit_script_put(edit_script_t * es,Uint4 op,Uint4 n)233 static Int4 edit_script_put(edit_script_t *es, Uint4 op, Uint4 n)
234 {
235     if (!edit_script_readyplus(es, 2))
236         return 0;
237     es->last = op;
238     ASSERT(op != 0);
239     es->op[es->num] = edit_op_cons(op, n);
240     es->num += 1;
241     es->op[es->num] = 0; /* sentinal */
242 
243     return 1;
244 }
245 
edit_script_init(edit_script_t * es)246 static edit_script_t *edit_script_init(edit_script_t *es)
247 {
248 	es->op = 0;
249 	es->size = es->num = 0;
250 	es->last = 0;
251 	edit_script_ready(es, 8);
252 	return es;
253 }
edit_script_first(edit_script_t * es)254 static edit_op_t *edit_script_first(edit_script_t *es)
255 {
256     return es->num > 0 ? &es->op[0] : 0;
257 }
258 
edit_script_next(edit_script_t * es,edit_op_t * op)259 static edit_op_t *edit_script_next(edit_script_t *es, edit_op_t *op)
260 {
261     /* XXX - assumes flat address space */
262     if (&es->op[0] <= op && op < &es->op[es->num-1])
263         return op+1;
264     else
265         return 0;
266 }
edit_script_more(edit_script_t * data,Uint4 op,Uint4 k)267 static Int4 edit_script_more(edit_script_t *data, Uint4 op, Uint4 k)
268 {
269     if (op == EDIT_OP_ERR) {
270         ErrPostEx(SEV_FATAL, 1, 0,
271                   "edit_script_more: bad opcode %d:%d", op, k);
272         return -1;
273     }
274 
275     if (edit_opc_get(data->last) == op)
276         edit_op_inc_last(data, k);
277     else
278         edit_script_put(data, op, k);
279 
280     return 0;
281 }
282 /* External */
edit_script_append(edit_script_t * es,edit_script_t * et)283 edit_script_t *edit_script_append(edit_script_t *es, edit_script_t *et)
284 {
285     edit_op_t *op;
286 
287     for (op = edit_script_first(et); op; op = edit_script_next(et, op))
288         edit_script_more(es, edit_opc_get(*op), edit_val_get(*op));
289 
290     return es;
291 }
292 
293 /* External */
edit_script_new(void)294 edit_script_t *edit_script_new(void)
295 {
296     edit_script_t *es = MemNew(sizeof(*es));
297     if (!es)
298         return 0;
299 
300     return edit_script_init(es);
301 }
302 
303 /* External */
edit_script_free(edit_script_t * es)304 edit_script_t *edit_script_free(edit_script_t *es)
305 {
306     if (es) {
307         if (es->op)
308             MemFree(es->op);
309         MemSet(es, 0, sizeof(*es));
310         MemFree(es);
311     }
312     return 0;
313 }
314 
edit_script_del(edit_script_t * data,Uint4 k)315 Int4 edit_script_del(edit_script_t *data, Uint4 k)
316 {
317     return edit_script_more(data, EDIT_OP_DEL, k);
318 }
319 
edit_script_ins(edit_script_t * data,Uint4 k)320 Int4 edit_script_ins(edit_script_t *data, Uint4 k)
321 {
322     return edit_script_more(data, EDIT_OP_INS, k);
323 }
edit_script_rep(edit_script_t * data,Uint4 k)324 Int4 edit_script_rep(edit_script_t *data, Uint4 k)
325 {
326     return edit_script_more(data, EDIT_OP_REP, k);
327 }
328 
edit_script_reverse_inplace(edit_script_t * es)329 edit_script_t *edit_script_reverse_inplace(edit_script_t *es)
330 {
331     Uint4 i;
332     const Uint4 num = es->num;
333     const Uint4 mid = num/2;
334     const Uint4 end = num-1;
335 
336     for (i = 0; i < mid; ++i) {
337         const edit_op_t t = es->op[i];
338         es->op[i] = es->op[end-i];
339         es->op[end-i] = t;
340     }
341     return es;
342 }
343 
new_mb_space()344 MBSpacePtr new_mb_space()
345 {
346     MBSpacePtr p;
347     Int4 amount;
348 
349     p = Nlm_Malloc(sizeof(MBSpace));
350     amount = MAX_SPACE;
351     p->space_array = Nlm_Malloc(sizeof(ThreeVal)*amount);
352     if (p->space_array == NULL) {
353        p = MemFree(p);
354        return NULL;
355     }
356     p->used = 0;
357     p->size = amount;
358     p->next = NULL;
359 
360     return p;
361 }
362 
refresh_mb_space(MBSpacePtr sp)363 void refresh_mb_space(MBSpacePtr sp)
364 {
365    while (sp) {
366       sp->used = 0;
367       sp = sp->next;
368    }
369 }
370 
free_mb_space(MBSpacePtr sp)371 void free_mb_space(MBSpacePtr sp)
372 {
373    MBSpacePtr next_sp;
374 
375    while (sp) {
376       next_sp = sp->next;
377       sp->space_array = MemFree(sp->space_array);
378       sp = MemFree(sp);
379       sp = next_sp;
380    }
381 }
382 
get_mb_space(MBSpacePtr S,Int4 amount)383 ThreeValPtr get_mb_space(MBSpacePtr S, Int4 amount)
384 {
385     ThreeValPtr s;
386     if (amount < 0)
387         return NULL;
388 
389     while (S->used+amount > S->size) {
390        if (S->next == NULL)
391           if ((S->next = new_mb_space()) == NULL) {
392 	     ErrPostEx(SEV_WARNING, 0, 0, "Cannot get new space for greedy extension");
393 	     return NULL;
394           }
395        S = S->next;
396     }
397 
398     s = S->space_array+S->used;
399     S->used += amount;
400 
401     return s;
402 }
403 /* ----- */
404 
gcd(Int4 a,Int4 b)405 static Int4 gcd(Int4 a, Int4 b)
406 {
407     Int4 c;
408     if (a < b) {
409         c = a;
410         a = b; b = c;
411     }
412     while ((c = a % b) != 0) {
413         a = b; b = c;
414     }
415 
416     return b;
417 }
418 
419 
gdb3(Int4Ptr a,Int4Ptr b,Int4Ptr c)420 static Int4 gdb3(Int4Ptr a, Int4Ptr b, Int4Ptr c)
421 {
422     Int4 g;
423     if (*b == 0) g = gcd(*a, *c);
424     else g = gcd(*a, gcd(*b, *c));
425     if (g > 1) {
426         *a /= g;
427         *b /= g;
428         *c /= g;
429     }
430     return g;
431 }
432 
get_lastC(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 Mis_cost,Int4Ptr row1)433 static Int4 get_lastC(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
434                       Int4Ptr d, Int4 diag, Int4 Mis_cost, Int4Ptr row1)
435 {
436     Int4 row;
437 
438     if (diag >= lower[(*d)-Mis_cost] && diag <= upper[(*d)-Mis_cost]) {
439         row = flast_d[(*d)-Mis_cost][diag].C;
440         if (row >= MAX(flast_d[*d][diag].I, flast_d[*d][diag].D)) {
441             *d = *d-Mis_cost;
442             *row1 = row;
443             return sC;
444         }
445     }
446     if (flast_d[*d][diag].I > flast_d[*d][diag].D) {
447         *row1 = flast_d[*d][diag].I;
448         return sI;
449     } else {
450         *row1 = flast_d[*d][diag].D;
451         return sD;
452     }
453 }
454 
get_last_ID(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 GO_cost,Int4 GE_cost,Int4 IorD)455 static Int4 get_last_ID(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
456                         Int4Ptr d, Int4 diag, Int4 GO_cost,
457                         Int4 GE_cost, Int4 IorD)
458 {
459     Int4 ndiag;
460     Int4 row;
461 
462     if (IorD == sI) { ndiag = diag -1;} else ndiag = diag+1;
463     if (ndiag >= lower[(*d)-GE_cost] && ndiag <=upper[(*d)-GE_cost])
464         row = (IorD == sI)? flast_d[(*d)-GE_cost][ndiag].I: flast_d[(*d)-GE_cost][ndiag].D;
465     else row = -100;
466     if (ndiag >= lower[(*d)-GO_cost-GE_cost] && ndiag <= upper[(*d)-GO_cost-GE_cost] && row < flast_d[(*d)-GO_cost-GE_cost][ndiag].C) {
467         *d = (*d)-GO_cost-GE_cost;
468         return sC;
469     }
470     *d = (*d)-GE_cost;
471     return IorD;
472 }
473 
get_lastI(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 GO_cost,Int4 GE_cost)474 static Int4 get_lastI(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
475                       Int4Ptr d, Int4 diag, Int4 GO_cost, Int4 GE_cost)
476 {
477     return get_last_ID(flast_d, lower, upper, d, diag, GO_cost, GE_cost, sI);
478 }
479 
480 
get_lastD(ThreeValPtr PNTR flast_d,Int4Ptr lower,Int4Ptr upper,Int4Ptr d,Int4 diag,Int4 GO_cost,Int4 GE_cost)481 static int get_lastD(ThreeValPtr PNTR flast_d, Int4Ptr lower, Int4Ptr upper,
482                      Int4Ptr d, Int4 diag, Int4 GO_cost, Int4 GE_cost)
483 {
484     return get_last_ID(flast_d, lower, upper, d, diag, GO_cost, GE_cost, sD);
485 }
486 
487 /* --- From file align.c --- */
488 /* ----- */
489 
get_last(Int4 ** flast_d,Int4 d,Int4 diag,Int4 * row1)490 static Int4 get_last(Int4 **flast_d, Int4 d, Int4 diag, Int4 *row1)
491 {
492     if (flast_d[d-1][diag-1] > MAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
493         *row1 = flast_d[d-1][diag-1];
494         return diag-1;
495     }
496     if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
497         *row1 = flast_d[d-1][diag];
498         return diag;
499     }
500     *row1 = flast_d[d-1][diag+1];
501     return diag+1;
502 }
503 
504 
505 /* ------ Functions, that create SeqAlignPtr from gap_align_ptr */
GreedyAlignMemAlloc(BlastSearchBlkPtr search)506 BlastSearchBlkPtr GreedyAlignMemAlloc(BlastSearchBlkPtr search)
507 {
508    Int4 max_d, max_d_1, Xdrop, d_diff, max_len, max_cost, gd, i;
509    Int4 reward, penalty, gap_open, gap_extend;
510    Int4 Mis_cost, GE_cost;
511 
512    if (search == NULL)
513       return search;
514 
515    if (search->sbp->reward % 2 == 1) {
516       reward = 2*search->sbp->reward;
517       penalty = -2*search->sbp->penalty;
518       Xdrop = 2*search->pbp->gap_x_dropoff;
519       gap_open = 2*search->pbp->gap_open;
520       gap_extend = 2*search->pbp->gap_extend;
521    } else {
522       reward = search->sbp->reward;
523       penalty = -search->sbp->penalty;
524       Xdrop = search->pbp->gap_x_dropoff;
525       gap_open = search->pbp->gap_open;
526       gap_extend = search->pbp->gap_extend;
527    }
528 
529    if (gap_open == 0 && gap_extend == 0)
530       gap_extend = reward / 2 + penalty;
531 
532    if (search->rdfp) {
533       ReadDBFILEPtr rdfp;
534       for (rdfp = search->rdfp, max_len = 0; rdfp; rdfp = rdfp->next)
535          max_len = MAX(max_len, readdb_get_maxlen(rdfp));
536    } else
537       max_len = search->subject->length;
538 
539    max_len = MIN(max_len, MAX_DBSEQ_LEN);
540 
541    max_d = (Int4) (max_len / ERROR_FRACTION + 1);
542 
543    search->abmp = (GreedyAlignMemPtr) MemNew(sizeof(GreedyAlignMem));
544 
545    if (search->pbp->gap_open==0 && search->pbp->gap_extend==0) {
546       d_diff = (Xdrop+reward/2) / (penalty+reward) + 1;
547 
548       search->abmp->flast_d = (Int4Ptr PNTR) Malloc((max_d + 2) * sizeof(Int4Ptr));
549       if (search->abmp->flast_d == NULL) {
550          search->abmp = MemFree(search->abmp);
551          return search;
552       }
553       search->abmp->flast_d[0] = Malloc((max_d + max_d + 6) * sizeof(Int4) * 2);
554       if (search->abmp->flast_d[0] == NULL) {
555 	 ErrPostEx(SEV_WARNING, 0, 0, "Failed to allocate %ld bytes for abmp", (max_d + max_d + 6) * sizeof(Int4) * 2);
556          search->abmp = GreedyAlignMemFree(search->abmp);
557          return search;
558       }
559 
560       search->abmp->flast_d[1] = search->abmp->flast_d[0] + max_d + max_d + 6;
561       search->abmp->flast_d_affine = NULL;
562       search->abmp->uplow_free = NULL;
563    } else {
564       search->abmp->flast_d = NULL;
565       Mis_cost = reward + penalty;
566       GE_cost = gap_extend+reward/2;
567       max_d_1 = max_d;
568       max_d *= GE_cost;
569       max_cost = MAX(Mis_cost, gap_open+GE_cost);
570       gd = gdb3(&Mis_cost, &gap_open, &GE_cost);
571       d_diff = (Xdrop+reward/2) / gd + 1;
572       search->abmp->uplow_free = MemNew(sizeof(Int4)*2*(max_d+1+max_cost));
573       search->abmp->flast_d_affine = (ThreeValPtr PNTR)
574 	 Malloc((MAX(max_d, max_cost) + 2) * sizeof(ThreeValPtr));
575       if (!search->abmp->uplow_free || !search->abmp->flast_d_affine) {
576          search->abmp = GreedyAlignMemFree(search->abmp);
577          return search;
578       }
579       search->abmp->flast_d_affine[0] =
580 	 MemNew((2*max_d_1 + 6) * sizeof(ThreeVal) * (max_cost+1));
581       for (i = 1; i <= max_cost; i++)
582 	 search->abmp->flast_d_affine[i] =
583 	    search->abmp->flast_d_affine[i-1] + 2*max_d_1 + 6;
584       if (!search->abmp->flast_d_affine || !search->abmp->flast_d_affine[0])
585          search->abmp = GreedyAlignMemFree(search->abmp);
586    }
587    search->abmp->max_row_free = Malloc(sizeof(Int4) * (max_d + 1 + d_diff));
588    if (!search->pbp->mb_params->no_traceback)
589      search->abmp->space = new_mb_space();
590    if (!search->abmp->max_row_free ||
591        (!search->pbp->mb_params->no_traceback && !search->abmp->space))
592       /* Failure in one of the memory allocations */
593       search->abmp = GreedyAlignMemFree(search->abmp);
594 
595    return search;
596 }
597 
GreedyAlignMemFree(GreedyAlignMemPtr abmp)598 GreedyAlignMemPtr GreedyAlignMemFree(GreedyAlignMemPtr abmp)
599 {
600    if (abmp->flast_d) {
601       MemFree(abmp->flast_d[0]);
602       MemFree(abmp->flast_d);
603    } else {
604       if (abmp->flast_d_affine) {
605          MemFree(abmp->flast_d_affine[0]);
606          MemFree(abmp->flast_d_affine);
607       }
608       MemFree(abmp->uplow_free);
609    }
610    MemFree(abmp->max_row_free);
611    if (abmp->space)
612       free_mb_space(abmp->space);
613    abmp = MemFree(abmp);
614    return abmp;
615 }
616 
617 /*
618 	Version to search a (possibly) packed nucl. sequence against
619 an unpacked sequence.  s2 is the packed nucl. sequence.
620 s1 can be packed or unpacked. If rem == 4, then s1 is unpacked.
621 len2 corresponds to the unpacked (true) length.
622 
623  * Basic O(ND) time, O(N) space, alignment function.
624  * Parameters:
625  *   s1, len1        - first sequence and its length
626  *   s2, len2        - second sequence and its length
627  *   reverse         - direction of alignment
628  *   xdrop_threshold -
629  *   mismatch_cost   -
630  *   e1, e2          - endpoint of the computed alignment
631  *   edit_script     -
632  *   rem             - offset shift within a packed sequence
633  */
MegaBlastGreedyAlign(const UcharPtr s1,Int4 len1,const UcharPtr s2,Int4 len2,Boolean reverse,Int4 xdrop_threshold,Int4 match_cost,Int4 mismatch_cost,Int4Ptr e1,Int4Ptr e2,GreedyAlignMemPtr abmp,edit_script_t * S,Uint1 rem)634 Int4 MegaBlastGreedyAlign(const UcharPtr s1, Int4 len1,
635 			  const UcharPtr s2, Int4 len2,
636 			  Boolean reverse, Int4 xdrop_threshold,
637 			  Int4 match_cost, Int4 mismatch_cost,
638 			  Int4Ptr e1, Int4Ptr e2,
639 			  GreedyAlignMemPtr abmp, edit_script_t *S,
640                           Uint1 rem)
641 {
642     Int4 col,			/* column number */
643         d,				/* current distance */
644         k,				/* current diagonal */
645         flower, fupper,            /* boundaries for searching diagonals */
646         row,		        /* row number */
647         MAX_D, 			/* maximum cost */
648         ORIGIN,
649         return_val = 0;
650     Int4Ptr PNTR flast_d = abmp->flast_d; /* rows containing the last d */
651     Int4Ptr max_row;		/* reached for cost d=0, ... len1.  */
652 
653     Int4 X_pen = xdrop_threshold;
654     Int4 M_half = match_cost/2;
655     Int4 Op_cost = mismatch_cost + M_half*2;
656     Int4 D_diff = (X_pen+M_half) / Op_cost + 1;
657 
658     Int4 x, cur_max, b_diag = 0, best_diag = INT_MAX/2;
659     Int4Ptr max_row_free = abmp->max_row_free;
660     Char nlower = 0, nupper = 0;
661     MBSpacePtr space = abmp->space;
662     Int4 max_len = len2;
663 
664     MAX_D = (Int4) (len1/ERROR_FRACTION + 1);
665     ORIGIN = MAX_D + 2;
666     *e1 = *e2 = 0;
667 
668     if (reverse) {
669        if (!(rem & 4)) {
670           for (row = 0; row < len2 && row < len1 &&
671                   (s2[len2-1-row] ==
672                    READDB_UNPACK_BASE_N(s1[(len1-1-row)/4],
673                                         3-(len1-1-row)%4));
674                row++)
675              /*empty*/ ;
676        } else {
677           for (row = 0; row < len2 && row < len1 && (s2[len2-1-row] == s1[len1-1-row]); row++)
678              /*empty*/ ;
679        }
680     } else {
681        if (!(rem & 4)) {
682           for (row = 0; row < len2 && row < len1 &&
683                   (s2[row] ==
684                    READDB_UNPACK_BASE_N(s1[(row+rem)/4],
685                                         3-(row+rem)%4));
686                row++)
687              /*empty*/ ;
688        } else {
689           for (row = 0; row < len2 && row < len1 && (s2[row] == s1[row]); row++)
690              /*empty*/ ;
691        }
692     }
693     *e1 = row;
694     *e2 = row;
695     if (row == len1) {
696         if (S != NULL)
697             edit_script_rep(S, row);
698 	/* hit last row; stop search */
699 	return 0;
700     }
701     if (S==NULL)
702        space = 0;
703     else if (!space)
704        abmp->space = space = new_mb_space();
705     else
706        refresh_mb_space(space);
707 
708     max_row = max_row_free + D_diff;
709     for (k = 0; k < D_diff; k++)
710 	max_row_free[k] = 0;
711 
712     flast_d[0][ORIGIN] = row;
713     max_row[0] = (row + row)*M_half;
714 
715     flower = ORIGIN - 1;
716     fupper = ORIGIN + 1;
717 
718     d = 1;
719     while (d <= MAX_D) {
720 	Int4 fl0, fu0;
721 	flast_d[d - 1][flower - 1] = flast_d[d - 1][flower] = -1;
722 	flast_d[d - 1][fupper] = flast_d[d - 1][fupper + 1] = -1;
723 	x = max_row[d - D_diff] + Op_cost * d - X_pen;
724 	x = (Int4)ceil(x/M_half);
725 	cur_max = 0;
726 	fl0 = flower;
727 	fu0 = fupper;
728 	for (k = fl0; k <= fu0; k++) {
729 	    row = MAX(flast_d[d - 1][k + 1], flast_d[d - 1][k]) + 1;
730 	    row = MAX(row, flast_d[d - 1][k - 1]);
731 	    col = row + k - ORIGIN;
732 	    if (row + col >= x)
733 		fupper = k;
734 	    else {
735 		if (k == flower)
736 		    flower++;
737 		else
738 		    flast_d[d][k] = -1;
739 		continue;
740 	    }
741 
742             if (row > max_len || row < 0) {
743                   flower = k+1; nlower = 1;
744             } else {
745                /* Slide down the diagonal. Don't do this if reached
746                   the end point, which has value 0x0f */
747                if (reverse) {
748                   if (s2[len2-row] != 0x0f) {
749                      if (!(rem & 4))
750                      while (row < len2 && col < len1 && s2[len2-1-row] ==
751                             READDB_UNPACK_BASE_N(s1[(len1-1-col)/4],
752                                                  3-(len1-1-col)%4)) {
753                         ++row;
754                         ++col;
755                      }
756                      else
757                       while (row < len2 && col < len1 && s2[len2-1-row] == s1[len1-1-col]) {
758                         ++row;
759                         ++col;
760                      }
761                   } else {
762                      max_len = row;
763                      flower = k+1; nlower = 1;
764                   }
765                } else if (s2[row-1] != 0x0f) {
766                   if (!(rem & 4)) {
767                      while (row < len2 && col < len1 && s2[row] ==
768                             READDB_UNPACK_BASE_N(s1[(col+rem)/4],
769                                                  3-(col+rem)%4)) {
770                         ++row;
771                         ++col;
772                      }
773                   } else {
774                      while (row < len2 && col < len1 && s2[row] == s1[col]) {
775                         ++row;
776                         ++col;
777                      }
778                   }
779                } else {
780                   max_len = row;
781                   flower = k+1; nlower = 1;
782                }
783             }
784 	    flast_d[d][k] = row;
785 	    if (row + col > cur_max) {
786 		cur_max = row + col;
787 		b_diag = k;
788 	    }
789 	    if (row == len2) {
790 		flower = k+1; nlower = 1;
791 	    }
792 	    if (col == len1) {
793 		fupper = k-1; nupper = 1;
794 	    }
795 	}
796 	k = cur_max*M_half - d * Op_cost;
797 	if (max_row[d - 1] < k) {
798 	    max_row[d] = k;
799 	    return_val = d;
800 	    best_diag = b_diag;
801 	    *e2 = flast_d[d][b_diag];
802 	    *e1 = (*e2)+b_diag-ORIGIN;
803 	} else {
804 	    max_row[d] = max_row[d - 1];
805 	}
806 	if (flower > fupper)
807 	    break;
808 	d++;
809 	if (!nlower) flower--;
810 	if (!nupper) fupper++;
811 	if (S==NULL)
812 	   flast_d[d] = flast_d[d - 2];
813 	else {
814            /* space array consists of ThreeVal structures which are
815               3 times larger than Int4, so divide requested amount by 3
816            */
817 	   flast_d[d] = (Int4Ptr) get_mb_space(space, (fupper-flower+7)/3);
818 	   if (flast_d[d] != NULL)
819               flast_d[d] = flast_d[d] - flower + 2;
820            else
821 	      return return_val;
822         }
823     }
824 
825     if (S!=NULL) { /*trace back*/
826         Int4 row1, col1, diag1, diag;
827         d = return_val; diag = best_diag;
828         row = *e2; col = *e1;
829         while (d > 0) {
830             diag1 = get_last(flast_d, d, diag, &row1);
831             col1 = row1+diag1-ORIGIN;
832             if (diag1 == diag) {
833                 if (row-row1 > 0) edit_script_rep(S, row-row1);
834             } else if (diag1 < diag) {
835                 if (row-row1 > 0) edit_script_rep(S, row-row1);
836                 edit_script_ins(S,1);
837             } else {
838                 if (row-row1-1> 0) edit_script_rep(S, row-row1-1);
839                 edit_script_del(S, 1);
840             }
841             d--; diag = diag1; col = col1; row = row1;
842         }
843         edit_script_rep(S, flast_d[0][ORIGIN]);
844         if (!reverse)
845             edit_script_reverse_inplace(S);
846     }
847     return return_val;
848 }
849 
850 
MegaBlastAffineGreedyAlign(const UcharPtr s1,Int4 len1,const UcharPtr s2,Int4 len2,Boolean reverse,Int4 xdrop_threshold,Int4 match_score,Int4 mismatch_score,Int4 gap_open,Int4 gap_extend,Int4Ptr e1,Int4Ptr e2,GreedyAlignMemPtr abmp,edit_script_t * S,Uint1 rem)851 Int4 MegaBlastAffineGreedyAlign (const UcharPtr s1, Int4 len1,
852 				 const UcharPtr s2, Int4 len2,
853 				 Boolean reverse, Int4 xdrop_threshold,
854 				 Int4 match_score, Int4 mismatch_score,
855 				 Int4 gap_open, Int4 gap_extend,
856 				 Int4Ptr e1, Int4Ptr e2,
857 				 GreedyAlignMemPtr abmp, edit_script_t *S,
858                                  Uint1 rem)
859 {
860     Int4 col,			/* column number */
861         d,			/* current distance */
862         k,			/* current diagonal */
863         flower, fupper,         /* boundaries for searching diagonals */
864         row,		        /* row number */
865         MAX_D, 			/* maximum cost */
866         ORIGIN,
867         return_val = 0;
868     ThreeValPtr PNTR flast_d;	/* rows containing the last d */
869     Int4Ptr max_row_free = abmp->max_row_free;
870     Int4Ptr max_row;		/* reached for cost d=0, ... len1.  */
871     Int4 Mis_cost, GO_cost, GE_cost;
872     Int4 D_diff, gd;
873     Int4 M_half;
874     Int4 max_cost;
875     Int4 *lower, *upper;
876 
877     Int4 x, cur_max, b_diag = 0, best_diag = INT_MAX/2;
878     Char nlower = 0, nupper = 0;
879     MBSpacePtr space = abmp->space;
880     Int4 stop_condition;
881     Int4 max_d;
882     Int4Ptr uplow_free;
883     Int4 max_len = len2;
884 
885     if (match_score % 2 == 1) {
886         match_score *= 2;
887         mismatch_score *= 2;
888         xdrop_threshold *= 2;
889         gap_open *= 2;
890 	gap_extend *= 2;
891     }
892     M_half = match_score/2;
893 
894     if (gap_open == 0 && gap_extend == 0) {
895        return MegaBlastGreedyAlign(s1, len1, s2, len2, reverse,
896                                    xdrop_threshold, match_score,
897                                    mismatch_score, e1, e2, abmp, S, rem);
898     }
899 
900     Mis_cost = mismatch_score + match_score;
901     GO_cost = gap_open;
902     GE_cost = gap_extend+M_half;
903     gd = gdb3(&Mis_cost, &GO_cost, &GE_cost);
904     D_diff = (xdrop_threshold+M_half) / gd + 1;
905 
906 
907     MAX_D = (Int4) (len1/ERROR_FRACTION + 1);
908     max_d = MAX_D*GE_cost;
909     ORIGIN = MAX_D + 2;
910     max_cost = MAX(Mis_cost, GO_cost+GE_cost);
911     *e1 = *e2 = 0;
912 
913     if (reverse) {
914        if (!(rem & 4)) {
915           for (row = 0; row < len2 && row < len1 &&
916                   (s2[len2-1-row] ==
917                    READDB_UNPACK_BASE_N(s1[(len1-1-row)/4],
918                                         3-(len1-1-row)%4));
919                row++)
920              /*empty*/ ;
921        } else {
922           for (row = 0; row < len2 && row < len1 && (s2[len2-1-row] == s1[len1-1-row]); row++)
923              /*empty*/ ;
924        }
925     } else {
926        if (!(rem & 4)) {
927           for (row = 0; row < len2 && row < len1 &&
928                   (s2[row] ==
929                    READDB_UNPACK_BASE_N(s1[(row+rem)/4],
930                                         3-(row+rem)%4));
931                row++)
932              /*empty*/ ;
933        } else {
934           for (row = 0; row < len2 && row < len1 && (s2[row] == s1[row]); row++)
935              /*empty*/ ;
936        }
937     }
938     *e1 = row;
939     *e2 = row;
940     if (row == len1 || row == len2) {
941         if (S != NULL)
942             edit_script_rep(S, row);
943 	/* hit last row; stop search */
944 	return row*match_score;
945     }
946     flast_d = abmp->flast_d_affine;
947     if (S==NULL) {
948         space = 0;
949     } else if (!space) {
950        abmp->space = space = new_mb_space();
951     } else {
952         refresh_mb_space(space);
953     }
954     max_row = max_row_free + D_diff;
955     for (k = 0; k < D_diff; k++)
956 	max_row_free[k] = 0;
957     uplow_free = abmp->uplow_free;
958     lower = uplow_free;
959     upper = uplow_free+max_d+1+max_cost;
960     /* next 3 lines set boundary for -1,-2,...,-max_cost+1*/
961     for (k = 0; k < max_cost; k++) {lower[k] =LARGE;  upper[k] = -LARGE;}
962     lower += max_cost;
963     upper += max_cost;
964 
965     flast_d[0][ORIGIN].C = row;
966     flast_d[0][ORIGIN].I = flast_d[0][ORIGIN].D = -2;
967     max_row[0] = (row + row)*M_half;
968     lower[0] = upper[0] = ORIGIN;
969 
970     flower = ORIGIN - 1;
971     fupper = ORIGIN + 1;
972 
973     d = 1;
974     stop_condition = 1;
975     while (d <= max_d) {
976 	Int4 fl0, fu0;
977 	x = max_row[d - D_diff] + gd * d - xdrop_threshold;
978 	x = (Int4)ceil(x/M_half);
979 	if (x < 0) x=0;
980 	cur_max = 0;
981 	fl0 = flower;
982 	fu0 = fupper;
983 	for (k = fl0; k <= fu0; k++) {
984 	    row = -2;
985 	    if (k+1 <= upper[d-GO_cost-GE_cost] && k+1 >= lower[d-GO_cost-GE_cost])
986                 row = flast_d[d-GO_cost-GE_cost][k+1].C;
987 	    if (k+1  <= upper[d-GE_cost] && k+1 >= lower[d-GE_cost] &&
988 		row < flast_d[d-GE_cost][k+1].D)
989                 row = flast_d[d-GE_cost][k+1].D;
990 	    row++;
991 	    if (row+row+k-ORIGIN >= x)
992 	      flast_d[d][k].D = row;
993 	    else flast_d[d][k].D = -2;
994 	    row = -1;
995 	    if (k-1 <= upper[d-GO_cost-GE_cost] && k-1 >= lower[d-GO_cost-GE_cost])
996                 row = flast_d[d-GO_cost-GE_cost][k-1].C;
997 	    if (k-1  <= upper[d-GE_cost] && k-1 >= lower[d-GE_cost] &&
998 		row < flast_d[d-GE_cost][k-1].I)
999                 row = flast_d[d-GE_cost][k-1].I;
1000 	    if (row+row+k-ORIGIN >= x)
1001                 flast_d[d][k].I = row;
1002 	    else flast_d[d][k].I = -2;
1003 
1004 	    row = MAX(flast_d[d][k].I, flast_d[d][k].D);
1005 	    if (k <= upper[d-Mis_cost] && k >= lower[d-Mis_cost])
1006                 row = MAX(flast_d[d-Mis_cost][k].C+1,row);
1007 
1008 	    col = row + k - ORIGIN;
1009 	    if (row + col >= x)
1010 		fupper = k;
1011 	    else {
1012 		if (k == flower)
1013 		    flower++;
1014 		else
1015 		    flast_d[d][k].C = -2;
1016 		continue;
1017 	    }
1018             if (row > max_len || row < -2) {
1019                flower = k; nlower = k+1;
1020             } else {
1021                /* slide down the diagonal */
1022                if (reverse) {
1023                   if (s2[len2 - row] != 0x0f) {
1024                      if (!(rem & 4)) {
1025                         while (row < len2 && col < len1 && s2[len2-1-row] ==
1026                                READDB_UNPACK_BASE_N(s1[(len1-1-col)/4],
1027                                                     3-(len1-1-col)%4)) {
1028                            ++row;
1029                            ++col;
1030                         }
1031                      } else {
1032                         while (row < len2 && col < len1 && s2[len2-1-row] ==
1033                                s1[len1-1-col]) {
1034                            ++row;
1035                            ++col;
1036                         }
1037                      }
1038                   } else {
1039                      max_len = row;
1040                      flower = k; nlower = k+1;
1041                   }
1042                } else if (s2[row-1] != 0x0f) {
1043                   if (!(rem & 4)) {
1044                      while (row < len2 && col < len1 && s2[row] ==
1045                             READDB_UNPACK_BASE_N(s1[(col+rem)/4],
1046                                                  3-(col+rem)%4)) {
1047                         ++row;
1048                         ++col;
1049                      }
1050                   } else {
1051                      while (row < len2 && col < len1 && s2[row] == s1[col]) {
1052                         ++row;
1053                         ++col;
1054                      }
1055                   }
1056                } else {
1057                   max_len = row;
1058                   flower = k; nlower = k+1;
1059                }
1060             }
1061 	    flast_d[d][k].C = row;
1062 	    if (row + col > cur_max) {
1063 		cur_max = row + col;
1064 		b_diag = k;
1065 	    }
1066 	    if (row == len2) {
1067 		flower = k; nlower = k+1;
1068 	    }
1069 	    if (col == len1) {
1070 		fupper = k; nupper = k-1;
1071 	    }
1072 	}
1073 	k = cur_max*M_half - d * gd;
1074 	if (max_row[d - 1] < k) {
1075 	    max_row[d] = k;
1076 	    return_val = d;
1077 	    best_diag = b_diag;
1078 	    *e2 = flast_d[d][b_diag].C;
1079 	    *e1 = (*e2)+b_diag-ORIGIN;
1080 	} else {
1081 	    max_row[d] = max_row[d - 1];
1082 	}
1083 	if (flower <= fupper) {
1084             stop_condition++;
1085             lower[d] = flower; upper[d] = fupper;
1086 	} else {
1087             lower[d] = LARGE; upper[d] = -LARGE;
1088 	}
1089 	if (lower[d-max_cost] <= upper[d-max_cost]) stop_condition--;
1090 	if (stop_condition == 0) break;
1091 	d++;
1092 	flower = MIN(lower[d-Mis_cost], MIN(lower[d-GO_cost-GE_cost], lower[d-GE_cost])-1);
1093 	if (nlower) flower = MAX(flower, nlower);
1094 	fupper = MAX(upper[d-Mis_cost], MAX(upper[d-GO_cost-GE_cost], upper[d-GE_cost])+1);
1095 	if (nupper) fupper = MIN(fupper, nupper);
1096 	if (d > max_cost) {
1097 	   if (S==NULL) {
1098 	      /*if (d > max_cost)*/
1099 	      flast_d[d] = flast_d[d - max_cost-1];
1100 	   } else {
1101 	      flast_d[d] = get_mb_space(space, fupper-flower+1)-flower;
1102 	      if (flast_d[d] == NULL)
1103 		 return return_val;
1104            }
1105 	}
1106     }
1107 
1108     if (S!=NULL) { /*trace back*/
1109         Int4 row1, diag, state;
1110         d = return_val; diag = best_diag;
1111         row = *e2; state = sC;
1112         while (d > 0) {
1113             if (state == sC) {
1114                 /* diag will not be changed*/
1115                 state = get_lastC(flast_d, lower, upper, &d, diag, Mis_cost, &row1);
1116                 if (row-row1 > 0) {
1117                    edit_script_rep(S, row-row1);
1118                    row = row1;
1119                 }
1120             } else {
1121                 if (state == sI) {
1122                     /*row unchanged */
1123                     state = get_lastI(flast_d, lower, upper, &d, diag, GO_cost, GE_cost);
1124                     diag--;
1125                     edit_script_ins(S,1);
1126                 } else {
1127                     edit_script_del(S,1);
1128                     state = get_lastD(flast_d, lower, upper, &d, diag, GO_cost, GE_cost);
1129                     diag++;
1130                     row--;
1131                 }
1132             }
1133         }
1134         edit_script_rep(S, flast_d[0][ORIGIN].C);
1135         if (!reverse)
1136             edit_script_reverse_inplace(S);
1137     }
1138     return_val = max_row[return_val];
1139     return return_val;
1140 }
1141 
1142 
1143 
1144 /* ------------------------------------------------------------ */
1145