1 /* smallviterbi.c
2 * Mon Jan 24 11:38:21 1994
3 *
4 * Small-memory version of viterbi.c
5 *
6 * In the two-matrix version of the alignment algorithm, we keep
7 * a matrix for the BEGIN states (B matrix) and a full matrix for all
8 * the scores (A matrix). The database scanning version of the algorithm takes
9 * advantage of the fact that, for scoring, we don't need to keep
10 * a full cube around for matrix A; we only need the current row and the
11 * last row. We only need a full cube of information for the BEGIN
12 * state scores, which we get from the B matrix.
13 *
14 * This trick saves a large amount of memory, depending on the structure
15 * of the model. (The fewer bifurcations and begins in the model, the more
16 * memory this trick can save.) Unfortunately, it gives up the ability
17 * to trace back and recover an alignment.
18 *
19 * In this module, we add back just enough information to the scanning
20 * algorithm to enable a traceback, at the expense of re-doing some
21 * calculation. The goal is to be able to fit 200-400 nt RNA sequences
22 * into memory.
23 *
24 * I will refer to a model "segment". A segment is a linear (unbranched)
25 * chunk of the model, starting at a BEGIN/ROOT state, ending at a
26 * BIFURC/END state.
27 *
28 * Matrix cells now carry traceback information. This number, tback, indicates
29 * the i,j coords that this segment's BIFURC/END aligns to.
30 * tback is determined recursively. A BIFURC/END's tback points to itself.
31 * All other tbacks are copied from the state that is connected to by
32 * the maximally likely path.
33 *
34 * A traceback is done by using bmx as a framework, and recalculating bits
35 * of the alignment in between BEGINs and BIFURC/ENDs. Thus, if
36 * we know that a BEGIN aligns to i1, j1, we can get two numbers i2,j2 from
37 * the BEGIN's tback, and know that the segment aligns to the sequence
38 * (i1..i2-1)(j2+1..j1). Then we can recalculate the alignment of this
39 * model segment to that subsequence, and reconstruct a full traceback.
40 *
41 * tback is a single unsigned integer. The two numbers i,j are restricted
42 * to 16 bits and are packed into tback by bit-shifting, i<<16.
43 *
44 *
45 */
46
47 /*
48 * amx and atr are [j = 0..N] [diff = 0..j] [y = 0..statenum]
49 * diff == 0 is for off-diagonal boundary conditions (this is why diff is shifted +1)
50 * diff == 1 is for the diagonal, i==j
51 *
52 * bmx and btr are [y = 0..statenum] [j = 0..N] [ diff = 0..j]
53 * a j,diff matrix exists only where y is a BEGIN state
54 *
55 * An optimization is made which requires END states to be explicitly
56 * added, so statenum (the number of states in the integer model)
57 * is *inclusive* of ENDs.
58 */
59
60 #include <stdio.h>
61 #include <stdlib.h>
62 #include <string.h>
63 #include <math.h>
64
65 #include "squid.h"
66 #include "structs.h"
67 #include "funcs.h"
68
69 #ifdef MEMDEBUG
70 #include "dbmalloc.h"
71 #endif
72
73 /* This is how we pack tracebacks into a single machine word.
74 * We assume a 32 bit int.
75 * If you're porting this code, all you have to do is make sure
76 * pack_tb() puts two 16-bit ints in one data type TBACK, and unpack_tb()
77 * gets them back.
78 */
79 typedef unsigned int TBACK;
80 #define pack_tb(i,j) ((i)<<16 | (j))
81 #define PACKED_I 0xFFFF0000U
82 #define PACKED_J 0x0000FFFFU
unpack_tb(TBACK tback,int * ret_i,int * ret_j)83 static void unpack_tb(TBACK tback, int *ret_i, int *ret_j)
84 {
85 *ret_j = tback & PACKED_J;
86 *ret_i = (tback & PACKED_I) >> 16;
87 }
88
89
90
91 static int allocate_mx(struct istate_s *icm,int statenum, int seqlen,
92 int ****ret_amx, TBACK ****ret_atr,
93 int ****ret_bmx, TBACK ****ret_btr);
94 static int init_mx (struct istate_s *icm, int statenum, int N,
95 int ***amx, TBACK ***atr,
96 int ***bmx, TBACK ***btr);
97 static int recurse_mx (struct istate_s *icm, int statenum, char *seq, int N,
98 int ***amx, TBACK ***atr,
99 int ***bmx, TBACK ***btr);
100 static int trace_mx (struct istate_s *icm, char *seq, int N,
101 int ***bmx, TBACK ***btr, struct trace_s **ret_tr);
102 static void free_mx (int ***amx, TBACK ***atr,
103 int ***bmx, TBACK ***btr,
104 int statenum, int seqlen);
105
106 #ifdef DEBUG
107 static void print_tb(int tb);
108 static void print_small_mx(FILE *fp, struct istate_s *icm, int statenum,
109 char *seq, int N, int ***bmx, TBACK ***btr);
110 #endif /* DEBUG */
111
112
113 /* Function: SmallViterbiAlign()
114 *
115 * Purpose: Align a sequence to a model, using the small-memory
116 * variant of the alignment algorithm. Return the score
117 * of the alignment and the traceback.
118 *
119 * Args: icm - the model to align sequence to
120 * statenum = number of states in icm
121 * seq - sequence to align model to
122 * ret_score - RETURN: global alignment score
123 * ret_trace - RETURN: traceback tree
124 *
125 * Return: 1 on success, 0 on failure.
126 */
127 int
SmallViterbiAlign(struct istate_s * icm,int statenum,char * seq,double * ret_score,struct trace_s ** ret_trace)128 SmallViterbiAlign(struct istate_s *icm,
129 int statenum,
130 char *seq,
131 double *ret_score,
132 struct trace_s **ret_trace)
133 {
134 int ***amx; /* the main score matrix */
135 TBACK ***atr; /* amx's traceback pointers */
136 int ***bmx; /* the BEGIN score matrix */
137 TBACK ***btr; /* bmx's traceback pointers */
138 int N; /* length of sequence */
139
140 N = strlen(seq);
141 seq--; /* convert to 1..N. Ugh! */
142
143 if (! allocate_mx(icm, statenum, N, &amx, &atr, &bmx, &btr)) return 0;
144 #ifdef DEBUG
145 printf("allocated matrices\n");
146 #endif
147
148 if (! init_mx(icm, statenum, N, amx, atr, bmx, btr)) return 0;
149 #ifdef DEBUG
150 printf("matrices initialized\n");
151 print_small_mx(stdout, icm, statenum, seq, N, bmx, btr);
152 #endif
153
154 if (! recurse_mx(icm, statenum, seq, N, amx, atr, bmx, btr)) return 0;
155 #ifdef DEBUG
156 printf("recursion finished\n");
157 print_small_mx(stdout, icm, statenum, seq, N, bmx, btr);
158 #endif
159
160 *ret_score = ((double) bmx[0][N][N] / INTPRECISION);
161 #ifdef DEBUG
162 printf("have a score of %.2f, starting traceback\n", *ret_score);
163 #endif
164
165 if (! trace_mx(icm, seq, N, bmx, btr, ret_trace)) return 0;
166 #ifdef DEBUG
167 printf("trace complete\n");
168 #endif
169
170 free_mx(amx, atr, bmx, btr, statenum, N);
171 return 1;
172 }
173
174
175
176 /* Function: allocate_mx()
177 *
178 * Purpose: Malloc space for the score matrices.
179 * amx and atr are indexed as j, i, y.
180 * bmx and btr are indexed as k, j, i.
181 * In the two sequence dimensions j, i they are
182 * diagonal (+1 off diagonal) matrices with
183 * rows j = 0..N, i = 1..j+1.
184 * In the node dimension k bmx and btr are k = 0..M.
185 * In the state dimension y amx and atr are y = 0..numstates.
186 *
187 * Args: icm - the int, log-odds, state-based model
188 * statenum - number of states in model
189 * seqlen - length of sequence
190 * ret_amx - RETURN: main score matrix
191 * ret_atr - RETURN: amx's traceback pointers
192 * ret_bmx - RETURN: BEGIN/BIFURC/END score matrix
193 * ret_btr - RETURN: bmx's traceback pointers
194 *
195 * Return: Ptr to allocated scoring matrix, or
196 * dies and exits.
197 */
198 static int
allocate_mx(struct istate_s * icm,int statenum,int seqlen,int **** ret_amx,TBACK **** ret_atr,int **** ret_bmx,TBACK **** ret_btr)199 allocate_mx(struct istate_s *icm,
200 int statenum,
201 int seqlen,
202 int ****ret_amx,
203 TBACK ****ret_atr,
204 int ****ret_bmx,
205 TBACK ****ret_btr)
206 {
207 int ***amx;
208 TBACK ***atr;
209 int ***bmx;
210 TBACK ***btr;
211 int diag, j, y;
212
213 /* Main matrix, amx: fastest varying index is y (j,i,y)
214 * we only keep two rows for j, 0 and 1.
215 */
216 /* malloc for j = 0..1 rows */
217 if ((amx = (int ***) malloc (2 * sizeof(int **))) == NULL ||
218 (atr = (TBACK ***) malloc (2 * sizeof(TBACK **))) == NULL)
219 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
220
221
222 for (j = 0; j <= 1; j++) /* loop over rows j = 0..1 */
223 {
224 /* malloc for diag = 0..j (0..seqlen) cols */
225 if ((amx[j] = (int **) malloc ((seqlen + 1) * sizeof(int *))) == NULL ||
226 (atr[j] = (TBACK **) malloc ((seqlen + 1) * sizeof(TBACK *))) == NULL)
227 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
228
229 /* loop over cols diag = 0..seqlen */
230 for (diag = 0; diag <= seqlen; diag++)
231 /* malloc for y = 0..statenum-1 decks */
232 if ((amx[j][diag] = (int *) malloc ((statenum) * sizeof (int ))) == NULL ||
233 (atr[j][diag] = (TBACK *) malloc ((statenum) * sizeof (TBACK))) == NULL)
234 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
235 }
236
237
238 /* B auxiliary matrices: fastest varying index is diag (y,j,diag)
239 * bmx, btr keeps score, traceback decks for BEGIN states
240 */
241 /* 0..statenum-1 decks */
242 if ((bmx = (int ***) malloc (statenum * sizeof(int **))) == NULL ||
243 (btr = (TBACK ***) malloc (statenum * sizeof(TBACK **))) == NULL)
244 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
245
246 for (y = 0; y < statenum; y++)
247 {
248 bmx[y] = NULL;
249 btr[y] = NULL;
250
251 /* we keep score info for BEGIN and BIFURC states */
252 if (icm[y].statetype == uBEGIN_ST || icm[y].statetype == uBIFURC_ST)
253 {
254 /* j= 0..seqlen rows */
255 if ((bmx[y] = (int **) malloc ((seqlen+1) * sizeof(int *))) == NULL)
256 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
257 /* i = 0..j columns */
258 for (j = 0; j <= seqlen; j++)
259 if ((bmx[y][j] = (int *) malloc ((j+1) * sizeof(int ))) == NULL)
260 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
261 }
262 /* We keep traceback info only for BEGIN states */
263 if (icm[y].statetype == uBEGIN_ST)
264 {
265 /* j= 0..seqlen rows */
266 if ((btr[y] = (TBACK **) malloc ((seqlen+1) * sizeof(TBACK *))) == NULL)
267 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
268 /* i = 0..j columns */
269 for (j = 0; j <= seqlen; j++)
270 if ((btr[y][j] = (TBACK *) malloc ((j+1) * sizeof(TBACK))) == NULL)
271 Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
272 }
273
274 }
275
276 *ret_amx = amx;
277 *ret_atr = atr;
278 *ret_bmx = bmx;
279 *ret_btr = btr;
280 return 1;
281 }
282
283
284
285 /* Function: free_mx()
286 *
287 * Purpose: Free the space allocated to the scoring and traceback matrices.
288 * Precisely mirrors the allocations above in allocate_cvmx().
289 *
290 * Return: (void)
291 */
292 static void
free_mx(int *** amx,TBACK *** atr,int *** bmx,TBACK *** btr,int statenum,int seqlen)293 free_mx(int ***amx,
294 TBACK ***atr,
295 int ***bmx,
296 TBACK ***btr,
297 int statenum,
298 int seqlen)
299 {
300 int diag, j, y;
301
302 /* Free the main matrix, amx:
303 * amx[j][i][y] = [0..1] [0..seqlen] [0..statenum-1]
304 */
305 for (j = 0; j <= 1; j++)
306 {
307 for (diag = 0; diag <= seqlen; diag++)
308 {
309 free(amx[j][diag]);
310 free(atr[j][diag]);
311 }
312 free(amx[j]);
313 free(atr[j]);
314 }
315 free(amx);
316 free(atr);
317
318 /* Free the auxiliary matrices, bmx and btr
319 * bmx[y][j][i] = [0..statenum-1] [0..seqlen] [0..seqlen]
320 */
321 for (y = 0; y < statenum; y++)
322 {
323 if (bmx[y] != NULL)
324 {
325 for (j = 0; j <= seqlen; j++)
326 free(bmx[y][j]);
327 free(bmx[y]);
328 }
329 if (btr[y] != NULL)
330 {
331 for (j = 0; j <= seqlen; j++)
332 free(btr[y][j]);
333 free(btr[y]);
334 }
335 }
336 free(bmx);
337 free(btr);
338 }
339
340
341
342 /* Function: init_mx()
343 *
344 * Purpose: Initialization of the scoring matrices. We initialize the off-diagonal,
345 * the diagonal, and the "floor" (end states) of the cube.
346 *
347 * Return: 1 on success, 0 on failure.
348 */
349 static int
init_mx(struct istate_s * icm,int statenum,int N,int *** amx,TBACK *** atr,int *** bmx,TBACK *** btr)350 init_mx(struct istate_s *icm, /* integer model */
351 int statenum, /* number of states in icm */
352 int N, /* length of seq */
353 int ***amx,
354 TBACK ***atr,
355 int ***bmx,
356 TBACK ***btr)
357 {
358 int diag, j, y; /* counters for indices over the cvmx */
359 int ynext; /* index of next state k+1 */
360 int *beam; /* z-axis vector of numbers in amx */
361
362 /* Init the whole amx to -Infinity. We do this with memcpy, trying
363 * to be fast. We fill in j=0,diag=0 by hand, then memcpy() the other
364 * columns.
365 */
366 for (y = 0; y < statenum; y++)
367 amx[0][0][y] = amx[1][0][y] = NEGINFINITY;
368 for (diag = 1; diag <= N; diag++)
369 {
370 memcpy(amx[0][diag], amx[0][0], statenum * sizeof(int));
371 memcpy(amx[1][diag], amx[0][0], statenum * sizeof(int));
372 }
373
374 /* atr END and BIFURC traceback pointers point to themselves.
375 * just set everything to point at itself.
376 */
377 for (j = 0; j <= 1; j++)
378 for (diag = 0; diag <= N; diag++)
379 for (y = 0; y < statenum; y++)
380 atr[j][diag][y] = pack_tb(diag, j);
381
382 /* Init the whole bmx to -Inf. We know state 0 is a begin (it's ROOT), so we
383 * start there, and memcpy rows as needed.
384 */
385 for (diag = 0; diag <= N; diag++)
386 bmx[0][N][diag] = NEGINFINITY;
387 for (j = 0; j < N; j++)
388 memcpy(bmx[0][j], bmx[0][N], (j+1) * sizeof(int));
389
390 for (y = 1; y < statenum; y++)
391 if (bmx[y] != NULL)
392 for (j = 0; j <= N; j++)
393 memcpy(bmx[y][j], bmx[0][N], (j+1) * sizeof(int));
394
395 /* Set all btr traceback ptrs to point at themselves
396 */
397 for (y = 0; y < statenum; y++)
398 if (btr[y] != NULL)
399 for (j = 0; j <= N; j++)
400 for (diag = 0; diag <= j; diag++)
401 btr[y][j][diag] = pack_tb(diag,j);
402
403 /* Init the off-diagonal (j = 0..N; diag == 0) with -log P scores.
404 * End state = 0;
405 * del, bifurc states are calc'ed
406 * begin states same as del's
407 * THIS IS WASTEFUL AND SHOULD BE CHANGED.
408 */
409 for (j = 0; j <= N; j++)
410 for (y = statenum-1; y >= 0; y--)
411 {
412 if (icm[y].statetype == uEND_ST)
413 amx[j%2][0][y] = 0;
414
415 else if (icm[y].statetype == uBIFURC_ST)
416 amx[j%2][0][y] = bmx[icm[y].tmx[0]][j][0] + bmx[icm[y].tmx[1]][j][0];
417
418 else if (icm[y].statetype == uDEL_ST || icm[y].statetype == uBEGIN_ST)
419 {
420 /* only calc DEL-DEL and BEGIN-DEL transitions. Since
421 * we optimized the state transition tables, removing
422 * the unused ones, we don't know where the number
423 * for "to DEL" is! But we can find it, because it'll
424 * be the connection to a non-infinite score */
425 beam = amx[j%2][0] + y + icm[y].offset;
426 for (ynext = 0; ynext < icm[y].connectnum; ynext++)
427 {
428 if (*beam != NEGINFINITY)
429 amx[j%2][0][y] = *beam + icm[y].tmx[ynext];
430 beam++;
431 }
432 }
433 /* make a copy into bmx if y is a BEGIN or BIFURC */
434 if (icm[y].statetype == uBEGIN_ST ||
435 icm[y].statetype == uBIFURC_ST )
436 bmx[y][j][0] = amx[j%2][0][y];
437 }
438 return 1;
439 }
440
441
442
443 /* Function: recurse_mx()
444 *
445 * Purpose: Carry out the fill stage of the dynamic programming
446 * algorithm.
447 *
448 * Returns: 1 on success, 0 on failure.
449 */
450 static int
recurse_mx(struct istate_s * icm,int statenum,char * seq,int N,int *** amx,TBACK *** atr,int *** bmx,TBACK *** btr)451 recurse_mx(struct istate_s *icm, /* integer, state-form model */
452 int statenum, /* number of states in icm */
453 char *seq, /* sequence, 1..N */
454 int N, /* length of seq */
455 int ***amx, /* main scoring matrix */
456 TBACK ***atr, /* tracebacks for amx */
457 int ***bmx, /* bifurc scoring matrix */
458 TBACK ***btr) /* tracebacks for btr */
459 {
460 int i, j, y; /* indices for 3 dimensions */
461 int aj; /* 0 or 1, index for j in A matrices */
462 int diff; /* loop counter for difference: diff = j-i + 1 */
463 int symi, symj; /* symbol indices for seq[i], seq[j] */
464 int sc; /* tmp for a score */
465 int ynext; /* index of next state y */
466
467 int *beam; /* ptr to a beam (z-axis vector) */
468 TBACK *beamt; /* ptr into connected traceback beam */
469 int leftdiff; /* diff coord of BEGIN_L of a bifurc */
470 int leftj; /* j coord of BEGIN_L of a bifurc */
471 int **left_p; /* pointer into whole 2D deck of BEGINL's of a bifurc */
472 int *right_p; /* ptr into row of BEGIN_R's of a bifurc */
473 int *scp; /* score pointer: ptr into beam of scores being calc'ed */
474 TBACK *sct; /* tback beam being calc'ed */
475 struct istate_s *st; /* state pointer: ptr at current state in icm */
476 int *tmx;
477 int emitsc;
478
479 for (j = 1; j <= N; j++)
480 {
481 aj = j % 2;
482 symj = SymbolIndex(seq[j]);
483
484 /* we have to init END and BIF states to point
485 at themselves in this row of atr */
486 for (diff = 0; diff <= j; diff++)
487 for (y = 0; y < statenum; y++)
488 if (icm[y].statetype == uBIFURC_ST ||
489 icm[y].statetype == uEND_ST)
490 atr[aj][diff][y] = pack_tb(diff, j);
491
492 for (diff = 1; diff <= j; diff++)
493 {
494 i = j - diff + 1;
495 symi = SymbolIndex(seq[i]);
496
497
498 scp = &amx[aj][diff][statenum-1];
499 sct = &atr[aj][diff][statenum-1];
500 st = &icm[statenum-1];
501 for (y = statenum-1; y >= 0; y--, scp--, sct--, st--)
502 { /* loop over states */
503
504 if (st->statetype != uBIFURC_ST) /* a normal (non-BIFURC) state */
505 {
506 /* Connect the "beam" pointer to the appropriate
507 * starting place in the ynext scores we're connecting
508 * y to
509 */
510 switch (st->statetype) {
511 case uBEGIN_ST:
512 case uDEL_ST:
513 beam = amx[aj][diff];
514 beamt = atr[aj][diff];
515 emitsc = 0;
516 break;
517 case uMATP_ST: /* !aj toggles from 0 to 1 and vice versa */
518 if (diff == 1) continue;
519 beam = amx[!aj][diff-2];
520 beamt = atr[!aj][diff-2];
521 emitsc = st->emit[symi * ALPHASIZE + symj];
522 break;
523 case uMATR_ST:
524 case uINSR_ST:
525 beam = amx[!aj][diff-1];
526 beamt = atr[!aj][diff-1];
527 emitsc = st->emit[symj];
528 break;
529 case uMATL_ST:
530 case uINSL_ST:
531 beam = amx[aj][diff-1];
532 beamt = atr[aj][diff-1];
533 emitsc = st->emit[symi];
534 break;
535 case uEND_ST:
536 continue;
537 default: Die("no such state type %d", st->statetype);
538 }
539 beam += y + st->offset;
540 beamt += y + st->offset;
541 tmx = st->tmx;
542
543
544 /* Init for ynext == 0 case
545 */
546 *scp = *beam + *tmx;
547 *sct = *beamt;
548
549 /* Calculate remaining cases
550 */
551 for (ynext = 1; ynext < st->connectnum; ynext++)
552 {
553 beam++;
554 beamt++;
555 tmx++;
556 if (*beam > *scp)
557 {
558 sc = *beam + *tmx;
559 if (sc > *scp)
560 {
561 *scp = sc;
562 *sct = *beamt;
563 }
564 }
565 }
566
567 /* Add emission scores now
568 */
569 *scp += emitsc;
570
571 /* Make a copy into bmx, btr if necessary
572 */
573 if (st->statetype == uBEGIN_ST)
574 {
575 bmx[y][j][diff] = *scp;
576 btr[y][j][diff] = *sct;
577 }
578 } /* end block of normal state stuff */
579
580 else /* a BIFURC state */
581 {
582 leftdiff = diff;
583 leftj = j;
584 right_p = bmx[st->tmx[1]][j];
585 left_p = bmx[st->tmx[0]];
586
587 /* init w/ case that left branch emits it all */
588 *scp = left_p[leftj][leftdiff] + *right_p;
589 while (leftdiff > 0)
590 {
591 leftdiff--;
592 leftj--;
593 right_p++;
594
595 sc = left_p[leftj][leftdiff] + *right_p;
596 if (sc > *scp)
597 *scp = sc;
598 }
599 /* keep copy of score in bmx, for tracing */
600 bmx[y][j][diff] = *scp;
601 }
602
603 } /* end loop over states */
604 } /* end loop over diff */
605 } /* end loop over j */
606 return 1;
607 }
608
609
610 /* Function: trace_mx()
611 *
612 * Purpose: Trace stage of the dynamic programming: starting
613 * at j=N, i=1, k=0/BEGIN, trace back the optimal
614 * path. Returns a binary tree, ret_trace.
615 * Caller is reponsible for free'ing ret_trace.
616 */
617 static int
trace_mx(struct istate_s * icm,char * seq,int N,int *** bmx,TBACK *** btr,struct trace_s ** ret_trace)618 trace_mx(struct istate_s *icm, /* the model to align */
619 char *seq, /* sequence to align it to 1..N */
620 int N,
621 int ***bmx, /* matrix of BEGIN scores */
622 TBACK ***btr, /* matrix of BIFURC/END tbacks */
623 struct trace_s **ret_trace) /* RETURN: the traceback tree */
624 {
625 struct trace_s *tr; /* the traceback tree under construction */
626 struct trace_s *curr_tr; /* ptr to node of tr we're working on */
627 struct tracestack_s *dolist; /* pushdown stack of active tr nodes */
628 int diff,i, j; /* coords in mx (0..N) */
629 int y; /* counter for states (0..statenum-1) */
630 int leftdiff;
631 int leftj;
632 int *right_p;
633 int i2, j2; /* what's left unaccounted for at segment end */
634 int diff2;
635 int end_y; /* index of state that ends segment */
636
637 /* Initialize.
638 * Start at i = 1, j = N and work towards diagonal
639 */
640 InitTrace(&tr, NULL); /* start a trace tree */
641 dolist = InitTracestack(); /* start a stack for traversing the trace tree */
642
643 curr_tr = AttachTrace(tr, NULL, 0, N-1, 0, BEGIN_ST);
644 PushTracestack(dolist, curr_tr);
645
646 /* Recursion. While there's active nodes in the stack, trace from them.
647 *
648 * This is cribbed from recurse_cvmx(); it's almost the exact reverse.
649 * We know the best score, we just have to figure out where it came from.
650 */
651 while ((curr_tr = PopTracestack(dolist)) != NULL)
652 {
653 /* get some useful numbers, mostly for clarity */
654 /* which is important, since we're sort of misusing
655 * fields in the trace structures! */
656 i = curr_tr->emitl+1;
657 j = curr_tr->emitr+1;
658 y = curr_tr->nodeidx;
659 diff = j - i + 1;
660
661 /* find out which bifurc/end state terminates this
662 segment */
663 end_y = y+1;
664 while (icm[end_y].statetype != uBIFURC_ST &&
665 icm[end_y].statetype != uEND_ST)
666 end_y++;
667
668 /* find out i2,j2 that the terminal bifurc/end
669 aligns to */
670 unpack_tb(btr[y][j][diff], &diff2, &j2);
671 i2 = j2 - diff2 + 1;
672
673 /* For now, just write out what the traceback looks like.
674 */
675 printf("Segment from state %d to %d: aligns to %d..%d/%d..%d\n",
676 y, end_y, i, i2-1, j2+1, j);
677
678
679 /* push next BEGINs onto stack; they are connected
680 to BIFURC end_y */
681 if (icm[end_y].statetype == uBIFURC_ST)
682 {
683 if (i2 > j2)
684 {
685 PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2-1, j2-1, icm[end_y].tmx[1], BEGIN_ST));
686 PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2-1, j2-1, icm[end_y].tmx[0], BEGIN_ST));
687 }
688 else
689 {
690 leftdiff = diff2;
691 leftj = j2;
692 right_p = bmx[icm[end_y].tmx[1]][j2];
693
694 while (leftdiff >= 0)
695 {
696 if (bmx[end_y][j2][diff] == bmx[icm[end_y].tmx[0]][leftj][leftdiff] + *right_p)
697 {
698 printf("found the bifurc: it is %d-%d and %d-%d\n",
699 i2 -1, i2+leftdiff-2, i2 + leftdiff-1, j2-1);
700 PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2 + leftdiff-1, j2-1,
701 icm[end_y].tmx[1], BEGIN_ST));
702 PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2 -1, i2+leftdiff-2,
703 icm[end_y].tmx[0], BEGIN_ST));
704 break;
705 }
706 leftdiff--;
707 leftj--;
708 right_p++;
709 }
710 if (leftdiff < 0)
711 Die("bifurc reconstruction failed at ijy %d,%d,%d", i,j,y);
712 }
713 }
714 } /* (while something is in the tracestack) */
715
716 FreeTracestack(dolist);
717
718 *ret_trace = tr;
719 return 1;
720 }
721
722
723
724 #ifdef DEBUG
725
726 /* Function: print_tb()
727 *
728 * Purpose: Print the two numbers of a packed traceback.
729 */
730 static void
print_tb(int tb)731 print_tb(int tb)
732 {
733 int i, j;
734
735 unpack_tb(tb, &i, &j);
736 printf("%d %d\n", i, j);
737 }
738
739 /* Function: PrintSmallMX()
740 *
741 * Purpose: Debugging output; print out the three-dimensional
742 * auxiliary alignment matrix produced by the
743 * small-memory version.
744 */
745 static void
print_small_mx(FILE * fp,struct istate_s * icm,int statenum,char * seq,int N,int *** bmx,TBACK *** btr)746 print_small_mx(FILE *fp, /* open file or just stdout/stderr */
747 struct istate_s *icm, /* the model to align */
748 int statenum, /* number of states in icm */
749 char *seq, /* sequence, 1..N */
750 int N, /* length of seq */
751 int ***bmx, /* auxiliary matrix */
752 TBACK ***btr) /* traceback ptrs for bmx */
753 {
754 int j, diff, y; /* indices in 3D matrix */
755 int tbdiff, tbj; /* traceback pointers to a j, diff position */
756
757 for (y = 0; y < statenum; y++)
758 if (bmx[y] != NULL)
759 {
760 fprintf(fp, "### B Matrix for state %d, type %d (%s), from node %d\n",
761 y, icm[y].statetype, UstatetypeName(icm[y].statetype), icm[y].nodeidx);
762 fprintf(fp, " ");
763 for (diff = 0; diff <= N; diff++)
764 fprintf(fp, "%6d ", diff);
765 fprintf(fp, "\n");
766
767 for (j = 0; j <= N; j++)
768 {
769 fprintf(fp, "%c %3d ", (j > 0) ? seq[j] : (char) '*', j);
770 for (diff = 0; diff <= j; diff++)
771 fprintf(fp, "%6d ", bmx[y][j][diff]);
772 fprintf(fp, "\n ");
773 if (icm[y].statetype == uBEGIN_ST)
774 for (diff = 0; diff <= j; diff++)
775 {
776 unpack_tb(btr[y][j][diff], &tbdiff, &tbj);
777 fprintf(fp, "%3d/%3d ", tbdiff, tbj);
778 }
779 fprintf(fp, "\n");
780 }
781 fprintf(fp, "\n\n");
782 }
783 }
784 #endif /* DEBUG */
785
786
787