1 /* smallviterbi.c
2  * Mon Jan 24 11:38:21 1994
3  *
4  * Small-memory version of viterbi.c
5  *
6  * In the two-matrix version of the alignment algorithm, we keep
7  * a matrix for the BEGIN states (B matrix) and a full matrix for all
8  * the scores (A matrix). The database scanning version of the algorithm takes
9  * advantage of the fact that, for scoring, we don't need to keep
10  * a full cube around for matrix A; we only need the current row and the
11  * last row. We only need a full cube of information for the BEGIN
12  * state scores, which we get from the B matrix.
13  *
14  * This trick saves a large amount of memory, depending on the structure
15  * of the model. (The fewer bifurcations and begins in the model, the more
16  * memory this trick can save.) Unfortunately, it gives up the ability
17  * to trace back and recover an alignment.
18  *
19  * In this module, we add back just enough information to the scanning
20  * algorithm to enable a traceback, at the expense of re-doing some
21  * calculation. The goal is to be able to fit 200-400 nt RNA sequences
22  * into memory.
23  *
24  * I will refer to a model "segment". A segment is a linear (unbranched)
25  * chunk of the model, starting at a BEGIN/ROOT state, ending at a
26  * BIFURC/END state.
27  *
28  * Matrix cells now carry traceback information. This number, tback, indicates
29  * the i,j coords that this segment's BIFURC/END aligns to.
30  * tback is determined recursively. A BIFURC/END's tback points to itself.
31  * All other tbacks are copied from the state that is connected to by
32  * the maximally likely path.
33  *
34  * A traceback is done by using bmx as a framework, and recalculating bits
35  * of the alignment in between BEGINs and BIFURC/ENDs.  Thus, if
36  * we know that a BEGIN aligns to i1, j1, we can get two numbers i2,j2 from
37  * the BEGIN's tback, and know that the segment aligns to the sequence
38  * (i1..i2-1)(j2+1..j1). Then we can recalculate the alignment of this
39  * model segment to that subsequence, and reconstruct a full traceback.
40  *
41  * tback is a single unsigned integer. The two numbers i,j are restricted
42  * to 16 bits and are packed into tback by bit-shifting, i<<16.
43  *
44  *
45  */
46 
47 /*
48  * amx and atr are [j = 0..N] [diff = 0..j] [y = 0..statenum]
49  *  diff == 0 is for off-diagonal boundary conditions (this is why diff is shifted +1)
50  *  diff == 1 is for the diagonal, i==j
51  *
52  * bmx and btr are [y = 0..statenum] [j = 0..N] [ diff = 0..j]
53  *   a j,diff matrix exists only where y is a BEGIN state
54  *
55  * An optimization is made which requires END states to be explicitly
56  * added, so statenum (the number of states in the integer model)
57  * is *inclusive* of ENDs.
58  */
59 
60 #include <stdio.h>
61 #include <stdlib.h>
62 #include <string.h>
63 #include <math.h>
64 
65 #include "squid.h"
66 #include "structs.h"
67 #include "funcs.h"
68 
69 #ifdef MEMDEBUG
70 #include "dbmalloc.h"
71 #endif
72 
73 /* This is how we pack tracebacks into a single machine word.
74  * We assume a 32 bit int.
75  * If you're porting this code, all you have to do is make sure
76  * pack_tb() puts two 16-bit ints in one data type TBACK, and unpack_tb()
77  * gets them back.
78  */
79 typedef unsigned int TBACK;
80 #define pack_tb(i,j)           ((i)<<16 | (j))
81 #define PACKED_I               0xFFFF0000U
82 #define PACKED_J               0x0000FFFFU
unpack_tb(TBACK tback,int * ret_i,int * ret_j)83 static void unpack_tb(TBACK tback, int *ret_i, int *ret_j)
84 {
85   *ret_j =  tback & PACKED_J;
86   *ret_i = (tback & PACKED_I) >> 16;
87 }
88 
89 
90 
91 static int  allocate_mx(struct istate_s *icm,int statenum, int seqlen,
92 			int ****ret_amx, TBACK ****ret_atr,
93 			int ****ret_bmx, TBACK ****ret_btr);
94 static int  init_mx    (struct istate_s *icm, int statenum, int N,
95 			int ***amx, TBACK ***atr,
96 			int ***bmx, TBACK ***btr);
97 static int  recurse_mx (struct istate_s *icm, int statenum, char *seq, int N,
98 			int ***amx, TBACK ***atr,
99 			int ***bmx, TBACK ***btr);
100 static int  trace_mx   (struct istate_s *icm, char *seq, int N,
101 			int ***bmx, TBACK ***btr, struct trace_s **ret_tr);
102 static void free_mx    (int ***amx, TBACK ***atr,
103 			int ***bmx, TBACK ***btr,
104 			int statenum, int seqlen);
105 
106 #ifdef DEBUG
107 static void  print_tb(int tb);
108 static void  print_small_mx(FILE *fp, struct istate_s *icm, int statenum,
109 			    char *seq, int N, int ***bmx, TBACK ***btr);
110 #endif /* DEBUG */
111 
112 
113 /* Function: SmallViterbiAlign()
114  *
115  * Purpose:  Align a sequence to a model, using the small-memory
116  *           variant of the alignment algorithm. Return the score
117  *           of the alignment and the traceback.
118  *
119  * Args:     icm       - the model to align sequence to
120  *           statenum  = number of states in icm
121  *           seq       - sequence to align model to
122  *           ret_score - RETURN: global alignment score
123  *           ret_trace - RETURN: traceback tree
124  *
125  * Return:   1 on success, 0 on failure.
126  */
127 int
SmallViterbiAlign(struct istate_s * icm,int statenum,char * seq,double * ret_score,struct trace_s ** ret_trace)128 SmallViterbiAlign(struct istate_s *icm,
129 		  int              statenum,
130 		  char            *seq,
131 		  double          *ret_score,
132 		  struct trace_s **ret_trace)
133 {
134   int   ***amx;			/* the main score matrix     */
135   TBACK ***atr;                 /* amx's traceback pointers  */
136   int   ***bmx;                 /* the BEGIN score matrix    */
137   TBACK ***btr;                 /* bmx's traceback pointers  */
138   int      N;			/* length of sequence        */
139 
140   N = strlen(seq);
141   seq--;			/* convert to 1..N. Ugh! */
142 
143   if (! allocate_mx(icm, statenum, N, &amx, &atr, &bmx, &btr)) return 0;
144 #ifdef DEBUG
145   printf("allocated matrices\n");
146 #endif
147 
148   if (! init_mx(icm, statenum, N, amx, atr, bmx, btr)) return 0;
149 #ifdef DEBUG
150   printf("matrices initialized\n");
151   print_small_mx(stdout, icm, statenum, seq, N, bmx, btr);
152 #endif
153 
154   if (! recurse_mx(icm, statenum, seq, N, amx, atr, bmx, btr)) return 0;
155 #ifdef DEBUG
156   printf("recursion finished\n");
157   print_small_mx(stdout, icm, statenum, seq, N, bmx, btr);
158 #endif
159 
160   *ret_score = ((double) bmx[0][N][N] / INTPRECISION);
161 #ifdef DEBUG
162   printf("have a score of %.2f, starting traceback\n", *ret_score);
163 #endif
164 
165   if (! trace_mx(icm, seq, N, bmx, btr, ret_trace)) return 0;
166 #ifdef DEBUG
167   printf("trace complete\n");
168 #endif
169 
170   free_mx(amx, atr, bmx, btr, statenum, N);
171   return 1;
172 }
173 
174 
175 
176 /* Function: allocate_mx()
177  *
178  * Purpose:  Malloc space for the score matrices.
179  *           amx and atr are indexed as j, i, y.
180  *           bmx and btr are indexed as k, j, i.
181  *           In the two sequence dimensions j, i they are
182  *           diagonal (+1 off diagonal) matrices with
183  *           rows j = 0..N, i = 1..j+1.
184  *           In the node dimension k bmx and btr are k = 0..M.
185  *           In the state dimension y amx and atr are y = 0..numstates.
186  *
187  * Args:     icm      - the int, log-odds, state-based model
188  *           statenum - number of states in model
189  *           seqlen   - length of sequence
190  *           ret_amx  - RETURN: main score matrix
191  *           ret_atr  - RETURN: amx's traceback pointers
192  *           ret_bmx  - RETURN: BEGIN/BIFURC/END score matrix
193  *           ret_btr  - RETURN: bmx's traceback pointers
194  *
195  * Return:   Ptr to allocated scoring matrix, or
196  *           dies and exits.
197  */
198 static int
allocate_mx(struct istate_s * icm,int statenum,int seqlen,int **** ret_amx,TBACK **** ret_atr,int **** ret_bmx,TBACK **** ret_btr)199 allocate_mx(struct istate_s *icm,
200 	    int       statenum,
201 	    int       seqlen,
202 	    int   ****ret_amx,
203 	    TBACK ****ret_atr,
204 	    int   ****ret_bmx,
205 	    TBACK ****ret_btr)
206 {
207   int    ***amx;
208   TBACK  ***atr;
209   int    ***bmx;
210   TBACK  ***btr;
211   int     diag, j, y;
212 
213   /* Main matrix, amx: fastest varying index is y (j,i,y)
214    * we only keep two rows for j, 0 and 1.
215    */
216 				/* malloc for j = 0..1 rows */
217   if ((amx = (int   ***) malloc (2 * sizeof(int   **))) == NULL ||
218       (atr = (TBACK ***) malloc (2 * sizeof(TBACK **))) == NULL)
219     Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
220 
221 
222   for (j = 0; j <= 1; j++)	/* loop over rows j = 0..1 */
223     {
224 				/* malloc for diag = 0..j (0..seqlen) cols */
225       if ((amx[j] = (int **)   malloc ((seqlen + 1) * sizeof(int   *))) == NULL ||
226 	  (atr[j] = (TBACK **) malloc ((seqlen + 1) * sizeof(TBACK *))) == NULL)
227 	Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
228 
229 				/* loop over cols diag = 0..seqlen */
230       for (diag = 0; diag <= seqlen; diag++)
231 				/* malloc for y = 0..statenum-1 decks */
232 	  if ((amx[j][diag] = (int *)   malloc ((statenum) * sizeof (int  ))) == NULL ||
233 	      (atr[j][diag] = (TBACK *) malloc ((statenum) * sizeof (TBACK))) == NULL)
234 	    Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
235     }
236 
237 
238   /* B auxiliary matrices: fastest varying index is diag (y,j,diag)
239    * bmx, btr keeps score, traceback decks for BEGIN states
240    */
241 				/* 0..statenum-1 decks */
242   if ((bmx = (int ***)   malloc (statenum * sizeof(int   **))) == NULL ||
243       (btr = (TBACK ***) malloc (statenum * sizeof(TBACK **))) == NULL)
244     Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
245 
246   for (y = 0; y < statenum; y++)
247     {
248       bmx[y] = NULL;
249       btr[y] = NULL;
250 
251 				/* we keep score info for BEGIN and BIFURC states */
252       if (icm[y].statetype == uBEGIN_ST || icm[y].statetype == uBIFURC_ST)
253 	{
254 				/* j= 0..seqlen rows  */
255 	  if ((bmx[y] = (int   **) malloc ((seqlen+1) * sizeof(int   *))) == NULL)
256 	    Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
257 				/* i = 0..j columns */
258 	  for (j = 0; j <= seqlen; j++)
259 	    if ((bmx[y][j] = (int   *) malloc ((j+1) * sizeof(int  ))) == NULL)
260 	      Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
261 	}
262 				/* We keep traceback info only for BEGIN states */
263       if (icm[y].statetype == uBEGIN_ST)
264 	{
265 	  			/* j= 0..seqlen rows  */
266 	  if ((btr[y] = (TBACK **) malloc ((seqlen+1) * sizeof(TBACK *))) == NULL)
267 	    Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
268 				/* i = 0..j columns */
269 	  for (j = 0; j <= seqlen; j++)
270 	    if ((btr[y][j] = (TBACK *) malloc ((j+1) * sizeof(TBACK))) == NULL)
271 	      Die("Memory allocation error in %s line %d", __FILE__, __LINE__);
272 	}
273 
274     }
275 
276   *ret_amx = amx;
277   *ret_atr = atr;
278   *ret_bmx = bmx;
279   *ret_btr = btr;
280   return 1;
281 }
282 
283 
284 
285 /* Function: free_mx()
286  *
287  * Purpose:  Free the space allocated to the scoring and traceback matrices.
288  *           Precisely mirrors the allocations above in allocate_cvmx().
289  *
290  * Return:   (void)
291  */
292 static void
free_mx(int *** amx,TBACK *** atr,int *** bmx,TBACK *** btr,int statenum,int seqlen)293 free_mx(int    ***amx,
294 	TBACK  ***atr,
295 	int    ***bmx,
296 	TBACK  ***btr,
297 	int     statenum,
298 	int     seqlen)
299 {
300   int diag, j, y;
301 
302   /* Free the main matrix, amx:
303    * amx[j][i][y] = [0..1] [0..seqlen] [0..statenum-1]
304    */
305   for (j = 0; j <= 1; j++)
306     {
307       for (diag = 0; diag <= seqlen; diag++)
308 	{
309 	  free(amx[j][diag]);
310 	  free(atr[j][diag]);
311 	}
312       free(amx[j]);
313       free(atr[j]);
314     }
315   free(amx);
316   free(atr);
317 
318   /* Free the auxiliary matrices, bmx and btr
319    * bmx[y][j][i] = [0..statenum-1] [0..seqlen] [0..seqlen]
320    */
321   for (y = 0; y < statenum; y++)
322     {
323       if (bmx[y] != NULL)
324 	{
325 	  for (j = 0; j <= seqlen; j++)
326 	    free(bmx[y][j]);
327 	  free(bmx[y]);
328 	}
329       if (btr[y] != NULL)
330 	{
331 	  for (j = 0; j <= seqlen; j++)
332 	    free(btr[y][j]);
333 	  free(btr[y]);
334 	}
335     }
336   free(bmx);
337   free(btr);
338 }
339 
340 
341 
342 /* Function: init_mx()
343  *
344  * Purpose:  Initialization of the scoring matrices. We initialize the off-diagonal,
345  *           the diagonal, and the "floor" (end states) of the cube.
346  *
347  * Return:   1 on success, 0 on failure.
348  */
349 static int
init_mx(struct istate_s * icm,int statenum,int N,int *** amx,TBACK *** atr,int *** bmx,TBACK *** btr)350 init_mx(struct istate_s *icm,          /* integer model  */
351 	int              statenum,     /* number of states in icm */
352 	int              N,	       /* length of seq  */
353 	int           ***amx,
354 	TBACK         ***atr,
355 	int           ***bmx,
356 	TBACK         ***btr)
357 {
358   int diag, j, y;		/* counters for indices over the cvmx            */
359   int ynext;			/* index of next state k+1                       */
360   int *beam;                    /* z-axis vector of numbers in amx               */
361 
362   /* Init the whole amx to -Infinity. We do this with memcpy, trying
363    * to be fast. We fill in j=0,diag=0 by hand, then memcpy() the other
364    * columns.
365    */
366   for (y = 0; y < statenum; y++)
367     amx[0][0][y] = amx[1][0][y] = NEGINFINITY;
368   for (diag = 1; diag <= N; diag++)
369     {
370       memcpy(amx[0][diag], amx[0][0], statenum * sizeof(int));
371       memcpy(amx[1][diag], amx[0][0], statenum * sizeof(int));
372     }
373 
374   /* atr END and BIFURC traceback pointers point to themselves.
375    * just set everything to point at itself.
376    */
377   for (j = 0; j <= 1; j++)
378     for (diag = 0; diag <= N; diag++)
379       for (y = 0; y < statenum; y++)
380 	atr[j][diag][y] = pack_tb(diag, j);
381 
382   /* Init the whole bmx to -Inf. We know state 0 is a begin (it's ROOT), so we
383    * start there, and memcpy rows as needed.
384    */
385   for (diag = 0; diag <= N; diag++)
386     bmx[0][N][diag] = NEGINFINITY;
387   for (j = 0; j < N; j++)
388     memcpy(bmx[0][j], bmx[0][N], (j+1) * sizeof(int));
389 
390   for (y = 1; y < statenum; y++)
391     if (bmx[y] != NULL)
392       for (j = 0; j <= N; j++)
393 	memcpy(bmx[y][j], bmx[0][N], (j+1) * sizeof(int));
394 
395   /* Set all btr traceback ptrs to point at themselves
396    */
397   for (y = 0; y < statenum; y++)
398     if (btr[y] != NULL)
399       for (j = 0; j <= N; j++)
400 	for (diag = 0; diag <= j; diag++)
401 	  btr[y][j][diag] = pack_tb(diag,j);
402 
403   /* Init the off-diagonal (j = 0..N; diag == 0) with -log P scores.
404    * End state = 0;
405    * del, bifurc states are calc'ed
406    * begin states same as del's
407    * THIS IS WASTEFUL AND SHOULD BE CHANGED.
408    */
409   for (j = 0; j <= N; j++)
410     for (y = statenum-1; y >= 0; y--)
411       {
412 	if (icm[y].statetype == uEND_ST)
413 	  amx[j%2][0][y] = 0;
414 
415       else if (icm[y].statetype == uBIFURC_ST)
416 	amx[j%2][0][y] = bmx[icm[y].tmx[0]][j][0] + bmx[icm[y].tmx[1]][j][0];
417 
418       else if (icm[y].statetype == uDEL_ST || icm[y].statetype == uBEGIN_ST)
419 	{
420 				/* only calc DEL-DEL and BEGIN-DEL transitions. Since
421 				 * we optimized the state transition tables, removing
422 				 * the unused ones, we don't know where the number
423 				 * for "to DEL" is! But we can find it, because it'll
424 				 * be the connection to a non-infinite score */
425 	  beam = amx[j%2][0] + y + icm[y].offset;
426 	  for (ynext = 0; ynext < icm[y].connectnum; ynext++)
427 	    {
428 	      if (*beam != NEGINFINITY)
429 		amx[j%2][0][y] = *beam + icm[y].tmx[ynext];
430 	      beam++;
431 	    }
432 	}
433 				/* make a copy into bmx if y is a BEGIN or BIFURC */
434       if (icm[y].statetype == uBEGIN_ST ||
435 	  icm[y].statetype == uBIFURC_ST )
436 	bmx[y][j][0] = amx[j%2][0][y];
437     }
438   return 1;
439 }
440 
441 
442 
443 /* Function: recurse_mx()
444  *
445  * Purpose:  Carry out the fill stage of the dynamic programming
446  *           algorithm.
447  *
448  * Returns:  1 on success, 0 on failure.
449  */
450 static int
recurse_mx(struct istate_s * icm,int statenum,char * seq,int N,int *** amx,TBACK *** atr,int *** bmx,TBACK *** btr)451 recurse_mx(struct istate_s *icm,      /* integer, state-form model */
452 	   int              statenum, /* number of states in icm   */
453 	   char            *seq,      /* sequence, 1..N            */
454 	   int              N,	      /* length of seq             */
455 	   int           ***amx,      /* main scoring matrix       */
456 	   TBACK         ***atr,      /* tracebacks for amx        */
457 	   int           ***bmx,      /* bifurc scoring matrix     */
458 	   TBACK         ***btr)      /* tracebacks for btr        */
459 {
460   int i, j, y;		        /* indices for 3 dimensions                    */
461   int aj;			/* 0 or 1, index for j in A matrices           */
462   int diff;			/* loop counter for difference: diff = j-i + 1 */
463   int symi, symj;		/* symbol indices for seq[i], seq[j]           */
464   int sc;			/* tmp for a score                             */
465   int ynext;			/* index of next state y                       */
466 
467   int *beam;                    /* ptr to a beam (z-axis vector)               */
468   TBACK *beamt;                 /* ptr into connected traceback beam           */
469   int  leftdiff;		/* diff coord of BEGIN_L of a bifurc     */
470   int  leftj;			/* j coord of BEGIN_L of a bifurc        */
471   int **left_p;			/* pointer into whole 2D deck of BEGINL's of a bifurc */
472   int *right_p;                 /* ptr into row of BEGIN_R's of a bifurc */
473   int   *scp;			/* score pointer: ptr into beam of scores being calc'ed */
474   TBACK *sct;                   /* tback beam being calc'ed */
475   struct istate_s *st;		/* state pointer: ptr at current state in icm */
476   int *tmx;
477   int  emitsc;
478 
479   for (j = 1; j <= N; j++)
480     {
481       aj = j % 2;
482       symj = SymbolIndex(seq[j]);
483 
484 				/* we have to init END and BIF states to point
485 				   at themselves in this row of atr */
486       for (diff = 0; diff <= j; diff++)
487 	for (y = 0; y < statenum; y++)
488 	  if (icm[y].statetype == uBIFURC_ST ||
489 	      icm[y].statetype == uEND_ST)
490 	    atr[aj][diff][y] = pack_tb(diff, j);
491 
492       for (diff = 1; diff <= j; diff++)
493 	{
494 	  i = j - diff + 1;
495 	  symi = SymbolIndex(seq[i]);
496 
497 
498 	  scp = &amx[aj][diff][statenum-1];
499 	  sct = &atr[aj][diff][statenum-1];
500 	  st  = &icm[statenum-1];
501 	  for (y = statenum-1; y >= 0; y--, scp--, sct--, st--)
502 	    { /* loop over states */
503 
504 	      if (st->statetype != uBIFURC_ST)	/* a normal (non-BIFURC) state */
505 		{
506 		    /* Connect the "beam" pointer to the appropriate
507 		     * starting place in the ynext scores we're connecting
508 		     * y to
509 		     */
510 		  switch (st->statetype) {
511 		  case uBEGIN_ST:
512 		  case uDEL_ST:
513 		    beam   = amx[aj][diff];
514 		    beamt  = atr[aj][diff];
515 		    emitsc = 0;
516 		    break;
517 		  case uMATP_ST: /* !aj toggles from 0 to 1 and vice versa */
518 		    if (diff == 1) continue;
519 		    beam   = amx[!aj][diff-2];
520 		    beamt  = atr[!aj][diff-2];
521 		    emitsc = st->emit[symi * ALPHASIZE + symj];
522 		    break;
523 		  case uMATR_ST:
524 		  case uINSR_ST:
525 		    beam   = amx[!aj][diff-1];
526 		    beamt  = atr[!aj][diff-1];
527 		    emitsc = st->emit[symj];
528 		    break;
529 		  case uMATL_ST:
530 		  case uINSL_ST:
531 		    beam   = amx[aj][diff-1];
532 		    beamt  = atr[aj][diff-1];
533 		    emitsc = st->emit[symi];
534 		    break;
535 		  case uEND_ST:
536 		    continue;
537 		  default: Die("no such state type %d", st->statetype);
538 		  }
539 		  beam  += y + st->offset;
540 		  beamt += y + st->offset;
541 		  tmx  = st->tmx;
542 
543 
544 		  /* Init for ynext == 0 case
545 		   */
546 		  *scp = *beam + *tmx;
547 		  *sct = *beamt;
548 
549 		  /* Calculate remaining cases
550 		   */
551 		  for (ynext = 1; ynext < st->connectnum; ynext++)
552 		    {
553 		      beam++;
554 		      beamt++;
555 		      tmx++;
556 		      if (*beam > *scp)
557 			{
558 			  sc = *beam + *tmx;
559 			  if (sc > *scp)
560 			    {
561 			      *scp = sc;
562 			      *sct = *beamt;
563 			    }
564 			}
565 		    }
566 
567 		  /* Add emission scores now
568 		   */
569 		  *scp += emitsc;
570 
571 		  /* Make a copy into bmx, btr if necessary
572 		   */
573 		  if (st->statetype == uBEGIN_ST)
574 		    {
575 		      bmx[y][j][diff] = *scp;
576 		      btr[y][j][diff] = *sct;
577 		    }
578 		} /* end block of normal state stuff */
579 
580 		else		/* a BIFURC state */
581 		  {
582 		    leftdiff = diff;
583 		    leftj    = j;
584 		    right_p  = bmx[st->tmx[1]][j];
585 		    left_p   = bmx[st->tmx[0]];
586 
587 				/* init w/ case that left branch emits it all */
588 		    *scp = left_p[leftj][leftdiff] + *right_p;
589 		    while (leftdiff > 0)
590 		      {
591 			leftdiff--;
592 			leftj--;
593 			right_p++;
594 
595 			sc = left_p[leftj][leftdiff] + *right_p;
596 			if (sc > *scp)
597 			  *scp = sc;
598 		      }
599 				/* keep copy of score in bmx, for tracing */
600 		    bmx[y][j][diff] = *scp;
601 		  }
602 
603 	      } /* end loop over states */
604 	  } /* end loop over diff */
605       } /* end loop over j */
606   return 1;
607 }
608 
609 
610 /* Function: trace_mx()
611  *
612  * Purpose:  Trace stage of the dynamic programming: starting
613  *           at j=N, i=1, k=0/BEGIN, trace back the optimal
614  *           path. Returns a binary tree, ret_trace.
615  *           Caller is reponsible for free'ing ret_trace.
616  */
617 static int
trace_mx(struct istate_s * icm,char * seq,int N,int *** bmx,TBACK *** btr,struct trace_s ** ret_trace)618 trace_mx(struct istate_s *icm,       /* the model to align               */
619 	 char            *seq,       /* sequence to align it to  1..N    */
620 	 int              N,
621 	 int           ***bmx,       /* matrix of BEGIN scores           */
622 	 TBACK         ***btr,       /* matrix of BIFURC/END tbacks      */
623 	 struct trace_s **ret_trace) /* RETURN: the traceback tree       */
624 {
625   struct trace_s *tr;           /* the traceback tree under construction */
626   struct trace_s *curr_tr;      /* ptr to node of tr we're working on    */
627   struct tracestack_s *dolist;  /* pushdown stack of active tr nodes     */
628   int diff,i, j;		/* coords in mx (0..N)                   */
629   int y;			/* counter for states (0..statenum-1)    */
630   int leftdiff;
631   int leftj;
632   int *right_p;
633   int  i2, j2;			/* what's left unaccounted for at segment end */
634   int  diff2;
635   int  end_y;			/* index of state that ends segment           */
636 
637   /* Initialize.
638    * Start at i = 1, j = N and work towards diagonal
639    */
640   InitTrace(&tr, NULL);         /* start a trace tree */
641   dolist = InitTracestack();	/* start a stack for traversing the trace tree */
642 
643   curr_tr = AttachTrace(tr, NULL, 0, N-1, 0, BEGIN_ST);
644   PushTracestack(dolist, curr_tr);
645 
646   /* Recursion. While there's active nodes in the stack, trace from them.
647    *
648    * This is cribbed from recurse_cvmx(); it's almost the exact reverse.
649    * We know the best score, we just have to figure out where it came from.
650    */
651   while ((curr_tr = PopTracestack(dolist)) != NULL)
652     {
653 				/* get some useful numbers, mostly for clarity */
654 				/* which is important, since we're sort of misusing
655 				 * fields in the trace structures! */
656       i    = curr_tr->emitl+1;
657       j    = curr_tr->emitr+1;
658       y    = curr_tr->nodeidx;
659       diff = j - i + 1;
660 
661 				/* find out which bifurc/end state terminates this
662 				   segment */
663       end_y = y+1;
664       while (icm[end_y].statetype != uBIFURC_ST &&
665 	     icm[end_y].statetype != uEND_ST)
666 	end_y++;
667 
668 				/* find out i2,j2 that the terminal bifurc/end
669 				   aligns to */
670       unpack_tb(btr[y][j][diff], &diff2, &j2);
671       i2 = j2 - diff2 + 1;
672 
673       /* For now, just write out what the traceback looks like.
674        */
675       printf("Segment from state %d to %d: aligns to %d..%d/%d..%d\n",
676 	     y, end_y, i, i2-1, j2+1, j);
677 
678 
679 				/* push next BEGINs onto stack; they are connected
680 				   to BIFURC end_y */
681       if (icm[end_y].statetype == uBIFURC_ST)
682 	{
683 	  if (i2 > j2)
684 	    {
685 	      PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2-1, j2-1, icm[end_y].tmx[1], BEGIN_ST));
686 	      PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2-1, j2-1, icm[end_y].tmx[0], BEGIN_ST));
687 	    }
688 	  else
689 	    {
690 	      leftdiff = diff2;
691 	      leftj    = j2;
692 	      right_p  = bmx[icm[end_y].tmx[1]][j2];
693 
694 	      while (leftdiff >= 0)
695 		{
696 		  if (bmx[end_y][j2][diff] == bmx[icm[end_y].tmx[0]][leftj][leftdiff] + *right_p)
697 		    {
698 		      printf("found the bifurc: it is %d-%d and %d-%d\n",
699 			     i2 -1, i2+leftdiff-2, i2 + leftdiff-1, j2-1);
700 		      PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2 + leftdiff-1, j2-1,
701 							 icm[end_y].tmx[1], BEGIN_ST));
702 		      PushTracestack(dolist, AttachTrace(curr_tr, NULL, i2 -1, i2+leftdiff-2,
703 							 icm[end_y].tmx[0], BEGIN_ST));
704 		      break;
705 		    }
706 		  leftdiff--;
707 		  leftj--;
708 		  right_p++;
709 		}
710 	      if (leftdiff < 0)
711 		Die("bifurc reconstruction failed at ijy %d,%d,%d", i,j,y);
712 	    }
713 	}
714     } /* (while something is in the tracestack) */
715 
716   FreeTracestack(dolist);
717 
718   *ret_trace = tr;
719   return 1;
720 }
721 
722 
723 
724 #ifdef DEBUG
725 
726 /* Function: print_tb()
727  *
728  * Purpose:  Print the two numbers of a packed traceback.
729  */
730 static void
print_tb(int tb)731 print_tb(int tb)
732 {
733   int i, j;
734 
735   unpack_tb(tb, &i, &j);
736   printf("%d %d\n", i, j);
737 }
738 
739 /* Function: PrintSmallMX()
740  *
741  * Purpose:  Debugging output; print out the three-dimensional
742  *           auxiliary alignment matrix produced by the
743  *           small-memory version.
744  */
745 static void
print_small_mx(FILE * fp,struct istate_s * icm,int statenum,char * seq,int N,int *** bmx,TBACK *** btr)746 print_small_mx(FILE            *fp,    /* open file or just stdout/stderr */
747 	       struct istate_s *icm,   /* the model to align           */
748 	       int              statenum, /*  number of states in icm  */
749 	       char            *seq,   /* sequence, 1..N               */
750 	       int              N,     /* length of seq                */
751 	       int           ***bmx,   /* auxiliary matrix             */
752 	       TBACK         ***btr)   /* traceback ptrs for bmx       */
753 {
754   int j, diff, y;		/* indices in 3D matrix */
755   int tbdiff, tbj;              /* traceback pointers to a j, diff position */
756 
757   for (y = 0; y < statenum; y++)
758     if (bmx[y] != NULL)
759       {
760 	fprintf(fp, "### B Matrix for state %d, type %d (%s), from node %d\n",
761 		y, icm[y].statetype, UstatetypeName(icm[y].statetype), icm[y].nodeidx);
762 	fprintf(fp, "     ");
763 	for (diff = 0; diff <= N; diff++)
764 	  fprintf(fp, "%6d  ", diff);
765 	fprintf(fp, "\n");
766 
767 	for (j = 0; j <= N; j++)
768 	  {
769 	    fprintf(fp, "%c %3d ", (j > 0) ? seq[j] : (char) '*', j);
770 	    for (diff = 0; diff <= j; diff++)
771 	      fprintf(fp, "%6d  ", bmx[y][j][diff]);
772 	    fprintf(fp, "\n      ");
773 	    if (icm[y].statetype == uBEGIN_ST)
774 	      for (diff = 0; diff <= j; diff++)
775 		{
776 		  unpack_tb(btr[y][j][diff], &tbdiff, &tbj);
777 		  fprintf(fp, "%3d/%3d ", tbdiff, tbj);
778 		}
779 	    fprintf(fp, "\n");
780 	  }
781 	fprintf(fp, "\n\n");
782       }
783 }
784 #endif /* DEBUG */
785 
786 
787