1 /* trace.c
2  * cove 1.0: Mon May 17 09:38:14 1993
3  * moved to cove 2.0, Mon Sep  6 13:34:55 1993
4  *
5  * Unlike a traceback of a normal HMM alignment, which is linear,
6  * the traceback of a covariance HMM is a tree structure. Here
7  * we provide support for the traceback data structures: the
8  * tree itself, and a pushdown stack used for traversing the
9  * tree.
10  *
11  * The trace tree structure has a dummy node at its beginning,
12  * and dummy end nodes at the termination of each branch. Non-BIFURC
13  * states have a NULL right branch.
14  *
15  * The pushdown stack structure has a dummy begin node, and the
16  * end is signified by a final NULL ptr.
17  */
18 
19 
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>		/* log(); TraceScore() needs a correct prototype */

#include "structs.h"		/* struct trace_s and struct tracestack_s */
#include "funcs.h"
#include "version.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif
#ifdef DEBUG
#include <assert.h>
#endif
34 
35 /* Function: InitTrace()
36  *
37  * Purpose:  Initialize a traceback tree structure.
38  *           ret_tmem may be passed as NULL for default behavior;
39  *           if ret_tmem is passed, enables optimized memory
40  *           behavior for traces.
41  *
42  * Return:   ptr to the new tree.
43  */
44 void
InitTrace(struct trace_s ** ret_new,struct trmem_s ** ret_tmem)45 InitTrace(struct trace_s **ret_new, struct trmem_s **ret_tmem)
46 {
47   struct trace_s *new;
48   struct trace_s *end;
49   struct trmem_s *pool;
50 
51   if (ret_tmem != NULL)
52     {
53       InitTracepool(&pool);
54       new = PopTracepool(pool);
55     }
56   else if ((new = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
57     Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
58 
59   new->emitl = new->emitr = -1;
60   new->nodeidx = 0;
61   new->type    = uBEGIN_ST;
62   new->nxtr    = NULL;
63   new->prv     = NULL;
64 
65   if (ret_tmem != NULL)
66     end = PopTracepool(pool);
67   else if ((end = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
68     Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
69   end->type = uEND_ST;
70   end->emitl = end->emitr = end->nodeidx = -1;
71   end->nxtr = end->nxtl = NULL;
72 
73   end->prv  = new;
74   new->nxtl = end;
75 
76   *ret_new = new;
77   if (ret_tmem != NULL) *ret_tmem = pool;
78 }
79 
80 /* Function: AttachTrace()
81  *
82  * Purpose:  attach a new node to a tracetree node.
83  *           There are dummy END nodes.
84  *
85  *           Because of the mechanics of tracebacks through a Viterbi matrix,
86  *           we have to make sure that BIFURC children are attached
87  *           right first, left second.
88  *
89  *           trmem_s may be NULL (default behavior) or an active
90  *           trace pool (optimized memory behavior)
91  *
92  * Returns:  ptr to the new node, or NULL on failure.
93  */
94 struct trace_s *
AttachTrace(struct trace_s * parent,struct trmem_s * pool,int emitl,int emitr,int nodeidx,int type)95 AttachTrace(struct trace_s *parent,
96 	    struct trmem_s *pool,
97 	    int             emitl,
98 	    int             emitr,
99 	    int             nodeidx,
100 	    int             type)
101 {
102   struct trace_s *new;
103   struct trace_s *end;
104 
105   if (parent->nxtr != NULL)
106     Die("That trace node is already full, fool.");
107 
108   /* If left branch is already connected to something, swap it over to the
109    * right (thus enforcing the necessary rule that BIFURCS attach to the right
110    * branch first), and attach a new dummy end to the left branch.
111    */
112   if (parent->nxtl->nxtl != NULL)
113     {
114       parent->nxtr = parent->nxtl;
115 
116       if (pool != NULL)
117 	end = PopTracepool(pool);
118       else if ((end = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
119 	Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
120       end->type = uEND_ST;
121       end->emitl = end->emitr = end->nodeidx = -1;
122       end->nxtl = end->nxtr = NULL;
123 
124       end->prv     = parent;
125       parent->nxtl = end;
126     }
127 
128   if (pool != NULL)
129     new = PopTracepool(pool);
130   else if ((new = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
131     Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
132   new->nxtr = NULL;
133 
134   new->nxtl         = parent->nxtl;
135   new->prv          = parent;
136   parent->nxtl->prv = new;	/* end state also points back, to new */
137   parent->nxtl      = new;
138 
139   new->emitl   = emitl;
140   new->emitr   = emitr;
141   new->nodeidx = nodeidx;
142   new->type    = type;
143 
144   return new;
145 }
146 
147 void
FreeTrace(struct trace_s * tr,struct trmem_s * pool)148 FreeTrace(struct trace_s *tr, struct trmem_s *pool)
149 {
150   if (pool == NULL)
151     {
152       struct tracestack_s *stack;
153       struct trace_s      *currtr;
154 
155       stack = InitTracestack();
156       PushTracestack(stack, tr);
157 
158       while ((currtr = PopTracestack(stack)) != NULL)
159 	{
160 	  if (currtr->nxtr != NULL)
161 	    PushTracestack(stack, currtr->nxtr);
162 	  if (currtr->nxtl != NULL)
163 	    PushTracestack(stack, currtr->nxtl);
164 	  free(currtr);
165 	}
166       FreeTracestack(stack);
167     }
168   else
169     FreeTracepool(pool);
170 }
171 
172 void
DeleteTracenode(struct trace_s * oldtr,struct trmem_s * pool)173 DeleteTracenode(struct trace_s *oldtr, struct trmem_s *pool)
174 {
175   struct trace_s *parent;
176 
177   parent = oldtr->prv;
178 
179   parent->nxtl = oldtr->nxtl;
180   parent->nxtr = oldtr->nxtr;
181   oldtr->nxtl->prv = parent;
182   if (oldtr->nxtr) oldtr->nxtr->prv = parent;
183   if (pool == NULL) free(oldtr);
184 }
185 
186 /* Functions: InitTracepool(), PopTracepool(), FreeTracepool()
187  *
188  * Purpose:   Malloc() optimizations for building lots of
189  *            trace trees. A "trace pool" just lets me malloc
190  *            a lot of trace_s structures at once (InitTracepool).
191  *            They are retrieved one at a time using PopTracepool().
192  *            When done (free'ing the trace), one would call
193  *            FreeTracepool().
194  *
195  *            Make one trace pool per trace, if using this optimization.
196  */
197 void
InitTracepool(struct trmem_s ** ret_tmem)198 InitTracepool(struct trmem_s **ret_tmem)
199 {
200   struct trmem_s *tmem;
201 
202   tmem = (struct trmem_s *) MallocOrDie (sizeof(struct trmem_s));
203   tmem->next = 0;
204   tmem->num  = TMEM_BLOCK;
205   tmem->pool = (struct trace_s *) MallocOrDie (TMEM_BLOCK * sizeof(struct trace_s));
206   tmem->used = InitTracestack();
207   *ret_tmem = tmem;
208 }
209 struct trace_s *
PopTracepool(struct trmem_s * tmem)210 PopTracepool(struct trmem_s *tmem)
211 {
212   struct trace_s *tr;
213   if (tmem->next == tmem->num)
214     {				/* need a new pool */
215       PushTracestack(tmem->used, tmem->pool);
216       tmem->next = 0;
217       tmem->num  = TMEM_BLOCK;
218       tmem->pool = (struct trace_s *) MallocOrDie (TMEM_BLOCK * sizeof(struct trace_s));
219     }
220   tr = tmem->pool + tmem->next;
221   tmem->next++;
222   return tr;
223 }
224 void
FreeTracepool(struct trmem_s * tmem)225 FreeTracepool(struct trmem_s *tmem)
226 {
227   struct trace_s *pop;
228 
229   while ((pop = PopTracestack(tmem->used)) != NULL)
230     free(pop);
231   FreeTracestack(tmem->used);
232   free(tmem->pool);
233   free(tmem);
234 }
235 
236 
237 /* Functions: InitTracestack()
238  *            PushTracestack()
239  *            PopTracestack()
240  *            FreeTracestack()
241  *
242  * Purpose:   Implementation of the pushdown stack for
243  *            traversing traceback trees.
244  */
245 struct tracestack_s *
InitTracestack(void)246 InitTracestack(void)
247 {
248   struct tracestack_s *stack;
249 
250   stack = (struct tracestack_s *) MallocOrDie (sizeof(struct tracestack_s));
251   stack->next = 0;
252   stack->num  = TSTACK_BLOCK;
253   stack->list = (struct trace_s **) MallocOrDie (sizeof(struct trace_s *) * TSTACK_BLOCK);
254   return stack;
255 }
256 
257 void
PushTracestack(struct tracestack_s * stack,struct trace_s * tracenode)258 PushTracestack(struct tracestack_s *stack, struct trace_s *tracenode)
259 {
260   if (stack->next == stack->num)
261     {
262       stack->num += TSTACK_BLOCK;
263       stack->list = (struct trace_s **) ReallocOrDie (stack->list, sizeof(struct trace_s *) * stack->num);
264     }
265   stack->list[stack->next] = tracenode;
266   stack->next++;
267 }
268 struct trace_s *
PopTracestack(struct tracestack_s * stack)269 PopTracestack(struct tracestack_s *stack)
270 {
271   struct trace_s *pop;
272 
273   if (stack->next == 0) return NULL;
274   stack->next--;
275   pop = stack->list[stack->next];
276   return pop;
277 }
278 void
FreeTracestack(struct tracestack_s * stack)279 FreeTracestack(struct tracestack_s *stack)
280 {
281   free(stack->list);
282   free(stack);
283 }
284 
285 
286 
287 /* Function: TraceCount()
288  *
289  * Purpose:  Given a trace structure, and the sequence it traces across,
290  *           and a nascent model (counts form), bump the appropriate
291  *           emission and transition counters in the model.
292  *
293  * Return:   1 on success, 0 on failure.
294  */
295 int
TraceCount(struct cm_s * cm,char * seq,double weight,struct trace_s * tr)296 TraceCount(struct cm_s   *cm,        /* model               */
297 	   char          *seq,       /* sequence, 0..len-1  */
298 	   double         weight,    /* weight on sequence  */
299 	   struct trace_s *tr)       /* traceback           */
300 {
301   struct tracestack_s *dolist;  /* stack for traversal of traceback tree */
302   struct trace_s      *curr;    /* current node in the tree              */
303   int   symr, syml;
304 #ifdef DEBUG
305   int   len;
306   len = strlen(seq);
307 #endif
308 
309   dolist = InitTracestack();
310   PushTracestack(dolist, tr->nxtl);
311 
312   while ((curr = PopTracestack(dolist)) != NULL)
313     {
314 				/* ignore END states */
315       if (curr->nodeidx == -1 || curr->nxtl == NULL)
316 	continue;
317 
318 				/* BIFURC states: no transits, no emission */
319       if (curr->nxtr != NULL)
320 	{
321 #ifdef DEBUG
322 	  assert(curr->nxtr != NULL && curr->nxtl != NULL);
323 #endif
324 	  PushTracestack(dolist, curr->nxtr);
325 	  PushTracestack(dolist, curr->nxtl);
326 	}
327 
328       else if (curr->type == uINSL_ST)
329 	{
330 #ifdef DEBUG
331 	  assert(curr->emitl >= 0 && curr->emitl < len);
332 #endif
333 	  syml = SymbolIndex(seq[curr->emitl]);
334 #ifdef DEBUG
335 	  assert(syml >= 0 && syml < 4);
336 	  assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes);
337 	  assert(curr->nxtl != NULL);
338 #endif
339 	  cm->nd[curr->nodeidx].tmx[INSL_ST][StatetypeIndex(curr->nxtl->type)] += weight;
340 	  cm->nd[curr->nodeidx].il_emit[syml] += weight;
341 	  PushTracestack(dolist, curr->nxtl);
342 	}
343 
344       else if (curr->type == uINSR_ST)
345 	{
346 #ifdef DEBUG
347 	  assert(curr->emitr >= 0 && curr->emitr < len);
348 #endif
349 	  symr = SymbolIndex(seq[curr->emitr]);
350 #ifdef DEBUG
351 	  assert(symr >= 0 && symr < 4);
352 	  assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes);
353 	  assert(curr->nxtl != NULL);
354 #endif
355 	  cm->nd[curr->nodeidx].tmx[INSR_ST][StatetypeIndex(curr->nxtl->type)] += weight;
356 	  cm->nd[curr->nodeidx].ir_emit[symr] += weight;
357 	  PushTracestack(dolist, curr->nxtl);
358 	}
359 
360       else if (curr->type == uMATP_ST)
361 	{
362 #ifdef DEBUG
363 	  assert(curr->emitr >= 0 && curr->emitr < len);
364 	  assert(curr->emitl >= 0 && curr->emitl < len);
365 #endif
366 	  syml = SymbolIndex(seq[curr->emitl]);
367 	  symr = SymbolIndex(seq[curr->emitr]);
368 #ifdef DEBUG
369 	  assert(syml >= 0 && syml < 4);
370 	  assert(symr >= 0 && symr < 4);
371 	  assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes);
372 	  assert(curr->nxtl != NULL);
373 #endif
374 	  cm->nd[curr->nodeidx].tmx[MATP_ST][StatetypeIndex(curr->nxtl->type)] += weight;
375 	  cm->nd[curr->nodeidx].mp_emit[syml][symr] += weight;
376 	  PushTracestack(dolist, curr->nxtl);
377 	}
378 
379       else if (curr->type == uMATL_ST)
380 	{
381 #ifdef DEBUG
382 	  assert(curr->emitl >= 0 && curr->emitl < len);
383 #endif
384 	  syml = SymbolIndex(seq[curr->emitl]);
385 #ifdef DEBUG
386 	  assert(syml >= 0 && syml < 4);
387 	  assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes);
388 	  assert(curr->nxtl != NULL);
389 #endif
390 	  cm->nd[curr->nodeidx].tmx[MATL_ST][StatetypeIndex(curr->nxtl->type)] += weight;
391 	  cm->nd[curr->nodeidx].ml_emit[syml] += weight;
392 	  PushTracestack(dolist, curr->nxtl);
393 	}
394 
395       else if (curr->type == uMATR_ST)
396 	{
397 #ifdef DEBUG
398 	  assert(curr->emitr >= 0 && curr->emitr < len);
399 #endif
400 	  symr = SymbolIndex(seq[curr->emitr]);
401 #ifdef DEBUG
402 	  assert(symr >= 0 && symr < 4);
403 	  assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes);
404 	  assert(curr->nxtl != NULL);
405 #endif
406 	  cm->nd[curr->nodeidx].tmx[MATR_ST][StatetypeIndex(curr->nxtl->type)] += weight;
407 	  cm->nd[curr->nodeidx].mr_emit[symr] += weight;
408 	  PushTracestack(dolist, curr->nxtl);
409 	}
410 
411       else			/* DEL or BEGIN state */
412 	{
413 #ifdef DEBUG
414 	  assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes);
415 	  assert(curr->nxtl->type >= 0 && curr->nxtl->type < STATETYPES);
416 	  assert(curr->nxtl != NULL);
417 #endif
418 	  cm->nd[curr->nodeidx].tmx[DEL_ST][StatetypeIndex(curr->nxtl->type)] += weight;
419 	  PushTracestack(dolist, curr->nxtl);
420 	}
421     }
422 
423   FreeTracestack(dolist);
424   return 1;
425 }
426 
427 
428 
429 /* Function: TraceCountPrior()
430  *
431  * Purpose:  Same as above, except that we register the counts
432  *           in a prior instead of a model. Used for "training"
433  *           new priors.
434  *
435  * Return:   1 on success, 0 on failure.
436  */
437 int
TraceCountPrior(struct cm_s * cm,struct prior_s * prior,char * seq,double weight,struct trace_s * tr)438 TraceCountPrior(struct cm_s      *cm,       /* covariance model    */
439 		struct prior_s   *prior,    /* prior to count into */
440 		char             *seq,      /* sequence, 0..len-1  */
441 		double            weight,   /* weight on sequence  */
442 		struct trace_s   *tr)       /* traceback           */
443 {
444   struct tracestack_s *dolist;  /* stack for traversal of traceback tree */
445   struct trace_s      *curr;    /* current node in the tree              */
446   int   symr, syml;
447   int   fnode, tnode;
448   int   fs, ts;
449 
450   dolist = InitTracestack();
451   PushTracestack(dolist, tr->nxtl);
452 
453   while ((curr = PopTracestack(dolist)) != NULL)
454     {
455 				/* ignore END states */
456       if (curr->nodeidx == -1 || curr->nxtl == NULL)
457 	continue;
458 
459 			/* BIFURC states: no transits, no emission */
460       if (curr->nxtr != NULL)
461 	{
462 	  PushTracestack(dolist, curr->nxtr);
463 	  PushTracestack(dolist, curr->nxtl);
464 	  continue;
465 	}
466 
467       syml = symr = 0;
468       if (curr->emitl != -1 && !isgap(seq[curr->emitl]))
469 	syml = SymbolIndex(seq[curr->emitl]);
470       if (curr->emitr != -1 && !isgap(seq[curr->emitr]))
471 	symr = SymbolIndex(seq[curr->emitr]);
472       fnode = cm->nd[curr->nodeidx].type;
473       tnode = (cm->nd[curr->nodeidx].nxt != -1) ? cm->nd[cm->nd[curr->nodeidx].nxt].type : END_NODE;
474       fs    = StatetypeIndex(curr->type);
475       ts    = (cm->nd[curr->nodeidx].nxt != -1) ? StatetypeIndex(curr->nxtl->type) : END_ST;
476 
477       /* Verify where we're writing in memory. Had some problems here!
478        */
479       if (fnode < 0 || fnode > 6) Die("fnode is %d", fnode);
480       if (tnode < 0 || tnode > 3) Die("tnode is %d", tnode);
481       if (fs < 0 || fs >= STATETYPES) Die("fs is %d", fs);
482       if (ts < 0 || ts >= STATETYPES) Die("ts is %d", ts);
483       if (syml < 0 || syml >= ALPHASIZE) Die("syml is %d", syml);
484       if (symr < 0 || symr >= ALPHASIZE) Die("symr is %d", symr);
485 
486 
487       prior->tprior[fnode][tnode][fs][ts] += weight;
488 
489       switch (curr->type) {
490       case uMATP_ST: prior->matp_prior[syml][symr] += weight; break;
491       case uMATL_ST: prior->matl_prior[syml] += weight;       break;
492       case uMATR_ST: prior->matr_prior[symr] += weight;       break;
493       case uINSL_ST: prior->insl_prior[syml] += weight;       break;
494       case uINSR_ST: prior->insr_prior[symr] += weight;       break;
495       case uDEL_ST:  break;
496       default: Die("no such state type %d", curr->type);
497       }
498 
499       PushTracestack(dolist, curr->nxtl);
500     }
501   FreeTracestack(dolist);
502   return 1;
503 }
504 
505 
506 
507 
508 
509 
510 /* Function: TraceScore()
511  *
512  * Purpose:  Given a trace structure, and the sequence it traces across,
513  *           and a model (probability form), calculate the log-odds
514  *           probability score.
515  *
516  *
517  * Return:   1 on success, 0 on failure.
518  */
519 double
TraceScore(struct cm_s * cm,char * seq,struct trace_s * tr)520 TraceScore(struct cm_s   *cm,        /* model               */
521 	   char          *seq,       /* sequence, 0..len-1  */
522 	   struct trace_s *tr)       /* traceback           */
523 {
524   struct tracestack_s *dolist;  /* stack for traversal of traceback tree */
525   struct trace_s      *curr;    /* current node in the tree              */
526   int   symr, syml;
527   double  score;
528 
529   score  = 0;
530   dolist = InitTracestack();
531   PushTracestack(dolist, tr->nxtl);
532   while ((curr = PopTracestack(dolist)) != NULL)
533     {
534 				/* ignore END states */
535       if (curr->nodeidx == -1 || curr->nxtl == NULL)
536 	continue;
537 
538 				/* BIFURC states: no transits, no emission */
539       if (curr->nxtr != NULL)
540 	{
541 	  PushTracestack(dolist, curr->nxtr);
542 	  PushTracestack(dolist, curr->nxtl);
543 	}
544 
545       else if (curr->type == uINSL_ST)
546 	{
547 	  syml = SymbolIndex(seq[curr->emitl]);
548 	  score += log(cm->nd[curr->nodeidx].tmx[INSL_ST][StatetypeIndex(curr->nxtl->type)]);
549 	  score += log(cm->nd[curr->nodeidx].il_emit[syml]);
550 	  score += log(4.0);		/* for log-odds */
551 	  PushTracestack(dolist, curr->nxtl);
552 	}
553 
554       else if (curr->type == uINSR_ST)
555 	{
556 	  symr = SymbolIndex(seq[curr->emitr]);
557 	  score += log(cm->nd[curr->nodeidx].tmx[INSR_ST][StatetypeIndex(curr->nxtl->type)]);
558 	  score += log(cm->nd[curr->nodeidx].ir_emit[symr]);
559 	  score += log(4.0);		/* for log-odds */
560 	  PushTracestack(dolist, curr->nxtl);
561 	}
562 
563       else if (curr->type == uMATP_ST)
564 	{
565 	  syml = SymbolIndex(seq[curr->emitl]);
566 	  symr = SymbolIndex(seq[curr->emitr]);
567 	  score += log(cm->nd[curr->nodeidx].tmx[MATP_ST][StatetypeIndex(curr->nxtl->type)]);
568 	  score += log(cm->nd[curr->nodeidx].mp_emit[syml][symr]);
569 	  score += log(16.0);		/* for log-odds */
570 	  PushTracestack(dolist, curr->nxtl);
571 	}
572 
573       else if (curr->type == uMATL_ST)
574 	{
575 	  syml = SymbolIndex(seq[curr->emitl]);
576 	  score += log(cm->nd[curr->nodeidx].tmx[MATL_ST][StatetypeIndex(curr->nxtl->type)]);
577 	  score += log(cm->nd[curr->nodeidx].ml_emit[syml]);
578 	  score += log(4.0);		/* for log-odds */
579 	  PushTracestack(dolist, curr->nxtl);
580 	}
581 
582       else if (curr->type == uMATR_ST)
583 	{
584 
585 	  symr = SymbolIndex(seq[curr->emitr]);
586 	  score += log(cm->nd[curr->nodeidx].tmx[MATR_ST][StatetypeIndex(curr->nxtl->type)]);
587 	  score += log(cm->nd[curr->nodeidx].mr_emit[symr]);
588 	  score += log(4.0);		/* for log-odds */
589 	  PushTracestack(dolist, curr->nxtl);
590 	}
591 
592       else			/* DEL or BEGIN state */
593 	{
594 	  score += log(cm->nd[curr->nodeidx].tmx[DEL_ST][StatetypeIndex(curr->nxtl->type)]);
595 	  PushTracestack(dolist, curr->nxtl);
596 	}
597     }
598 
599   FreeTracestack(dolist);
600   score = score / log(2.0);	/* convert to bits */
601   return score;
602 }
603 
604