/* trace.c
 * cove 1.0: Mon May 17 09:38:14 1993
 * moved to cove 2.0, Mon Sep 6 13:34:55 1993
 *
 * Unlike a traceback of a normal HMM alignment, which is linear,
 * the traceback of a covariance HMM is a tree structure. Here
 * we provide support for the traceback data structures: the
 * tree itself, and a pushdown stack used for traversing the
 * tree.
 *
 * The trace tree structure has a dummy node at its beginning,
 * and dummy end nodes at the termination of each branch. Non-BIFURC
 * states have a NULL right branch.
 *
 * The pushdown stack is a growable array; an empty stack is
 * signified by PopTracestack() returning NULL.
 *
 * (A short sketch illustrating these conventions follows the
 * includes, under the TRACE_EXAMPLES guard.)
 */


#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>		/* for log() in TraceScore() */

#include "structs.h"		/* struct trace_s and struct tracestack_s */
#include "funcs.h"
#include "version.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif
#ifdef DEBUG
#include <assert.h>
#endif

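/* The tree conventions described in the file header can be checked
 * mechanically. This is an illustrative sketch, not part of cove:
 * the function name trace_shape_ok() and the TRACE_EXAMPLES guard
 * are invented for the example.
 */
#ifdef TRACE_EXAMPLES
static int
trace_shape_ok(struct trace_s *begin)
{
  struct trace_s *tr;

  if (begin->type != uBEGIN_ST) return 0;  /* tree starts at a dummy BEGIN node */
  for (tr = begin; tr->nxtl != NULL; tr = tr->nxtl)
    continue;				   /* follow the left branch to its end */
  return (tr->type == uEND_ST && tr->nxtr == NULL); /* branch ends in a dummy END */
}
#endif /* TRACE_EXAMPLES */
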
/* Function: InitTrace()
 *
 * Purpose:  Initialize a traceback tree structure.
 *           ret_tmem may be passed as NULL for default behavior;
 *           if ret_tmem is non-NULL, optimized memory behavior
 *           (a trace pool) is enabled for the trace.
 *
 * Return:   (void). The new tree is returned via ret_new; the
 *           active trace pool, if any, via ret_tmem.
 */
void
InitTrace(struct trace_s **ret_new, struct trmem_s **ret_tmem)
{
  struct trace_s *new;
  struct trace_s *end;
  struct trmem_s *pool;

  if (ret_tmem != NULL)
    {
      InitTracepool(&pool);
      new = PopTracepool(pool);
    }
  else if ((new = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
    Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);

  new->emitl = new->emitr = -1;
  new->nodeidx = 0;
  new->type    = uBEGIN_ST;
  new->nxtr    = NULL;
  new->prv     = NULL;

  if (ret_tmem != NULL)
    end = PopTracepool(pool);
  else if ((end = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
    Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
  end->type  = uEND_ST;
  end->emitl = end->emitr = end->nodeidx = -1;
  end->nxtr  = end->nxtl = NULL;

  end->prv  = new;
  new->nxtl = end;

  *ret_new = new;
  if (ret_tmem != NULL) *ret_tmem = pool;
}
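
/* Usage sketch (not part of cove): default-memory behavior. With
 * ret_tmem passed as NULL, nodes come from individual malloc() calls,
 * and the matching FreeTrace() takes a NULL pool. The function name
 * example_default_trace() and the TRACE_EXAMPLES guard are invented.
 */
#ifdef TRACE_EXAMPLES
static void
example_default_trace(void)
{
  struct trace_s *tr;

  InitTrace(&tr, NULL);		/* tr: dummy BEGIN -> dummy END */
  /* ... grow the tree here with AttachTrace(tr, NULL, ...) ... */
  FreeTrace(tr, NULL);		/* NULL pool: free node by node */
}
#endif /* TRACE_EXAMPLES */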

/* Function: AttachTrace()
 *
 * Purpose:  Attach a new node to a trace tree node, displacing the
 *           dummy END node that currently terminates the branch.
 *
 *           Because of the mechanics of tracebacks through a Viterbi matrix,
 *           we have to make sure that BIFURC children are attached
 *           right first, left second.
 *
 *           pool may be NULL (default behavior) or an active
 *           trace pool (optimized memory behavior).
 *
 * Returns:  ptr to the new node, or NULL on failure.
 */
struct trace_s *
AttachTrace(struct trace_s *parent,
	    struct trmem_s *pool,
	    int             emitl,
	    int             emitr,
	    int             nodeidx,
	    int             type)
{
  struct trace_s *new;
  struct trace_s *end;

  if (parent->nxtr != NULL)
    Die("That trace node is already full, fool.");

  /* If the left branch is already connected to something, swap it over to the
   * right (thus enforcing the necessary rule that BIFURCs attach to the right
   * branch first), and attach a new dummy END to the left branch.
   */
  if (parent->nxtl->nxtl != NULL)
    {
      parent->nxtr = parent->nxtl;

      if (pool != NULL)
	end = PopTracepool(pool);
      else if ((end = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
	Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
      end->type  = uEND_ST;
      end->emitl = end->emitr = end->nodeidx = -1;
      end->nxtl  = end->nxtr = NULL;

      end->prv     = parent;
      parent->nxtl = end;
    }

  if (pool != NULL)
    new = PopTracepool(pool);
  else if ((new = (struct trace_s *) malloc (sizeof(struct trace_s))) == NULL)
    Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
  new->nxtr = NULL;

  new->nxtl = parent->nxtl;
  new->prv  = parent;
  parent->nxtl->prv = new;	/* end state also points back, to new */
  parent->nxtl      = new;

  new->emitl   = emitl;
  new->emitr   = emitr;
  new->nodeidx = nodeidx;
  new->type    = type;

  return new;
}
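
/* Usage sketch (not part of cove): the right-first rule for BIFURC
 * children. The first child attached to a node sits on its left
 * branch; attaching a second child swaps the first over to the right
 * branch. So the right child must be attached first. The statetype
 * uBIFURC_ST, the node indices, and the -1 emit placeholders are
 * assumptions for illustration only.
 */
#ifdef TRACE_EXAMPLES
static void
example_bifurc(struct trace_s *tr)	/* tr fresh from InitTrace() */
{
  struct trace_s *bif, *rchild, *lchild;

  bif    = AttachTrace(tr,  NULL, -1, -1, 1, uBIFURC_ST);
  rchild = AttachTrace(bif, NULL, -1, -1, 3, uBEGIN_ST); /* right child: attach first  */
  lchild = AttachTrace(bif, NULL, -1, -1, 2, uBEGIN_ST); /* left child: this call swaps
							  * rchild over to bif->nxtr   */
}
#endif /* TRACE_EXAMPLES */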

/* Function: FreeTrace()
 *
 * Purpose:  Free a traceback tree. If the trace was built from a
 *           trace pool, pass that pool and the whole pool is freed;
 *           else pass NULL and the tree is freed node by node.
 */
void
FreeTrace(struct trace_s *tr, struct trmem_s *pool)
{
  if (pool == NULL)
    {
      struct tracestack_s *stack;
      struct trace_s      *currtr;

      stack = InitTracestack();
      PushTracestack(stack, tr);

      while ((currtr = PopTracestack(stack)) != NULL)
	{
	  if (currtr->nxtr != NULL)
	    PushTracestack(stack, currtr->nxtr);
	  if (currtr->nxtl != NULL)
	    PushTracestack(stack, currtr->nxtl);
	  free(currtr);
	}
      FreeTracestack(stack);
    }
  else
    FreeTracepool(pool);
}

/* Function: DeleteTracenode()
 *
 * Purpose:  Excise a node from a trace tree, connecting its parent
 *           directly to its children. The node itself is freed only
 *           if no trace pool is in use.
 */
void
DeleteTracenode(struct trace_s *oldtr, struct trmem_s *pool)
{
  struct trace_s *parent;

  parent = oldtr->prv;

  parent->nxtl = oldtr->nxtl;
  parent->nxtr = oldtr->nxtr;
  oldtr->nxtl->prv = parent;
  if (oldtr->nxtr) oldtr->nxtr->prv = parent;
  if (pool == NULL) free(oldtr);
}

/* Functions: InitTracepool(), PopTracepool(), FreeTracepool()
 *
 * Purpose:   Malloc() optimizations for building lots of
 *            trace trees. A "trace pool" just lets me malloc
 *            a lot of trace_s structures at once (InitTracepool).
 *            They are retrieved one at a time using PopTracepool().
 *            When done (free'ing the trace), one would call
 *            FreeTracepool().
 *
 *            Make one trace pool per trace, if using this optimization.
 */
void
InitTracepool(struct trmem_s **ret_tmem)
{
  struct trmem_s *tmem;

  tmem = (struct trmem_s *) MallocOrDie (sizeof(struct trmem_s));
  tmem->next = 0;
  tmem->num  = TMEM_BLOCK;
  tmem->pool = (struct trace_s *) MallocOrDie (TMEM_BLOCK * sizeof(struct trace_s));
  tmem->used = InitTracestack();
  *ret_tmem = tmem;
}
struct trace_s *
PopTracepool(struct trmem_s *tmem)
{
  struct trace_s *tr;
  if (tmem->next == tmem->num)
    {				/* current block exhausted; allocate a new one */
      PushTracestack(tmem->used, tmem->pool);
      tmem->next = 0;
      tmem->num  = TMEM_BLOCK;
      tmem->pool = (struct trace_s *) MallocOrDie (TMEM_BLOCK * sizeof(struct trace_s));
    }
  tr = tmem->pool + tmem->next;
  tmem->next++;
  return tr;
}
void
FreeTracepool(struct trmem_s *tmem)
{
  struct trace_s *pop;

  while ((pop = PopTracestack(tmem->used)) != NULL)
    free(pop);
  FreeTracestack(tmem->used);
  free(tmem->pool);
  free(tmem);
}
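
/* Usage sketch (not part of cove): pooled-memory behavior. Passing a
 * non-NULL ret_tmem to InitTrace() creates the pool; the same pool is
 * then handed to every AttachTrace() call and finally to FreeTrace().
 * One pool per trace. Names under the TRACE_EXAMPLES guard are
 * invented; the emit/node arguments are placeholders.
 */
#ifdef TRACE_EXAMPLES
static void
example_pooled_trace(void)
{
  struct trace_s *tr;
  struct trmem_s *pool;

  InitTrace(&tr, &pool);			/* creates pool, pops two nodes  */
  AttachTrace(tr, pool, 0, 10, 1, uMATP_ST);	/* new nodes come from the pool  */
  FreeTrace(tr, pool);				/* frees the pool blocks at once */
}
#endif /* TRACE_EXAMPLES */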


/* Functions: InitTracestack()
 *            PushTracestack()
 *            PopTracestack()
 *            FreeTracestack()
 *
 * Purpose:   Implementation of the pushdown stack for
 *            traversing traceback trees.
 */
struct tracestack_s *
InitTracestack(void)
{
  struct tracestack_s *stack;

  stack = (struct tracestack_s *) MallocOrDie (sizeof(struct tracestack_s));
  stack->next = 0;
  stack->num  = TSTACK_BLOCK;
  stack->list = (struct trace_s **) MallocOrDie (sizeof(struct trace_s *) * TSTACK_BLOCK);
  return stack;
}

void
PushTracestack(struct tracestack_s *stack, struct trace_s *tracenode)
{
  if (stack->next == stack->num)
    {
      stack->num += TSTACK_BLOCK;
      stack->list = (struct trace_s **) ReallocOrDie (stack->list, sizeof(struct trace_s *) * stack->num);
    }
  stack->list[stack->next] = tracenode;
  stack->next++;
}
struct trace_s *
PopTracestack(struct tracestack_s *stack)
{
  struct trace_s *pop;

  if (stack->next == 0) return NULL;
  stack->next--;
  pop = stack->list[stack->next];
  return pop;
}
void
FreeTracestack(struct tracestack_s *stack)
{
  free(stack->list);
  free(stack);
}
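
/* Usage sketch (not part of cove): the traversal idiom used by
 * FreeTrace(), TraceCount(), and TraceScore(). Pushing nxtr before
 * nxtl makes the left branch come off the stack first. count_nodes()
 * and the TRACE_EXAMPLES guard are invented names.
 */
#ifdef TRACE_EXAMPLES
static int
count_nodes(struct trace_s *tr)
{
  struct tracestack_s *stack;
  struct trace_s      *curr;
  int n = 0;

  stack = InitTracestack();
  PushTracestack(stack, tr);
  while ((curr = PopTracestack(stack)) != NULL)
    {
      n++;
      if (curr->nxtr != NULL) PushTracestack(stack, curr->nxtr);
      if (curr->nxtl != NULL) PushTracestack(stack, curr->nxtl);
    }
  FreeTracestack(stack);
  return n;			/* counts dummy BEGIN and END nodes too */
}
#endif /* TRACE_EXAMPLES */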


/* Function: TraceCount()
 *
 * Purpose:  Given a trace structure, and the sequence it traces across,
 *           and a nascent model (counts form), bump the appropriate
 *           emission and transition counters in the model.
 *
 * Return:   1 on success, 0 on failure.
 */
int
TraceCount(struct cm_s    *cm,     /* model              */
	   char           *seq,    /* sequence, 0..len-1 */
	   double          weight, /* weight on sequence */
	   struct trace_s *tr)     /* traceback          */
{
  struct tracestack_s *dolist;     /* stack for traversal of traceback tree */
  struct trace_s      *curr;       /* current node in the tree */
  int symr, syml;
#ifdef DEBUG
  int len;
  len = strlen(seq);
#endif

  dolist = InitTracestack();
  PushTracestack(dolist, tr->nxtl);

  while ((curr = PopTracestack(dolist)) != NULL)
    {
      /* ignore END states */
      if (curr->nodeidx == -1 || curr->nxtl == NULL)
	continue;

      /* BIFURC states: no transits, no emission */
      if (curr->nxtr != NULL)
	{
#ifdef DEBUG
	  assert(curr->nxtr != NULL && curr->nxtl != NULL);
#endif
	  PushTracestack(dolist, curr->nxtr);
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uINSL_ST)
	{
#ifdef DEBUG
	  assert(curr->emitl >= 0 && curr->emitl < len);
#endif
	  syml = SymbolIndex(seq[curr->emitl]);
#ifdef DEBUG
	  assert(syml >= 0 && syml < 4);
	  assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes);
	  assert(curr->nxtl != NULL);
#endif
	  cm->nd[curr->nodeidx].tmx[INSL_ST][StatetypeIndex(curr->nxtl->type)] += weight;
	  cm->nd[curr->nodeidx].il_emit[syml] += weight;
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uINSR_ST)
	{
#ifdef DEBUG
	  assert(curr->emitr >= 0 && curr->emitr < len);
#endif
	  symr = SymbolIndex(seq[curr->emitr]);
#ifdef DEBUG
	  assert(symr >= 0 && symr < 4);
	  assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes);
	  assert(curr->nxtl != NULL);
#endif
	  cm->nd[curr->nodeidx].tmx[INSR_ST][StatetypeIndex(curr->nxtl->type)] += weight;
	  cm->nd[curr->nodeidx].ir_emit[symr] += weight;
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uMATP_ST)
	{
#ifdef DEBUG
	  assert(curr->emitr >= 0 && curr->emitr < len);
	  assert(curr->emitl >= 0 && curr->emitl < len);
#endif
	  syml = SymbolIndex(seq[curr->emitl]);
	  symr = SymbolIndex(seq[curr->emitr]);
#ifdef DEBUG
	  assert(syml >= 0 && syml < 4);
	  assert(symr >= 0 && symr < 4);
	  assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes);
	  assert(curr->nxtl != NULL);
#endif
	  cm->nd[curr->nodeidx].tmx[MATP_ST][StatetypeIndex(curr->nxtl->type)] += weight;
	  cm->nd[curr->nodeidx].mp_emit[syml][symr] += weight;
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uMATL_ST)
	{
#ifdef DEBUG
	  assert(curr->emitl >= 0 && curr->emitl < len);
#endif
	  syml = SymbolIndex(seq[curr->emitl]);
#ifdef DEBUG
	  assert(syml >= 0 && syml < 4);
	  assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes);
	  assert(curr->nxtl != NULL);
#endif
	  cm->nd[curr->nodeidx].tmx[MATL_ST][StatetypeIndex(curr->nxtl->type)] += weight;
	  cm->nd[curr->nodeidx].ml_emit[syml] += weight;
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uMATR_ST)
	{
#ifdef DEBUG
	  assert(curr->emitr >= 0 && curr->emitr < len);
#endif
	  symr = SymbolIndex(seq[curr->emitr]);
#ifdef DEBUG
	  assert(symr >= 0 && symr < 4);
	  assert(curr->nodeidx > 0 && curr->nodeidx < cm->nodes);
	  assert(curr->nxtl != NULL);
#endif
	  cm->nd[curr->nodeidx].tmx[MATR_ST][StatetypeIndex(curr->nxtl->type)] += weight;
	  cm->nd[curr->nodeidx].mr_emit[symr] += weight;
	  PushTracestack(dolist, curr->nxtl);
	}

      else			/* DEL or BEGIN state */
	{
#ifdef DEBUG
	  assert(curr->nodeidx >= 0 && curr->nodeidx < cm->nodes);
	  assert(curr->nxtl != NULL);
	  assert(curr->nxtl->type >= 0 && curr->nxtl->type < STATETYPES);
#endif
	  cm->nd[curr->nodeidx].tmx[DEL_ST][StatetypeIndex(curr->nxtl->type)] += weight;
	  PushTracestack(dolist, curr->nxtl);
	}
    }

  FreeTracestack(dolist);
  return 1;
}
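
/* Usage sketch (not part of cove): counting one traceback into a
 * counts-form model. Assumes cm is a counts-form model and tr is a
 * traceback of seq obtained elsewhere; example_count() is an invented
 * name.
 */
#ifdef TRACE_EXAMPLES
static void
example_count(struct cm_s *cm, char *seq, struct trace_s *tr)
{
  if (! TraceCount(cm, seq, 1.0, tr))	/* weight 1.0: unweighted sequence */
    Die("TraceCount() failed");
}
#endif /* TRACE_EXAMPLES */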


/* Function: TraceCountPrior()
 *
 * Purpose:  Same as TraceCount(), except that we register the counts
 *           in a prior instead of a model. Used for "training"
 *           new priors.
 *
 * Return:   1 on success, 0 on failure.
 */
int
TraceCountPrior(struct cm_s    *cm,     /* covariance model    */
		struct prior_s *prior,  /* prior to count into */
		char           *seq,    /* sequence, 0..len-1  */
		double          weight, /* weight on sequence  */
		struct trace_s *tr)     /* traceback           */
{
  struct tracestack_s *dolist;  /* stack for traversal of traceback tree */
  struct trace_s      *curr;    /* current node in the tree */
  int symr, syml;
  int fnode, tnode;
  int fs, ts;

  dolist = InitTracestack();
  PushTracestack(dolist, tr->nxtl);

  while ((curr = PopTracestack(dolist)) != NULL)
    {
      /* ignore END states */
      if (curr->nodeidx == -1 || curr->nxtl == NULL)
	continue;

      /* BIFURC states: no transits, no emission */
      if (curr->nxtr != NULL)
	{
	  PushTracestack(dolist, curr->nxtr);
	  PushTracestack(dolist, curr->nxtl);
	  continue;
	}

      syml = symr = 0;
      if (curr->emitl != -1 && !isgap(seq[curr->emitl]))
	syml = SymbolIndex(seq[curr->emitl]);
      if (curr->emitr != -1 && !isgap(seq[curr->emitr]))
	symr = SymbolIndex(seq[curr->emitr]);
      fnode = cm->nd[curr->nodeidx].type;
      tnode = (cm->nd[curr->nodeidx].nxt != -1) ? cm->nd[cm->nd[curr->nodeidx].nxt].type : END_NODE;
      fs    = StatetypeIndex(curr->type);
      ts    = (cm->nd[curr->nodeidx].nxt != -1) ? StatetypeIndex(curr->nxtl->type) : END_ST;

      /* Verify where we're writing in memory. Had some problems here!
       */
      if (fnode < 0 || fnode > 6)         Die("fnode is %d", fnode);
      if (tnode < 0 || tnode > 3)         Die("tnode is %d", tnode);
      if (fs < 0   || fs >= STATETYPES)   Die("fs is %d", fs);
      if (ts < 0   || ts >= STATETYPES)   Die("ts is %d", ts);
      if (syml < 0 || syml >= ALPHASIZE)  Die("syml is %d", syml);
      if (symr < 0 || symr >= ALPHASIZE)  Die("symr is %d", symr);

      prior->tprior[fnode][tnode][fs][ts] += weight;

      switch (curr->type) {
      case uMATP_ST: prior->matp_prior[syml][symr] += weight; break;
      case uMATL_ST: prior->matl_prior[syml]       += weight; break;
      case uMATR_ST: prior->matr_prior[symr]       += weight; break;
      case uINSL_ST: prior->insl_prior[syml]       += weight; break;
      case uINSR_ST: prior->insr_prior[symr]       += weight; break;
      case uDEL_ST:  break;
      default: Die("no such state type %d", curr->type);
      }

      PushTracestack(dolist, curr->nxtl);
    }
  FreeTracestack(dolist);
  return 1;
}


/* Function: TraceScore()
 *
 * Purpose:  Given a trace structure, and the sequence it traces across,
 *           and a model (probability form), calculate the log-odds
 *           probability score.
 *
 * Return:   log-odds score for the sequence, in bits.
 */
double
TraceScore(struct cm_s    *cm,   /* model              */
	   char           *seq,  /* sequence, 0..len-1 */
	   struct trace_s *tr)   /* traceback          */
{
  struct tracestack_s *dolist;   /* stack for traversal of traceback tree */
  struct trace_s      *curr;     /* current node in the tree */
  int    symr, syml;
  double score;

  score  = 0;
  dolist = InitTracestack();
  PushTracestack(dolist, tr->nxtl);
  while ((curr = PopTracestack(dolist)) != NULL)
    {
      /* ignore END states */
      if (curr->nodeidx == -1 || curr->nxtl == NULL)
	continue;

      /* BIFURC states: no transits, no emission */
      if (curr->nxtr != NULL)
	{
	  PushTracestack(dolist, curr->nxtr);
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uINSL_ST)
	{
	  syml = SymbolIndex(seq[curr->emitl]);
	  score += log(cm->nd[curr->nodeidx].tmx[INSL_ST][StatetypeIndex(curr->nxtl->type)]);
	  score += log(cm->nd[curr->nodeidx].il_emit[syml]);
	  score += log(4.0);	/* for log-odds */
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uINSR_ST)
	{
	  symr = SymbolIndex(seq[curr->emitr]);
	  score += log(cm->nd[curr->nodeidx].tmx[INSR_ST][StatetypeIndex(curr->nxtl->type)]);
	  score += log(cm->nd[curr->nodeidx].ir_emit[symr]);
	  score += log(4.0);	/* for log-odds */
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uMATP_ST)
	{
	  syml = SymbolIndex(seq[curr->emitl]);
	  symr = SymbolIndex(seq[curr->emitr]);
	  score += log(cm->nd[curr->nodeidx].tmx[MATP_ST][StatetypeIndex(curr->nxtl->type)]);
	  score += log(cm->nd[curr->nodeidx].mp_emit[syml][symr]);
	  score += log(16.0);	/* for log-odds */
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uMATL_ST)
	{
	  syml = SymbolIndex(seq[curr->emitl]);
	  score += log(cm->nd[curr->nodeidx].tmx[MATL_ST][StatetypeIndex(curr->nxtl->type)]);
	  score += log(cm->nd[curr->nodeidx].ml_emit[syml]);
	  score += log(4.0);	/* for log-odds */
	  PushTracestack(dolist, curr->nxtl);
	}

      else if (curr->type == uMATR_ST)
	{
	  symr = SymbolIndex(seq[curr->emitr]);
	  score += log(cm->nd[curr->nodeidx].tmx[MATR_ST][StatetypeIndex(curr->nxtl->type)]);
	  score += log(cm->nd[curr->nodeidx].mr_emit[symr]);
	  score += log(4.0);	/* for log-odds */
	  PushTracestack(dolist, curr->nxtl);
	}

      else			/* DEL or BEGIN state */
	{
	  score += log(cm->nd[curr->nodeidx].tmx[DEL_ST][StatetypeIndex(curr->nxtl->type)]);
	  PushTracestack(dolist, curr->nxtl);
	}
    }

  FreeTracestack(dolist);
  score = score / log(2.0);	/* convert to bits */
  return score;
}
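
/* Usage sketch (not part of cove): scoring a traceback against a
 * probability-form model. Per the log(4.0)/log(16.0) terms above, the
 * result is log-odds relative to a uniform 0.25-per-symbol background,
 * converted to bits. example_score() is an invented name.
 */
#ifdef TRACE_EXAMPLES
static void
example_score(struct cm_s *cm, char *seq, struct trace_s *tr)
{
  double sc;

  sc = TraceScore(cm, seq, tr);
  printf("trace scores %.2f bits\n", sc);
}
#endif /* TRACE_EXAMPLES */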