1 /* cp9_trace.c
2 * EPN, Wed Dec 5 13:05:17 2007
3 *
4 * Note: all of these functions originated in cp9.c [EPN 02.27.06]
5 *
6 * Support for the CM Plan 9 HMM trace CP9trace_t structure.
7 * This was based heavily on HMMER's 2.x data structures.
8 *
9 */
10
11 #include "esl_config.h"
12 #include "p7_config.h"
13 #include "config.h"
14
15 #include <stdio.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include <time.h>
19 #include <math.h>
20
21 #include "easel.h"
22 #include "esl_msa.h"
23 #include "esl_vectorops.h"
24
25 #include "hmmer.h"
26
27 #include "infernal.h"
28
29 /* Function: CP9AllocTrace(), CP9ReallocTrace(), CP9FreeTrace()
30 *
31 * Purpose: allocation and freeing of traceback structures
32 */
33 void
CP9AllocTrace(int tlen,CP9trace_t ** ret_tr)34 CP9AllocTrace(int tlen, CP9trace_t **ret_tr)
35 {
36 int status;
37 CP9trace_t *tr;
38
39 ESL_ALLOC(tr, sizeof(CP9trace_t));
40 ESL_ALLOC(tr->statetype, sizeof(char) * tlen);
41 ESL_ALLOC(tr->nodeidx, sizeof(int) * tlen);
42 ESL_ALLOC(tr->pos, sizeof(int) * tlen);
43 *ret_tr = tr;
44 return;
45
46 ERROR:
47 cm_Fail("Memory allocation error.");
48 }
49 void
CP9ReallocTrace(CP9trace_t * tr,int tlen)50 CP9ReallocTrace(CP9trace_t *tr, int tlen)
51 {
52 int status;
53 void *tmp;
54
55 ESL_RALLOC(tr->statetype, tmp, tlen * sizeof(char));
56 ESL_RALLOC(tr->nodeidx, tmp, tlen * sizeof(int));
57 ESL_RALLOC(tr->pos, tmp, tlen * sizeof(int));
58 return;
59
60 ERROR:
61 cm_Fail("Memory reallocation error.");
62 }
63 void
CP9FreeTrace(CP9trace_t * tr)64 CP9FreeTrace(CP9trace_t *tr)
65 {
66 if (tr == NULL) return;
67 free(tr->pos);
68 free(tr->nodeidx);
69 free(tr->statetype);
70 free(tr);
71 }
72
73 /* Function: CP9_fake_tracebacks()
74 *
75 * Purpose: From a consensus assignment of columns to MAT/INS, construct fake
76 * tracebacks for each individual sequence.
77 *
78 * Args: msa - msa alignment
79 * matassign - assignment of column 1 if MAT, 0 if INS;
80 * [1..alen] (off one from aseqs)
81 * ret_tr - RETURN: array of tracebacks
82 *
83 * Return: (void)
84 * ret_tr is alloc'ed here. Caller must free.
85 */
86 void
CP9_fake_tracebacks(ESL_MSA * msa,int * matassign,CP9trace_t *** ret_tr)87 CP9_fake_tracebacks(ESL_MSA *msa, int *matassign, CP9trace_t ***ret_tr)
88 {
89 if(! (msa->flags & eslMSA_DIGITAL))
90 cm_Fail("ERROR in CP9_fake_tracebacks(), msa should be digitized.\n");
91
92 int status;
93 CP9trace_t **tr;
94 int idx; /* counter over sequences */
95 int i; /* position in raw sequence (1..L) */
96 int k; /* position in HMM */
97 int apos; /* position in alignment columns */
98 int tpos; /* position in traceback */
99 int first_match; /* first match column */
100 /*int last_match;*/ /* last match column, not used */
101
102 ESL_ALLOC(tr, sizeof(CP9trace_t *) * msa->nseq);
103
104 first_match = -1;
105 /*last_match = -1;*/
106 for (apos = 0; apos < msa->alen; apos++)
107 {
108 if(matassign[apos+1] && first_match == -1) first_match = apos;
109 /*if(matassign[apos+1]) last_match = apos;*/
110 }
111
112 for (idx = 0; idx < msa->nseq; idx++)
113 {
114 CP9AllocTrace(msa->alen+2, &tr[idx]); /* allow room for B & E */
115
116 /* all traces start with M_0 state (the B state)... */
117 tr[idx]->statetype[0] = CSTB;
118 tr[idx]->nodeidx[0] = 0;
119 tr[idx]->pos[0] = 0;
120
121 i = 1;
122 k = 0;
123 tpos = 1;
124
125 for (apos = 0; apos < msa->alen; apos++)
126 {
127 tr[idx]->statetype[tpos] = CSTBOGUS; /* bogus, deliberately, to debug */
128
129 if (matassign[apos+1] && !(esl_abc_XIsGap(msa->abc, msa->ax[idx][(apos+1)])))
130 { /* MATCH */
131 k++; /* move to next model pos */
132 tr[idx]->statetype[tpos] = CSTM;
133 tr[idx]->nodeidx[tpos] = k;
134 tr[idx]->pos[tpos] = i;
135 i++;
136 tpos++;
137 }
138 else if (matassign[apos+1])
139 { /* DELETE */
140 /* We should be careful about S/W transitions; but we have
141 * an ambiguity, based on the MSA, we can't tell if we
142 * did a local begin (some M->E transition) or if we
143 * went through a bunch of D state's before the first match
144 * B->D_1 -> D_2 .... -> M_x. For now, we assume we're not in
145 * S/W mode, and treat it as the latter case, see
146 * HMMER's modelmaker.c:fake_tracebacks() for code
147 * on one *would* implement the S/W consideration IF
148 * there wasn't a B->D_1 transition allowed.
149 */
150 k++; /* *always* move on model when match column seen */
151 tr[idx]->statetype[tpos] = CSTD;
152 tr[idx]->nodeidx[tpos] = k;
153 tr[idx]->pos[tpos] = 0;
154 tpos++;
155 }
156 else if (! (esl_abc_XIsGap(msa->abc, msa->ax[idx][(apos+1)])))
157 { /* INSERT */
158 tr[idx]->statetype[tpos] = CSTI;
159 tr[idx]->nodeidx[tpos] = k;
160 tr[idx]->pos[tpos] = i;
161 i++;
162 tpos++;
163 }
164 }
165 /* all traces end with E state */
166 /* We should be careful about S/W transitions; but we have
167 * an ambiguity, based on the MSA, we can't tell if we
168 * did a local end (some M->E transition) or if we
169 * went through a bunch of D state's before the final
170 * D_M -> E transition. For now, we assume we're not in
171 * S/W mode, and treat it as the latter case, see
172 * HMMER's modelmaker.c:fake_tracebacks() for code
173 * on one *would* implement the S/W consideration IF
174 * there wasn't a D_M -> E transition allowed.
175 */
176 tr[idx]->statetype[tpos] = CSTE;
177 tr[idx]->nodeidx[tpos] = 0;
178 tr[idx]->pos[tpos] = 0;
179 tpos++;
180 tr[idx]->tlen = tpos;
181 } /* end for sequence # idx */
182
183 *ret_tr = tr;
184 return;
185
186 ERROR:
187 cm_Fail("Memory allocation error.");
188 }
189
190 /* Function: CP9TraceCount()
191 * EPN 09.04.06 based on Eddy's P7TraceCount() from HMMER's trace.c
192 *
193 * Purpose: Count a traceback into a count-based HMM structure.
194 * (Usually as part of a model parameter re-estimation.)
195 * Traceback should not have any EL state visits in it.
196 *
197 * Args: hmm - counts-based CM Plan 9 HMM
198 * dsq - sequence that traceback aligns to the HMM (1..L)
199 * wt - weight on the sequence
200 * tr - alignment of seq to HMM
201 *
202 * Return: (void)
203 */
204 void
CP9TraceCount(CP9_t * hmm,ESL_DSQ * dsq,float wt,CP9trace_t * tr)205 CP9TraceCount(CP9_t *hmm, ESL_DSQ *dsq, float wt, CP9trace_t *tr)
206 {
207 /* contract check */
208 if(dsq == NULL) cm_Fail("ERROR in CP9TraceCount(), dsq is NULL.");
209 if(hmm->flags & CPLAN9_EL) cm_Fail("CP9TraceCount(), EL states are on, which this function is not setup for.");
210
211 int tpos; /* position in tr */
212 int i; /* symbol position in seq */
213
214 for (tpos = 0; tpos < tr->tlen; tpos++)
215 {
216 i = tr->pos[tpos];
217
218 /* Emission counts.
219 */
220 if (tr->statetype[tpos] == CSTM)
221 esl_abc_FCount(hmm->abc, hmm->mat[tr->nodeidx[tpos]], dsq[i], wt);
222 else if (tr->statetype[tpos] == CSTI)
223 esl_abc_FCount(hmm->abc, hmm->ins[tr->nodeidx[tpos]], dsq[i], wt);
224
225 /* State transition counts
226 */
227 switch (tr->statetype[tpos]) {
228 case CSTB:
229 switch (tr->statetype[tpos+1]) {
230 case CSTM: hmm->begin[tr->nodeidx[tpos+1]] += wt; break;
231 case CSTI: hmm->t[0][CTMI] += wt; break;
232 case CSTD: hmm->t[0][CTMD] += wt; break;
233 default:
234 cm_Fail("illegal state transition %s->%s in traceback",
235 CP9Statetype(tr->statetype[tpos]),
236 CP9Statetype(tr->statetype[tpos+1]));
237 }
238 break;
239 case CSTM:
240 switch (tr->statetype[tpos+1]) {
241 case CSTM: hmm->t[tr->nodeidx[tpos]][CTMM] += wt; break;
242 case CSTI: hmm->t[tr->nodeidx[tpos]][CTMI] += wt; break;
243 case CSTD: hmm->t[tr->nodeidx[tpos]][CTMD] += wt; break;
244 case CSTE: hmm->end[tr->nodeidx[tpos]] += wt; break;
245 default:
246 cm_Fail("illegal state transition %s->%s in traceback",
247 CP9Statetype(tr->statetype[tpos]),
248 CP9Statetype(tr->statetype[tpos+1]));
249 }
250 break;
251 case CSTI:
252 switch (tr->statetype[tpos+1]) {
253 case CSTM: hmm->t[tr->nodeidx[tpos]][CTIM] += wt; break;
254 case CSTI: hmm->t[tr->nodeidx[tpos]][CTII] += wt; break;
255 case CSTD: hmm->t[tr->nodeidx[tpos]][CTID] += wt; break;
256 case CSTE:
257 /* This should only happen from the final insert (I_M) state */
258 if((tpos+1) != (tr->tlen-1))
259 cm_Fail("illegal state transition %s->%s (I is not final insert) in traceback",
260 CP9Statetype(tr->statetype[tpos]),
261 CP9Statetype(tr->statetype[tpos+1]));
262 hmm->t[tr->nodeidx[tpos]][CTIM] += wt; break;
263 break;
264 default:
265 cm_Fail("illegal state transition %s->%s in traceback",
266 CP9Statetype(tr->statetype[tpos]),
267 CP9Statetype(tr->statetype[tpos+1]));
268 }
269 break;
270 case CSTD:
271 switch (tr->statetype[tpos+1]) {
272 case CSTM: hmm->t[tr->nodeidx[tpos]][CTDM] += wt; break;
273 case CSTI: hmm->t[tr->nodeidx[tpos]][CTDI] += wt; break;
274 case CSTD: hmm->t[tr->nodeidx[tpos]][CTDD] += wt; break;
275 case CSTE:
276 /* This should only happen from the final delete (D_M) state */
277 if((tpos+1) != (tr->tlen-1))
278 cm_Fail("illegal state transition %s->%s (D is not final delete) in traceback",
279 CP9Statetype(tr->statetype[tpos]),
280 CP9Statetype(tr->statetype[tpos+1]));
281 hmm->t[tr->nodeidx[tpos]][CTDM] += wt; break;
282 break;
283 default:
284 cm_Fail("illegal state transition %s->%s in traceback",
285 CP9Statetype(tr->statetype[tpos]),
286 CP9Statetype(tr->statetype[tpos+1]));
287 }
288 break;
289 case CSTEL:
290 cm_Fail("EL in traceback in CP9TraceCount(), this function is being abused.");
291 break;
292 case CSTE:
293 break; /* E is the last. It makes no transitions. */
294
295 default:
296 cm_Fail("illegal state %s in traceback",
297 CP9Statetype(tr->statetype[tpos]));
298 }
299 }
300 }
301
302 /* Function: CP9TraceScore()
303 * based on HMMER 2.3.2's P7TraceScore by SRE
304 *
305 * Purpose: Score a traceback and return the score in scaled bits.
306 * Incept: EPN, Wed May 30 06:07:14 2007
307 *
308 * Args: hmm - HMM with valid log odds scores.
309 * dsq - digitized sequence that traceback aligns to the HMM (1..sq->n)
310 * tr - alignment of seq to HMM
311 *
312 * Return: (void)
313 */
314 float
CP9TraceScore(CP9_t * hmm,ESL_DSQ * dsq,CP9trace_t * tr)315 CP9TraceScore(CP9_t *hmm, ESL_DSQ *dsq, CP9trace_t *tr)
316 {
317 int score; /* total score as a scaled integer */
318 int tpos; /* position in tr */
319 char sym; /* digitized symbol in dsq */
320
321 /* Contract check */
322 if(dsq == NULL)
323 cm_Fail("ERROR in CP9TraceScore, dsq is NULL.");
324
325 /*CP9PrintTrace(stdout, tr, hmm, sq); */
326 score = 0;
327 for (tpos = 0; tpos < tr->tlen-1; tpos++)
328 {
329 sym = dsq[tr->pos[tpos]];
330
331 /* Emissions from M and I states.
332 */
333 if (tr->statetype[tpos] == CSTM)
334 score += hmm->msc[(int) sym][tr->nodeidx[tpos]];
335 else if (tr->statetype[tpos] == CSTI)
336 score += hmm->isc[(int) sym][tr->nodeidx[tpos]];
337
338 /* State transitions. Including EL emissions, EL emits on transition
339 */
340 score += CP9TransitionScoreLookup(hmm,
341 tr->statetype[tpos], tr->nodeidx[tpos],
342 tr->statetype[tpos+1], tr->nodeidx[tpos+1]);
343 }
344 return Scorify(score);
345 }
346
347 /* Function: CP9Statetype()
348 *
349 * Purpose: Returns the state type in text.
350 * Example: CP9Statetype(M) = "M"
351 */
352 char *
CP9Statetype(char st)353 CP9Statetype(char st)
354 {
355 switch (st) {
356 case CSTM: return "M";
357 case CSTD: return "D";
358 case CSTI: return "I";
359 case CSTB: return "B";
360 case CSTE: return "E";
361 case CSTEL: return "L";
362 default: return "BOGUS";
363 }
364 }
365
366 /* Function: CP9PrintTrace()
367 * based on HMMER's 2.3.2 P7PrintTrace()
368 *
369 * Purpose: Print out a traceback structure.
370 * If hmm is non-NULL, also print transition and emission scores.
371 * Incept: EPN, Wed May 30 06:07:57 2007
372 *
373 * Args: fp - stderr or stdout, often
374 * tr - trace structure to print
375 * hmm - NULL or hmm containing scores to print
376 * dsq - NULL or digitized sequence trace refers to.
377 */
378 void
CP9PrintTrace(FILE * fp,CP9trace_t * tr,CP9_t * hmm,ESL_DSQ * dsq)379 CP9PrintTrace(FILE *fp, CP9trace_t *tr, CP9_t *hmm, ESL_DSQ *dsq)
380 {
381 /* Contract check */
382 if((dsq != NULL) && (hmm == NULL))
383 cm_Fail("ERROR in CP9PrintTrace, dsq is non-NULL but HMM is NULL.\n");
384
385 int tpos; /* counter for trace position */
386 unsigned int sym;
387 int sc;
388
389 if (tr == NULL) {
390 fprintf(fp, " [ trace is NULL ]\n");
391 return;
392 }
393
394 if (hmm == NULL) {
395 fprintf(fp, "st node rpos - traceback len %d\n", tr->tlen);
396 fprintf(fp, "-- ---- ------\n");
397 for (tpos = 0; tpos < tr->tlen; tpos++) {
398 fprintf(fp, "%1s %4d %6d\n",
399 CP9Statetype(tr->statetype[tpos]),
400 tr->nodeidx[tpos],
401 tr->pos[tpos]);
402 }
403 } else {
404 if (!(hmm->flags & CPLAN9_HASBITS))
405 cm_Fail("oi, you can't print scores from that hmm, it's not ready.");
406
407 sc = 0;
408 fprintf(fp, "st node rpos transit emission - traceback len %d\n", tr->tlen);
409 fprintf(fp, "-- ---- ------ ------- --------\n");
410 for (tpos = 0; tpos < tr->tlen; tpos++) {
411 if (dsq != NULL) sym = dsq[tr->pos[tpos]];
412
413 fprintf(fp, "%1s %4d %6d %7d",
414 CP9Statetype(tr->statetype[tpos]),
415 tr->nodeidx[tpos],
416 tr->pos[tpos],
417 (tpos < tr->tlen-1) ?
418 CP9TransitionScoreLookup(hmm, tr->statetype[tpos], tr->nodeidx[tpos],
419 tr->statetype[tpos+1], tr->nodeidx[tpos+1]) : 0);
420
421 if (tpos < tr->tlen-1)
422 sc += CP9TransitionScoreLookup(hmm, tr->statetype[tpos], tr->nodeidx[tpos],
423 tr->statetype[tpos+1], tr->nodeidx[tpos+1]);
424
425 if (dsq != NULL) {
426 if (tr->statetype[tpos] == CSTM)
427 {
428 fprintf(fp, " %8d %c", hmm->msc[(int) sym][tr->nodeidx[tpos]],
429 hmm->abc->sym[(int) sym]);
430 sc += hmm->msc[(int) sym][tr->nodeidx[tpos]];
431 }
432 else if (tr->statetype[tpos] == CSTI)
433 {
434 fprintf(fp, " %8d %c", hmm->isc[(int) sym][tr->nodeidx[tpos]],
435 (char) tolower((int) hmm->abc->sym[(int) sym]));
436 sc += hmm->isc[(int) sym][tr->nodeidx[tpos]];
437 }
438 else if (tr->statetype[tpos] == CSTEL)
439 {
440 if(tr->statetype[tpos-1] == CSTEL) /* we will emit on self transit */
441 {
442 fprintf(fp, " %8d %c", 0,
443 (char) tolower((int) hmm->abc->sym[(int) sym]));
444 }
445 else /* we just entered EL, no emission */
446 {
447 fprintf(fp, " %8s %c", "-", '-');
448 }
449 }
450 } else {
451 fprintf(fp, " %8s %c", "-", '-');
452 }
453
454
455 fputs("\n", fp);
456 }
457 fprintf(fp, " ------- --------\n");
458 fprintf(fp, " total: %6d\n\n", sc);
459 }
460 }
461
462 /* Function: CP9TransitionScoreLookup()
463 * based on HMMER's 2.3.2 function of same name
464 *
465 * Incept: EPN, Wed May 30 06:09:04 2007
466 * Purpose: Convenience function used in CP9PrintTrace() and CP9TraceScore();
467 * given state types and node indices for a transition,
468 * return the integer score for that transition.
469 */
470 int
CP9TransitionScoreLookup(CP9_t * hmm,char st1,int k1,char st2,int k2)471 CP9TransitionScoreLookup(CP9_t *hmm, char st1, int k1,
472 char st2, int k2)
473 {
474 switch (st1) {
475 case CSTB:
476 switch (st2) {
477 case CSTM: return hmm->bsc[k2];
478 case CSTI: return hmm->tsc[CTMI][0];
479 case CSTD: return hmm->tsc[CTMD][0];
480 default: cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
481 }
482 break;
483 case CSTM:
484 switch (st2) {
485 case CSTM: return hmm->tsc[CTMM][k1];
486 case CSTI: return hmm->tsc[CTMI][k1];
487 case CSTD: return hmm->tsc[CTMD][k1];
488 case CSTE: return hmm->esc[k1];
489 case CSTEL: return hmm->tsc[CTMEL][k1];
490 default: cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
491 }
492 break;
493 case CSTI:
494 switch (st2) {
495 case CSTM: return hmm->tsc[CTIM][k1];
496 case CSTI: return hmm->tsc[CTII][k1];
497 case CSTD: return hmm->tsc[CTID][k1];
498 case CSTE: return hmm->tsc[CTIM][k1]; /* This should only happen from the final insert (I_M) state */
499 default: cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
500 }
501 break;
502 case CSTD:
503 switch (st2) {
504 case CSTM: return hmm->tsc[CTDM][k1];
505 case CSTI: return hmm->tsc[CTDI][k1];
506 case CSTD: return hmm->tsc[CTDD][k1];
507 case CSTE: return hmm->tsc[CTDM][k1]; /* This should only happen from the final delete (D_M) state */
508 default: cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
509 }
510 break;
511 case CSTEL:
512 switch (st2) {
513 case CSTM: return 0; /* transition to EL penalty incurred when M->EL transition takes place */
514 case CSTE: return 0; /* transition to EL penalty incurred when M->EL transition takes place */
515 case CSTEL: return hmm->el_selfsc; /* penalty for EL->EL self transition loop */
516 default: cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
517 }
518 break;
519 case CSTE: /* this should never happen, it means we transitioned from E, which is not
520 * allowed. */
521 cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
522 break;
523 default: cm_Fail("illegal state %s in traceback", CP9Statetype(st1));
524 }
525 /*NOTREACHED*/
526 return 0;
527 }
528
529
530 /* Function: CP9ViterbiTrace()
531 * Date: EPN, Wed May 30 17:32:05 2007
532 * based on HMMER 2.3.2's P7ViterbiTrace()
533 *
534 * Purpose: Traceback of a Viterbi matrix: i.e. retrieval
535 * of optimum alignment.
536 *
537 * Args: hmm - hmm, log odds form, used to make mx
538 * dsq - sequence aligned to (digital form) 1..L
539 * i0 - first residue of sequence, often 1
540 * j0 - last residue of sequence, often L
541 * mx - the matrix to trace back in, L x hmm->M
542 * ret_tr - RETURN: traceback.
543 *
544 * Return: (void)
545 * ret_tr is allocated here. Free using CP9FreeTrace().
546 */
547 void
CP9ViterbiTrace(CP9_t * hmm,ESL_DSQ * dsq,int i0,int j0,CP9_MX * mx,CP9trace_t ** ret_tr)548 CP9ViterbiTrace(CP9_t *hmm, ESL_DSQ *dsq, int i0, int j0,
549 CP9_MX *mx, CP9trace_t **ret_tr)
550 {
551 /* contract check */
552 if(dsq == NULL)
553 cm_Fail("ERROR in CP9ViterbiTrace(), dsq is NULL.");
554
555 CP9trace_t *tr;
556 int curralloc; /* current allocated length of trace */
557 int tpos; /* position in trace */
558 int i; /* position in seq (1..N) */
559 int k; /* position in model (1..M) */
560 int *erow, **mmx, **imx, **dmx, **elmx;
561 int sc; /* temp var for pre-emission score */
562 int error_flag;
563 int c; /* counter over possible EL states */
564
565 /* Overallocate for the trace.
566 * B- ... - E : 2 states + N is minimum trace;
567 * add N more as buffer.
568 */
569 curralloc = (j0-i0+1) * 2 + 2;
570 CP9AllocTrace(curralloc, &tr);
571
572 mmx = mx->mmx;
573 imx = mx->imx;
574 dmx = mx->dmx;
575 elmx= mx->elmx;
576 erow= mx->erow;
577
578 /* Initialization of trace
579 * We do it back to front; ReverseTrace() is called later.
580 */
581 tr->statetype[0] = CSTE;
582 tr->nodeidx[0] = 0;
583 tr->pos[0] = 0;
584 tpos = 1;
585 i = j0; /* current i (seq pos) we're trying to assign */
586
587 /* Traceback
588 */
589 while (tr->statetype[tpos-1] != CSTB) {
590 error_flag = FALSE;
591 switch (tr->statetype[tpos-1]) {
592 case CSTM: /* M connects from i-1,k-1, B or an EL*/
593 /*printf("CSTM k: %d i:%d \n", k, i);*/
594 sc = mmx[i+1][k+1] - hmm->msc[dsq[i+1]][k+1];
595 if (sc <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
596 else if (sc == mmx[i][k] + hmm->tsc[CTMM][k])
597 {
598 tr->statetype[tpos] = CSTM;
599 tr->nodeidx[tpos] = k--;
600 tr->pos[tpos] = i--;
601 }
602 else if (sc == imx[i][k] + hmm->tsc[CTIM][k])
603 {
604 tr->statetype[tpos] = CSTI;
605 tr->nodeidx[tpos] = k;
606 tr->pos[tpos] = i--;
607 }
608 else if (sc == dmx[i][k] + hmm->tsc[CTDM][k])
609 {
610 tr->statetype[tpos] = CSTD;
611 tr->nodeidx[tpos] = k--;
612 tr->pos[tpos] = 0;
613 }
614 else /* Check if we came from an EL state (could be more than 1 choice) */
615 {
616 error_flag = TRUE;
617 /* note we look at el_from_ct[k+1] not k for same reason we look
618 * at bsc[k+1] above, we're going backwards, this is a tricky off-by-one */
619 for(c = 0; c < hmm->el_from_ct[k+1]; c++) /* el_from_ct[k+1] is >= 0 */
620 {
621 /* transition penalty to EL incurred when EL was entered */
622 if(sc == elmx[i][hmm->el_from_idx[k+1][c]])
623 {
624 tr->statetype[tpos] = CSTEL;
625 k = hmm->el_from_idx[(k+1)][c];
626 tr->nodeidx[tpos] = k;
627 tr->pos[tpos] = i--;
628 error_flag = FALSE;
629 break;
630 }
631 }
632 }
633 if(error_flag)
634 {
635 /* one last possibility, we came from B, check this last, in
636 * case hmm->bsc[k+1] happens to be identical to sc but
637 * we're not done the parse yet (i.e. one of the cases
638 * above equaled sc). */
639 if (sc == hmm->bsc[k+1])
640 {
641 tr->statetype[tpos] = CSTB;
642 tr->nodeidx[tpos] = 0;
643 tr->pos[tpos] = 0;
644 if(tr->pos[tpos-1] != 1)
645 cm_Fail("traceback failed: premature begin");
646 error_flag = FALSE;
647 }
648 }
649 if(error_flag)
650 cm_Fail("traceback failed");
651 break;
652
653 case CSTD: /* D connects from M,D,I, (D_1 also connects from B (M_0) */
654 /*printf("CSTD k: %d i:%d \n", k, i);*/
655 sc = dmx[i][k+1];
656 if (sc <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
657 else if(k == 0) /* D_1 connects from B(M_0), and I_0, when k == 0, we're dealing with D_1, a confusing off-by-one */
658 {
659 if(sc == mmx[i][k] + hmm->tsc[CTMD][k])
660 {
661 tr->statetype[tpos] = CSTB;
662 tr->nodeidx[tpos] = 0;
663 tr->pos[tpos] = 0;
664 }
665 else if (sc == imx[i][k] + hmm->tsc[CTID][k])
666 {
667 tr->statetype[tpos] = CSTI;
668 tr->nodeidx[tpos] = k;
669 tr->pos[tpos] = i--;
670 }
671 else cm_Fail("traceback failed");
672 } /* else k != 0 */
673 else if (sc == mmx[i][k] + hmm->tsc[CTMD][k])
674 {
675 tr->statetype[tpos] = CSTM;
676 tr->nodeidx[tpos] = k--;
677 tr->pos[tpos] = i--;
678 }
679 else if (sc == imx[i][k] + hmm->tsc[CTID][k])
680 {
681 tr->statetype[tpos] = CSTI;
682 tr->nodeidx[tpos] = k;
683 tr->pos[tpos] = i--;
684 }
685 else if (sc == dmx[i][k] + hmm->tsc[CTDD][k])
686 {
687 tr->statetype[tpos] = CSTD;
688 tr->nodeidx[tpos] = k--;
689 tr->pos[tpos] = 0;
690 }
691 else cm_Fail("traceback failed");
692 break;
693
694 case CSTI: /* I connects from M,I,D, (I_0 connects from B also(*/
695 /*printf("CSTI k: %d i:%d \n", k, i);*/
696 sc = imx[i+1][k] - hmm->isc[dsq[i+1]][k];
697 if (sc <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
698 else if(k == 0) /* I_0 connects from B(M_0), and I_0 */
699 {
700 if(sc == mmx[i][k] + hmm->tsc[CTMI][k])
701 {
702 tr->statetype[tpos] = CSTB;
703 tr->nodeidx[tpos] = 0;
704 tr->pos[tpos] = 0;
705 }
706 else if (sc == imx[i][k] + hmm->tsc[CTII][k])
707 {
708 tr->statetype[tpos] = CSTI;
709 tr->nodeidx[tpos] = k;
710 tr->pos[tpos] = i--;
711 }
712 }
713 /* else k != 0 */
714 else if (sc == mmx[i][k] + hmm->tsc[CTMI][k])
715 {
716 tr->statetype[tpos] = CSTM;
717 tr->nodeidx[tpos] = k--;
718 tr->pos[tpos] = i--;
719 }
720
721 else if (sc == imx[i][k] + hmm->tsc[CTII][k])
722 {
723 tr->statetype[tpos] = CSTI;
724 tr->nodeidx[tpos] = k;
725 tr->pos[tpos] = i--;
726 }
727 else if (sc == dmx[i][k] + hmm->tsc[CTDI][k])
728 {
729 tr->statetype[tpos] = CSTD;
730 tr->nodeidx[tpos] = k--;
731 tr->pos[tpos] = 0;
732 }
733 else cm_Fail("traceback failed");
734 break;
735
736 case CSTE: /* E connects from any M state. k set here
737 * also can connect from I_M or D_M (diff from p7)
738 * or even EL_M if it exists */
739 if (erow[i] <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
740 if (erow[i] == imx[i][hmm->M] + hmm->tsc[CTIM][hmm->M])
741 {
742 k = hmm->M;
743 tr->statetype[tpos] = CSTI;
744 tr->nodeidx[tpos] = k;
745 tr->pos[tpos] = i--;
746 }
747 else if (erow[i] == dmx[i][hmm->M] + hmm->tsc[CTDM][hmm->M])
748 {
749 k = hmm->M;
750 tr->statetype[tpos] = CSTD;
751 tr->nodeidx[tpos] = k--;
752 tr->pos[tpos] = 0;
753 }
754 else
755 {
756 error_flag = TRUE;
757 for (k = hmm->M; k >= 1; k--)
758 if (erow[i] == mmx[i][k] + hmm->esc[k])
759 {
760 tr->statetype[tpos] = CSTM;
761 tr->nodeidx[tpos] = k--;
762 tr->pos[tpos] = i--;
763 error_flag = FALSE;
764 break;
765 }
766 if(error_flag)
767 {
768 /* Check if we came from an EL state (could be more than 1 choice) */
769 /* hmm->el_from-ct[hmm->M+1] is # of ELs that can transit to E (END) */
770 for(c = hmm->el_from_ct[hmm->M+1]-1; c >= 0; c--) /* el_from_ct[] is >= 0 */
771 {
772 /* transition penalty to EL incurred when EL was entered */
773 if(erow[i] == elmx[i][hmm->el_from_idx[hmm->M+1][c]])
774 {
775 tr->statetype[tpos] = CSTEL;
776 k = hmm->el_from_idx[(hmm->M+1)][c];
777 tr->nodeidx[tpos] = k;
778 tr->pos[tpos] = i--;
779 error_flag = FALSE;
780 break;
781 }
782 }
783 }
784 }
785 if (k < 0 || error_flag) cm_Fail("traceback failed");
786 break;
787
788 case CSTEL: /* EL connects from certain M states and itself */
789 /*printf("CSTEL k: %d i:%d \n", k, i);*/
790 /* check if we are staying in the EL */
791 sc = elmx[i+1][k];
792 if (sc == elmx[i][k] + hmm->el_selfsc) /* i >= 2, first residue must be emitted by a match, not an EL */
793 {
794 tr->statetype[tpos] = CSTEL;
795 tr->nodeidx[tpos] = k;
796 tr->pos[tpos] = i--;
797 }
798 else if(sc == mmx[i+1][k] + hmm->tsc[CTMEL][k]) /* M->EL->M with 0 self loops in EL */
799 {
800 tr->statetype[tpos] = CSTM;
801 tr->nodeidx[tpos] = k--;
802 tr->pos[tpos] = i+1; /* special case, we decremented i prematurely b/c we
803 * had no way of knowing it was our last visit to EL, before
804 * we went to M (since we're working backwards this is actually
805 * the first visit to EL).
806 */
807 }
808 else cm_Fail("traceback failed");
809 break;
810
811 default:
812 cm_Fail("traceback failed");
813
814 } /* end switch over statetype[tpos-1] */
815
816 tpos++;
817 if (tpos == curralloc)
818 { /* grow trace if necessary */
819 curralloc += (j0-i0+1);
820 CP9ReallocTrace(tr, curralloc);
821 }
822
823 } /* end traceback, at S state; tpos == tlen now */
824 tr->tlen = tpos;
825 CP9ReverseTrace(tr);
826
827 *ret_tr = tr;
828 }
829
830 /* Function: CP9ReverseTrace()
831 * Date: EPN, Wed May 30 17:52:18 2007
832 * identical to SRE's P7ReverseTrace() from HMMER 2.3.2
833 *
834 * Purpose: Reverse the arrays in a traceback structure.
835 * Tracebacks from Forward() and Viterbi() are
836 * collected backwards, and call this function
837 * when they're done.
838 *
839 * It's possible to reverse the arrays in place
840 * more efficiently; but the realloc/copy strategy
841 * has the advantage of reallocating the trace
842 * into the right size of memory. (Tracebacks
843 * overallocate.)
844 *
845 * Args: tr - the traceback to reverse. tr->tlen must be set.
846 *
847 * Return: (void)
848 * tr is modified.
849 */
850 void
CP9ReverseTrace(CP9trace_t * tr)851 CP9ReverseTrace(CP9trace_t *tr)
852 {
853 int status;
854 char *statetype;
855 int *nodeidx;
856 int *pos;
857 int opos, npos;
858
859 /* Allocate
860 */
861 ESL_ALLOC(statetype, sizeof(char)* tr->tlen);
862 ESL_ALLOC(nodeidx, sizeof(int) * tr->tlen);
863 ESL_ALLOC(pos, sizeof(int) * tr->tlen);
864
865 /* Reverse the trace.
866 */
867 for (opos = tr->tlen-1, npos = 0; npos < tr->tlen; npos++, opos--)
868 {
869 statetype[npos] = tr->statetype[opos];
870 nodeidx[npos] = tr->nodeidx[opos];
871 pos[npos] = tr->pos[opos];
872 }
873
874 /* Swap old, new arrays.
875 */
876 free(tr->statetype);
877 free(tr->nodeidx);
878 free(tr->pos);
879 tr->statetype = statetype;
880 tr->nodeidx = nodeidx;
881 tr->pos = pos;
882 return;
883
884 ERROR:
885 cm_Fail("Memory allocation error.");
886 }
887
888
889
890 /* Function: CP9Traces2Alignment()
891 * based on SRE's P7Traces2Alignment() from HMMER 2.3.2
892 *
893 * Purpose: Convert an array of traceback structures for a set
894 * of sequences into a new multiple alignment. Modified
895 * from HMMER to account for possible EL local-end
896 * insertions (which don't exist in P7). Including EL
897 * insertions requires an emit map from the CM.
898 *
899 * Insertions/ELs are put into lower case and
900 * are not aligned; instead, Nterm is right-justified,
901 * Cterm is left-justified, and internal insertions
902 * are split in half and the halves are justified in
903 * each direction (the objective being to increase
904 * the chances of getting insertions aligned well enough
905 * for them to become a match). SAM gap char conventions
906 * are used: - in match columns, . in insert columns
907 *
908 * Args: cm - the CM the CP9 was built from, needed to get emitmap,
909 * so we know where to put EL transitions
910 * cp9 - the CP9 used to determine the traces (cm->cp9loc or cm->cp9glb)
911 * abc - alphabet to use to create the return MSA
912 * sq - sequences
913 * wgt - weights for seqs, NULL for none
914 * nseq - number of sequences
915 * tr - array of tracebacks
916 * do_full - TRUE to always include all match columns in alignment
917 * do_matchonly - TRUE to ONLY include match columns
918 * ret_msa - MSA, alloc'ed created here
919 *
920 * Return: eslOK on succes, eslEMEM on memory error.
921 * MSA structure in ret_msa, caller responsible for freeing.
922 */
923 int
CP9Traces2Alignment(CM_t * cm,CP9_t * cp9,const ESL_ALPHABET * abc,ESL_SQ ** sq,float * wgt,int nseq,CP9trace_t ** tr,int do_full,int do_matchonly,ESL_MSA ** ret_msa)924 CP9Traces2Alignment(CM_t *cm, CP9_t *cp9, const ESL_ALPHABET *abc, ESL_SQ **sq, float *wgt,
925 int nseq, CP9trace_t **tr, int do_full, int do_matchonly,
926 ESL_MSA **ret_msa)
927 {
928 int status; /* easel status flag */
929 ESL_MSA *msa = NULL; /* RETURN: new alignment */
930 int idx; /* counter for sequences */
931 int alen; /* width of alignment */
932 int *maxins = NULL; /* array of max inserts between aligned columns */
933 int *maxels = NULL; /* array of max ELs emissions between aligned columns */
934 int *matmap = NULL; /* matmap[k] = apos of match k [1..M] */
935 int nins; /* counter for inserts */
936 int cpos; /* HMM node, consensus position */
937 int apos; /* position in aligned sequence (0..alen-1)*/
938 int rpos; /* position in raw digital sequence (1..L)*/
939 int tpos; /* position counter in traceback */
940 int epos; /* position ctr for EL insertions */
941 int statetype; /* type of current state, e.g. STM */
942 CMEmitMap_t *emap = NULL; /* consensus emit map for the CM */
943 int *imap = NULL; /* first apos for an insert following a cpos */
944 int *elmap = NULL; /* first apos for an EL following a cpos */
945 int *matuse = NULL; /* TRUE if we need a cpos in mult alignment */
946 int *eluse = NULL; /* TRUE if we have an EL after cpos in alignment */
947 int **eposmap = NULL; /* [seq idx][CP9 node idx] where each EL should emit to */
948 int *iuse = NULL; /* TRUE if we have an I after cpos in alignment */
949 CMConsensus_t *con = NULL; /* consensus information for the CM */
950 int next_match; /* used for filling eposmap */
951 int c; /* counter over possible EL froms */
952 int *insleft; /* [0..cpos..clen] TRUE if inserts *following* cpos should be flush right */
953 int nd; /* counter over nodes */
954 int max_ins_or_el[2];/* for regularizing (splitting) inserts */
955 int pass_offset[2]; /* for regularizing (splitting) inserts */
956 int pass; /* for regularizing (splitting) inserts */
957 char errbuf[eslERRBUFSIZE];
958
959 /* Contract checks */
960 if(cp9 == NULL)
961 cm_Fail("ERROR in CP9Traces2Alignment, cp9 is NULL.\n");
962 if(cm->cp9map == NULL)
963 cm_Fail("ERROR in CP9Traces2Alignment, cm->cp9map is NULL.\n");
964 /* We allow the caller to specify the alphabet they want the
965 * resulting MSA in, but it has to make sense (see next few lines). */
966 if(cm->abc->type == eslRNA)
967 {
968 if(abc->type != eslRNA && abc->type != eslDNA)
969 cm_Fail("ERROR in CP9Traces2Alignment(), cm alphabet is RNA, but requested output alphabet is neither DNA nor RNA.");
970 }
971 else if(cm->abc->K != abc->K)
972 cm_Fail("ERROR in CP9Traces2Alignment(), cm alphabet size is %d, but requested output alphabet size is %d.", cm->abc->K, abc->K);
973
974 /* create the emit map */
975 emap = CreateEmitMap(cm);
976
977 /* Determine which direction we emit to for each consensus column,
978 * IL's emit left, IR's emit right, but this info isn't indexed by
979 * consensus column, so we use an emitmap to get it. This is used
980 * to determine if we go IL before EL or EL before IR when inserting
981 * both regular IL/IR inserts and EL inserts in same place. This is
982 * also used if we have enabled (cm->align_opts &
983 * CM_ALIGN_FLUSHINSERTS) which overides default behavior to split
984 * the inserts and adopts 'flush left for IL / flush right for IR'
985 * behavior (which older versions of Infernal used).
986 */
987 ESL_ALLOC(insleft, sizeof(int) * (emap->clen+1));
988 esl_vec_ISet(insleft, (cm->clen+1), -1);
989 for(nd = 0; nd < cm->nodes; nd++)
990 {
991 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == BEGR_nd || cm->ndtype[nd] == ROOT_nd)
992 if(insleft[emap->lpos[nd]] == -1) /* deal with sole CM grammar ambiguity */
993 insleft[emap->lpos[nd]] = TRUE;
994 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd)
995 insleft[emap->rpos[nd]-1] = FALSE;
996 }
997 if(insleft[emap->clen] == -1) insleft[emap->clen] = FALSE; /* special case, insleft[emap->clen] == -1 IFF cpos==emap->clen is modelled by MATR or MATP. */
998 /* check we've constructed insleft properly, TEMPORARY */
999 for(cpos = 0; cpos <= emap->clen; cpos++)
1000 ESL_DASSERT1((insleft[cpos] != -1));
1001
1002 /* Here's the problem. We want to align the match states in columns,
1003 * but some sequences have inserted symbols in them; we need some
1004 * sort of overall knowledge of where the inserts are and how long
1005 * they are in order to create the alignment.
1006 *
1007 * Here's our trick. maxins[] and maxels[] are 0..hmm->M arrays;
1008 * maxins[i] stores the maximum number of times insert substate
1009 * i was used. maxels[i] stores the max number of times an EL insertion
1010 * occurs after insert substate i. maxins[i] + maxels[i]
1011 * is the maximum number of gaps to insert between canonical
1012 * column i and i+1. maxins[0], maxels[0] is the N-term tail;
1013 * maxins[M], maxels[0] is the C-term tail.
1014 */
1015 ESL_ALLOC(matuse, sizeof(int) * (emap->clen+1));
1016 ESL_ALLOC(eluse, sizeof(int) * (emap->clen+1));
1017 ESL_ALLOC(iuse, sizeof(int) * (emap->clen+1));
1018 ESL_ALLOC(maxins, sizeof(int) * (emap->clen+1));
1019 ESL_ALLOC(maxels, sizeof(int) * (emap->clen+1));
1020 ESL_ALLOC(matmap, sizeof(int) * (emap->clen+1));
1021 ESL_ALLOC(imap, sizeof(int) * (emap->clen+1));
1022 ESL_ALLOC(elmap, sizeof(int) * (emap->clen+1));
1023 ESL_ALLOC(eposmap,sizeof(int *) * (nseq));
1024
1025 /* eposmap is 2D b/c different traces can have different epos
1026 * (position where EL inserts) for the same EL state, for example:
1027 * an EL state for node 9 may reconnect at node 25 in one parse
1028 * and node 50 in another if there's a CM MATL node with subtree
1029 * lpos=9 rpos=50, and a CM BEGL node with subtree lpos=9 rpos=25,
1030 * i.e. there are 2 CM EL states being mirrored by 1 HMM EL state.
1031 */
1032 for (cpos = 0; cpos <= emap->clen; cpos++)
1033 {
1034 if(!do_full || cpos == 0)
1035 matuse[cpos] = 0;
1036 else
1037 matuse[cpos] = 1;
1038 maxins[cpos] = maxels[cpos] = 0;
1039 iuse[cpos] = eluse[cpos] = imap[cpos] = elmap[cpos] = 0;
1040 }
1041
1042 /* Look at all the traces; find maximum length of
1043 * insert needed at each of the clen+1 possible gap
1044 * points. (There are two types of insert, I/EL)
1045 * Also find whether we don't need some of the match
1046 * (consensus) columns.
1047 */
1048 for (idx = 0; idx < nseq; idx++)
1049 {
1050 ESL_ALLOC(eposmap[idx], sizeof(int) * (emap->clen+1));
1051 for (cpos = 0; cpos <= emap->clen; cpos++)
1052 {
1053 iuse[cpos] = eluse[cpos] = 0;
1054 eposmap[idx][cpos] = -1;
1055 }
1056
1057 /* Determine the eposmap, the cpos EL's go into for each cpos for each seq.
1058 * This depends on the first match state entered after each EL, so we go bottom up */
1059 next_match = -1;
1060 for (tpos = tr[idx]->tlen - 1; tpos >= 0; tpos--)
1061 {
1062 statetype = tr[idx]->statetype[tpos]; /* just for clarity */
1063 cpos = tr[idx]->nodeidx[tpos];
1064 if(statetype == CSTM) next_match = cpos;
1065 if(statetype == CSTE) next_match = cp9->M+1;
1066 if(statetype == CSTEL) eposmap[idx][cpos] = next_match; /* this will be overwritten below */
1067 }
1068 for (cpos = 0; cpos <= emap->clen; cpos++)
1069 {
1070 if(eposmap[idx][cpos] != -1)
1071 {
1072 /*printf("cpos: %d eposmap[idx][cpos]: %d ct: %d\n", cpos, eposmap[idx][cpos], cp9->el_from_ct[eposmap[idx][cpos]]);*/
1073 /* determine the epos based on the CM emit map and cp9->el* data structures */
1074 for(c = 0; c < cp9->el_from_ct[eposmap[idx][cpos]]; c++)
1075 {
1076 if(cp9->el_from_idx[eposmap[idx][cpos]][c] == cpos)
1077 {
1078 eposmap[idx][cpos] = emap->epos[cp9->el_from_cmnd[eposmap[idx][cpos]][c]];
1079 break;
1080 }
1081 if(c == (cp9->el_from_ct[eposmap[idx][cpos]] - 1))
1082 cm_Fail("Couldn't determine epos for cpos: %d\n", cpos);
1083 }
1084 }
1085 }
1086
1087 for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
1088 {
1089 cpos = tr[idx]->nodeidx[tpos];
1090 switch (tr[idx]->statetype[tpos]) {
1091 case CSTI: iuse[cpos]++; break;
1092 case CSTM: matuse[tr[idx]->nodeidx[tpos]] = 1; break;
1093 case CSTEL:
1094 eluse[eposmap[idx][cpos]]++;
1095 break;
1096 case CSTD:
1097 case CSTE:
1098 case CSTB:
1099 break;
1100 default:
1101 cm_Fail("CP9Traces2Alignment reports unrecognized statetype %c",
1102 CP9Statetype(tr[idx]->statetype[tpos]));
1103 }
1104 } /* end looking at trace i */
1105 for (cpos = 0; cpos <= emap->clen; cpos++)
1106 {
1107 if (iuse[cpos] > maxins[cpos]) maxins[cpos] = iuse[cpos];
1108 if (eluse[cpos] > maxels[cpos]) maxels[cpos] = eluse[cpos]-1; /* EL only emits on self loops */
1109 }
1110 } /* end calculating lengths used by all traces */
1111
1112 /***********************************************
1113 * Construct the alignment
1114 ***********************************************/
1115
1116 /* Now we can calculate the total length of the multiple alignment, alen;
1117 * and the maps imap and elmap that turn a cpos into an apos
1118 * in the multiple alignment: e.g. for an insert that follows consensus
1119 * position cpos, put it at or after apos = imap[cpos] in aseq[][].
1120 */
1121
1122 matmap[0] = -1; /* M_0 is B state, non-emitter */
1123 alen = 0;
1124 for (cpos = 0; cpos <= emap->clen; cpos++)
1125 {
1126 if (matuse[cpos])
1127 {
1128 matmap[cpos] = alen;
1129 alen++;
1130 }
1131 else
1132 matmap[cpos] = -1;
1133
1134 if(insleft[cpos]) { /* IL state inserts here, IL's go before EL's */
1135 imap[cpos] = alen; alen += maxins[cpos];
1136 elmap[cpos] = alen; alen += maxels[cpos];
1137 }
1138 else { /* IR state inserts here, IR's go after EL's */
1139 elmap[cpos] = alen; alen += maxels[cpos];
1140 imap[cpos] = alen; alen += maxins[cpos];
1141 }
1142 }
1143 /* allocation for new alignment */
1144 if ((msa = esl_msa_Create(nseq, alen)) == NULL) { status = eslEMEM; goto ERROR; }
1145 msa->nseq = nseq;
1146 msa->alen = alen;
1147 msa->abc = (ESL_ALPHABET *) abc;
1148
1149 for (idx = 0; idx < nseq; idx++)
1150 {
1151 if(sq[idx]->dsq == NULL) cm_Fail("ERROR in CP9Traces2Alignment(), sq's should be digitized.\n");
1152
1153 for (cpos = 0; cpos <= emap->clen; cpos++)
1154 iuse[cpos] = eluse[cpos] = 0;
1155 /* blank an aseq */
1156 for (apos = 0; apos < alen; apos++)
1157 msa->aseq[idx][apos] = '.';
1158 for (cpos = 0; cpos <= emap->clen; cpos++)
1159 if (matmap[cpos] != -1) msa->aseq[idx][matmap[cpos]] = '-';
1160 msa->aseq[idx][alen] = '\0';
1161
1162 /* align the sequence */
1163 apos = 0;
1164 for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
1165 {
1166 statetype = tr[idx]->statetype[tpos]; /* just for clarity */
1167 rpos = tr[idx]->pos[tpos];
1168 cpos = tr[idx]->nodeidx[tpos];
1169
1170 if (statetype == CSTM)
1171 {
1172 apos = matmap[cpos];
1173 msa->aseq[idx][apos] = abc->sym[sq[idx]->dsq[rpos]];
1174 }
1175 else if (statetype == CSTD)
1176 apos = matmap[cpos]+1; /* need for handling D->I; xref STL6/p.117 */
1177 else if (statetype == CSTI)
1178 {
1179 /* flush all inserts left for now, we'll split or flush-right after we're done with all seqs */
1180 apos = imap[cpos] + iuse[cpos];
1181 msa->aseq[idx][apos] = (char) tolower((int) abc->sym[sq[idx]->dsq[rpos]]);
1182 iuse[cpos]++;
1183 }
1184 else if (statetype == CSTEL)
1185 {
1186 /*printf("CSTEL cpos: %d rpos: %d epos: %d\n", cpos, rpos);*/
1187 epos = eposmap[idx][cpos];
1188 if(tr[idx]->statetype[tpos-1] == CSTEL) /* we don't emit on first EL visit */
1189 {
1190 apos = elmap[epos] + eluse[epos];
1191 msa->aseq[idx][apos] = (char) tolower((int) abc->sym[sq[idx]->dsq[rpos]]);
1192 eluse[epos]++;
1193 }
1194 }
1195 else if (statetype == CSTE)
1196 apos = matmap[emap->clen]+1; /* set position for C-term tail */
1197 }
1198
1199 /* All insertions (IL/IR/EL) are currently flush-left, but they won't all remain so.
1200 * Two options for what to do:
1201 * 1. Split insertions (this is default):
1202 * 5' extension (ROOT_IL: prior to cpos 1) is right-justified.
1203 * Internal inserts are split in half
1204 * 3' extension (ROOT_IR: after final cpos) remains left-justified.
1205 * 2. Flush IL's left, IR's right only ON if cm->align_opts & CM_ALIGN_FLUSHINSERTS (this was Infernal pre-1.0 default)
1206 * use insleft array to determine which type of insert (IL,IR) emits after each
1207 * consensus column, and flush inserts appropriately.
1208 *
1209 * We have to be careful about EL's. We don't want to group IL/IR's and EL's together and then split them
1210 * because we need to annotate IL/IR's as '.'s in the consensus structure and EL's as '~'. So we split
1211 * each of the 2 group of inserts separately (IL or IR's (their can only be one per position)) and EL's.
1212 * This is done somewhat confusingly (but without repeating too much code) with the
1213 * for (pass = 0; pass <= 1; pass++) loop, and the max_ins_or_el[] and pass_offset[] arrays.
1214 */
1215
1216 /* deal with inserts before cpos 1, don't think they're can be EL's here, but we leave it in case I'm forgetting.
1217 * if there are EL's they would come after any ROOT_ILs */
1218 if(!(cm->align_opts & CM_ALIGN_FLUSHINSERTS)) /* default behavior, flush ROOT_IL right, else leave ROOT_IL flush left */
1219 rightjustify(msa->abc, msa->aseq[idx], maxins[0]);
1220 if(!(cm->align_opts & CM_ALIGN_FLUSHINSERTS)) /* default behavior, flush pre-cpos=1 ELs right, else leave them flush left */
1221 rightjustify(msa->abc, msa->aseq[idx]+maxins[0], maxels[0]);
1222
1223 for (cpos = 1; cpos < emap->clen; cpos++)
1224 {
1225 if(insleft[cpos]) { /* ILs then ELs */
1226 max_ins_or_el[0] = maxins[cpos];
1227 max_ins_or_el[1] = maxels[cpos];
1228 pass_offset[0] = 0;
1229 pass_offset[1] = maxins[cpos]; /* we'll have to add this to get to appropriate alignment position when pass==1 */
1230 }
1231 else { /* ELs then IRs */
1232 max_ins_or_el[0] = maxels[cpos];
1233 max_ins_or_el[1] = maxins[cpos];
1234 pass_offset[0] = 0;
1235 pass_offset[1] = maxels[cpos]; /* we'll have to add this to get to appropriate alignment position when pass==1 */
1236 }
1237 for(pass = 0; pass <= 1; pass++)
1238 {
1239 if (max_ins_or_el[pass] > 1)
1240 {
1241 apos = matmap[cpos]+1 + pass_offset[pass];
1242 if(! (cm->align_opts & CM_ALIGN_FLUSHINSERTS)) /* default behavior, split insert in half */
1243 {
1244 for (nins = 0; islower((int) (msa->aseq[idx][apos])); apos++)
1245 nins++;
1246 nins /= 2; /* split the insertion in half */
1247 rightjustify(msa->abc, msa->aseq[idx]+matmap[cpos]+1 + pass_offset[pass] + nins, max_ins_or_el[pass]-nins);
1248 }
1249 /* else revert to pre-1.0 infernal behavior, flush IL's left, and flush IR's right */
1250 else if(!(insleft[cpos])) /* only insert right if next consensus column doesn't insert left */
1251 rightjustify(msa->abc, msa->aseq[idx] + apos, max_ins_or_el[pass]);
1252 }
1253 }
1254 }
1255 /* deal with inserts after final cpos 1,
1256 * if there are EL's they would come before any ROOT_IRs */
1257 if(cm->align_opts & CM_ALIGN_FLUSHINSERTS) /* old behavior, flush ROOT_IR right, else (default) leave ROOT_IR flush left */
1258 rightjustify(msa->abc, msa->aseq[idx]+matmap[emap->clen]+1, maxels[emap->clen]);
1259 if(cm->align_opts & CM_ALIGN_FLUSHINSERTS) /* old behavior, flush ROOT_IR right, else (default) leave ROOT_IR flush left */
1260 rightjustify(msa->abc, msa->aseq[idx]+matmap[emap->clen]+1+maxels[emap->clen], maxins[emap->clen]);
1261 }
1262 /***********************************************
1263 * Build the rest of the MSA annotation.
1264 ***********************************************/
1265
1266 msa->nseq = nseq;
1267 msa->alen = alen;
1268 ESL_ALLOC(msa->au, sizeof(char) * (strlen(INFERNAL_VERSION)+10));
1269 sprintf(msa->au, "Infernal %s", INFERNAL_VERSION);
1270
1271 /* copy names and weights */
1272 for (idx = 0; idx < nseq; idx++)
1273 {
1274 if((status = esl_strdup(sq[idx]->name, -1, &(msa->sqname[idx]))) != eslOK) goto ERROR;
1275 if (wgt == NULL) msa->wgt[idx] = 1.0;
1276 else msa->wgt[idx] = wgt[idx];
1277 }
1278
1279 /* Construct the secondary structure consensus line, msa->ss_cons:
1280 * IL, IR are annotated as .
1281 * EL is annotated as ~
1282 * and match columns use the structure code.
1283 * Also the primary sequence consensus/reference coordinate system line,
1284 * msa->rf.
1285 */
1286 ESL_ALLOC(msa->ss_cons, (sizeof(char) * (alen+1)));
1287 ESL_ALLOC(msa->rf, (sizeof(char) * (alen+1)));
1288 con = CreateCMConsensus(cm, abc);
1289
1290 for (cpos = 0; cpos <= emap->clen; cpos++)
1291 {
1292 if (matuse[cpos])
1293 { /* CMConsensus is off-by-one right now, 0..clen-1 relative to cpos's 1..clen */
1294 if (con->ct[cpos-1] != -1 && matuse[con->ct[cpos-1]+1] == 0) {
1295 msa->ss_cons[matmap[cpos]] = '.';
1296 msa->rf[matmap[cpos]] = con->cseq[cpos-1];
1297 } else {
1298 msa->ss_cons[matmap[cpos]] = con->cstr[cpos-1];
1299 msa->rf[matmap[cpos]] = con->cseq[cpos-1];
1300 }
1301 }
1302 if (maxins[cpos] > 0)
1303 for (apos = imap[cpos]; apos < imap[cpos] + maxins[cpos]; apos++)
1304 {
1305 msa->ss_cons[apos] = '.';
1306 msa->rf[apos] = '.';
1307 }
1308 if (maxels[cpos] > 0)
1309 {
1310 for (apos = elmap[cpos]; apos < elmap[cpos] + maxels[cpos]; apos++)
1311 {
1312 msa->ss_cons[apos] = '~';
1313 msa->rf[apos] = '~';
1314 }
1315 }
1316 }
1317 msa->ss_cons[alen] = '\0';
1318 msa->rf[alen] = '\0';
1319
1320 /* If we only want the match columns, shorten the alignment
1321 * by getting rid of the inserts. (Alternatively we could probably
1322 * simplify the building of the alignment, but all that pretty code
1323 * above already existed, so we do this post-msa-building shortening).
1324 */
1325 if(do_matchonly)
1326 {
1327 int *useme;
1328 ESL_ALLOC(useme, sizeof(int) * (msa->alen));
1329 esl_vec_ISet(useme, msa->alen, FALSE);
1330 for(cpos = 0; cpos <= emap->clen; cpos++)
1331 if(matmap[cpos] != -1) useme[matmap[cpos]] = TRUE;
1332 if((status = esl_msa_ColumnSubset(msa, errbuf, useme)) != eslOK) return status;
1333 free(useme);
1334 }
1335
1336 /* Free and return */
1337 FreeCMConsensus(con);
1338 FreeEmitMap(emap);
1339 free(eluse);
1340 free(iuse);
1341 free(matuse);
1342 free(maxins);
1343 free(maxels);
1344 free(matmap);
1345 free(imap);
1346 free(elmap);
1347 free(insleft);
1348 esl_Free2D((void **) eposmap, nseq);
1349 *ret_msa = msa;
1350 return eslOK;
1351
1352 ERROR:
1353 if(con != NULL) FreeCMConsensus(con);
1354 if(emap != NULL) FreeEmitMap(emap);
1355 if(matuse!= NULL) free(matuse);
1356 if(iuse != NULL) free(iuse);
1357 if(elmap != NULL) free(elmap);
1358 if(maxels!= NULL) free(maxels);
1359 if(matmap!= NULL) free(matmap);
1360 esl_Free2D((void **) eposmap, nseq);
1361 if(msa != NULL) esl_msa_Destroy(msa);
1362 return status;
1363 }
1364
1365
1366 /* Function: CP9TraceScoreCorrectionNull2()
1367 * Based on HMMER2's TraceScoreCorrection()
1368 * Date: Sun Dec 21 12:05:47 1997 [StL]
1369 *
1370 * Purpose: Calculate a correction (in integer log_2 odds) to be
1371 * applied to a sequence, using a second null model,
1372 * based on a traceback. M/I emissions are corrected;
1373 * The null model is constructed /post hoc/ as the
1374 * average over all the M,I distributions used by the trace.
1375 *
1376 * Return: the log_2-odds score correction.
1377 */
1378 int
CP9TraceScoreCorrectionNull2(CP9_t * hmm,char * errbuf,CP9trace_t * tr,ESL_DSQ * dsq,int start,float omega,float * ret_sc)1379 CP9TraceScoreCorrectionNull2(CP9_t *hmm, char *errbuf, CP9trace_t *tr, ESL_DSQ *dsq, int start, float omega, float *ret_sc)
1380 {
1381 int status;
1382 float *p; /* null2 model distribution */
1383 int *sc; /* null2 model scores */
1384 int a; /* residue index counters */
1385 int tpos;
1386 int score;
1387
1388 /* Rarely, the alignment was totally impossible, and tr is NULL.
1389 */
1390 if (tr == NULL) return 0.0;
1391
1392 /* Set up model: average over the emission distributions of
1393 * all M, I states that appear in the trace. Ad hoc? Sure, you betcha.
1394 */
1395 /* trivial preorder traverse, since we're already numbered that way */
1396 ESL_ALLOC(p, sizeof(float) * hmm->abc->K);
1397 esl_vec_FSet(p, hmm->abc->K, 0.0);
1398 for (tpos = 0; tpos < tr->tlen; tpos++) {
1399 if (tr->statetype[tpos] == CSTM) esl_vec_FAdd(p, hmm->mat[tr->nodeidx[tpos]], hmm->abc->K);
1400 else if (tr->statetype[tpos] == CSTI) esl_vec_FAdd(p, hmm->ins[tr->nodeidx[tpos]], hmm->abc->K);
1401 }
1402 esl_vec_FNorm(p, hmm->abc->K);
1403
1404 ESL_ALLOC(sc, sizeof(int) * (hmm->abc->Kp));
1405 /* calculate null2 scores of each possible emission, first the base alphabet */
1406 for (a = 0; a < hmm->abc->K; a++) sc[a] = Prob2Score(p[a], hmm->null[a]);
1407 /* the ambiguities */
1408 for (a = hmm->abc->K+1; a < hmm->abc->Kp-1; a++) sc[a] = esl_abc_IAvgScore(hmm->abc, a, sc);
1409
1410 /* Score all the M,I state emissions that appear in the trace.
1411 */
1412 score = 0;
1413 for (tpos = 0; tpos < tr->tlen; tpos++)
1414 if (tr->statetype[tpos] == CSTM || tr->statetype[tpos] == CSTI) score += sc[dsq[tr->pos[tpos]]];
1415
1416 /* Apply an ad hoc 8 bit fudge factor penalty;
1417 * interpreted as a prior, saying that the second null model is
1418 * 1/2^8 (1/256) as likely as the standard null model
1419 */
1420 score += ((int) (sreLOG2(omega))) * INTSCALE;
1421
1422 /* Return the correction to the bit score.
1423 */
1424 printf("CP9TraceScoreCorrectionNull2() returning %.3f bits\n", Scorify(ILogsum(0, score)));
1425 *ret_sc = Scorify(ILogsum(0, score));
1426 return eslOK;
1427
1428 ERROR:
1429 ESL_FAIL(status, errbuf, "CP9TraceScoreCorrectionNull2(): memory allocation error.");
1430 return status; /* NEVERREACHED*/
1431 }
1432