1 /* cp9_trace.c
2  * EPN, Wed Dec  5 13:05:17 2007
3  *
4  * Note: all of these functions originated in cp9.c [EPN 02.27.06]
5  *
6  * Support for the CM Plan 9 HMM trace CP9trace_t structure.
7  * This was based heavily on HMMER's 2.x data structures.
8  *
9  */
10 
11 #include "esl_config.h"
12 #include "p7_config.h"
13 #include "config.h"
14 
15 #include <stdio.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include <time.h>
19 #include <math.h>
20 
21 #include "easel.h"
22 #include "esl_msa.h"
23 #include "esl_vectorops.h"
24 
25 #include "hmmer.h"
26 
27 #include "infernal.h"
28 
29 /* Function: CP9AllocTrace(), CP9ReallocTrace(), CP9FreeTrace()
30  *
31  * Purpose:  allocation and freeing of traceback structures
32  */
33 void
CP9AllocTrace(int tlen,CP9trace_t ** ret_tr)34 CP9AllocTrace(int tlen, CP9trace_t **ret_tr)
35 {
36   int status;
37   CP9trace_t *tr;
38 
39   ESL_ALLOC(tr, sizeof(CP9trace_t));
40   ESL_ALLOC(tr->statetype, sizeof(char) * tlen);
41   ESL_ALLOC(tr->nodeidx,   sizeof(int)  * tlen);
42   ESL_ALLOC(tr->pos,       sizeof(int)  * tlen);
43   *ret_tr = tr;
44   return;
45 
46  ERROR:
47   cm_Fail("Memory allocation error.");
48 }
49 void
CP9ReallocTrace(CP9trace_t * tr,int tlen)50 CP9ReallocTrace(CP9trace_t *tr, int tlen)
51 {
52   int status;
53   void *tmp;
54 
55   ESL_RALLOC(tr->statetype, tmp, tlen * sizeof(char));
56   ESL_RALLOC(tr->nodeidx,   tmp, tlen * sizeof(int));
57   ESL_RALLOC(tr->pos,       tmp, tlen * sizeof(int));
58   return;
59 
60  ERROR:
61   cm_Fail("Memory reallocation error.");
62 }
63 void
CP9FreeTrace(CP9trace_t * tr)64 CP9FreeTrace(CP9trace_t *tr)
65 {
66   if (tr == NULL) return;
67   free(tr->pos);
68   free(tr->nodeidx);
69   free(tr->statetype);
70   free(tr);
71 }
72 
73 /* Function: CP9_fake_tracebacks()
74  *
75  * Purpose:  From a consensus assignment of columns to MAT/INS, construct fake
76  *           tracebacks for each individual sequence.
77  *
78  * Args:     msa       - msa alignment
79  *           matassign - assignment of column 1 if MAT, 0 if INS;
80  *                       [1..alen] (off one from aseqs)
81  *           ret_tr    - RETURN: array of tracebacks
82  *
83  * Return:   (void)
84  *           ret_tr is alloc'ed here. Caller must free.
85  */
86 void
CP9_fake_tracebacks(ESL_MSA * msa,int * matassign,CP9trace_t *** ret_tr)87 CP9_fake_tracebacks(ESL_MSA *msa, int *matassign, CP9trace_t ***ret_tr)
88 {
89   if(! (msa->flags & eslMSA_DIGITAL))
90     cm_Fail("ERROR in CP9_fake_tracebacks(), msa should be digitized.\n");
91 
92   int  status;
93   CP9trace_t **tr;
94   int  idx;                     /* counter over sequences          */
95   int  i;                       /* position in raw sequence (1..L) */
96   int  k;                       /* position in HMM                 */
97   int  apos;                    /* position in alignment columns   */
98   int  tpos;			/* position in traceback           */
99   int  first_match;             /* first match column */
100   /*int  last_match;*/              /* last match column, not used */
101 
102   ESL_ALLOC(tr, sizeof(CP9trace_t *) * msa->nseq);
103 
104   first_match = -1;
105   /*last_match  = -1;*/
106   for (apos = 0; apos < msa->alen; apos++)
107     {
108       if(matassign[apos+1] && first_match == -1) first_match = apos;
109       /*if(matassign[apos+1]) last_match = apos;*/
110     }
111 
112   for (idx = 0; idx < msa->nseq; idx++)
113     {
114       CP9AllocTrace(msa->alen+2, &tr[idx]);  /* allow room for B & E */
115 
116 				/* all traces start with M_0 state (the B state)... */
117       tr[idx]->statetype[0] = CSTB;
118       tr[idx]->nodeidx[0]   = 0;
119       tr[idx]->pos[0]       = 0;
120 
121       i = 1;
122       k = 0;
123       tpos = 1;
124 
125       for (apos = 0; apos < msa->alen; apos++)
126         {
127 	  tr[idx]->statetype[tpos] = CSTBOGUS; /* bogus, deliberately, to debug */
128 
129 	  if (matassign[apos+1] && !(esl_abc_XIsGap(msa->abc, msa->ax[idx][(apos+1)])))
130 	  {			/* MATCH */
131 	      k++;		/* move to next model pos */
132 	      tr[idx]->statetype[tpos] = CSTM;
133 	      tr[idx]->nodeidx[tpos]   = k;
134 	      tr[idx]->pos[tpos]       = i;
135 	      i++;
136 	      tpos++;
137 	    }
138           else if (matassign[apos+1])
139             {                   /* DELETE */
140 	      /* We should be careful about S/W transitions; but we have
141 	       * an ambiguity, based on the MSA, we can't tell if we
142 	       * did a local begin (some M->E transition) or if we
143 	       * went through a bunch of D state's before the first match
144 	       * B->D_1 -> D_2 .... -> M_x. For now, we assume we're not in
145 	       * S/W mode, and treat it as the latter case, see
146 	       * HMMER's modelmaker.c:fake_tracebacks() for code
147 	       * on one *would* implement the S/W consideration IF
148 	       * there wasn't a B->D_1 transition allowed.
149 	       */
150 	      k++;		/* *always* move on model when match column seen */
151 	      tr[idx]->statetype[tpos] = CSTD;
152 	      tr[idx]->nodeidx[tpos]   = k;
153 	      tr[idx]->pos[tpos]       = 0;
154 	      tpos++;
155             }
156 	  else if (! (esl_abc_XIsGap(msa->abc, msa->ax[idx][(apos+1)])))
157 	    {			/* INSERT */
158 	      tr[idx]->statetype[tpos] = CSTI;
159               tr[idx]->nodeidx[tpos]   = k;
160               tr[idx]->pos[tpos]       = i;
161 	      i++;
162 	      tpos++;
163 	    }
164 	}
165        /* all traces end with E state */
166       /* We should be careful about S/W transitions; but we have
167        * an ambiguity, based on the MSA, we can't tell if we
168        * did a local end (some M->E transition) or if we
169        * went through a bunch of D state's before the final
170        * D_M -> E transition. For now, we assume we're not in
171        * S/W mode, and treat it as the latter case, see
172        * HMMER's modelmaker.c:fake_tracebacks() for code
173        * on one *would* implement the S/W consideration IF
174        * there wasn't a D_M -> E transition allowed.
175        */
176       tr[idx]->statetype[tpos] = CSTE;
177       tr[idx]->nodeidx[tpos]   = 0;
178       tr[idx]->pos[tpos]       = 0;
179       tpos++;
180       tr[idx]->tlen = tpos;
181     }    /* end for sequence # idx */
182 
183   *ret_tr = tr;
184   return;
185 
186  ERROR:
187   cm_Fail("Memory allocation error.");
188 }
189 
190 /* Function: CP9TraceCount()
191  * EPN 09.04.06 based on Eddy's P7TraceCount() from HMMER's trace.c
192  *
193  * Purpose:  Count a traceback into a count-based HMM structure.
194  *           (Usually as part of a model parameter re-estimation.)
195  *           Traceback should not have any EL state visits in it.
196  *
197  * Args:     hmm   - counts-based CM Plan 9 HMM
198  *           dsq   - sequence that traceback aligns to the HMM (1..L)
199  *           wt    - weight on the sequence
200  *           tr    - alignment of seq to HMM
201  *
202  * Return:   (void)
203  */
204 void
CP9TraceCount(CP9_t * hmm,ESL_DSQ * dsq,float wt,CP9trace_t * tr)205 CP9TraceCount(CP9_t *hmm, ESL_DSQ *dsq, float wt, CP9trace_t *tr)
206 {
207   /* contract check */
208   if(dsq == NULL)            cm_Fail("ERROR in CP9TraceCount(), dsq is NULL.");
209   if(hmm->flags & CPLAN9_EL) cm_Fail("CP9TraceCount(), EL states are on, which this function is not setup for.");
210 
211   int tpos;                     /* position in tr */
212   int i;			/* symbol position in seq */
213 
214   for (tpos = 0; tpos < tr->tlen; tpos++)
215     {
216       i = tr->pos[tpos];
217 
218       /* Emission counts.
219        */
220       if (tr->statetype[tpos] == CSTM)
221 	esl_abc_FCount(hmm->abc, hmm->mat[tr->nodeidx[tpos]], dsq[i], wt);
222       else if (tr->statetype[tpos] == CSTI)
223 	esl_abc_FCount(hmm->abc, hmm->ins[tr->nodeidx[tpos]], dsq[i], wt);
224 
225       /* State transition counts
226        */
227       switch (tr->statetype[tpos]) {
228       case CSTB:
229 	switch (tr->statetype[tpos+1]) {
230 	case CSTM: hmm->begin[tr->nodeidx[tpos+1]] += wt; break;
231 	case CSTI: hmm->t[0][CTMI]                 += wt; break;
232 	case CSTD: hmm->t[0][CTMD]                 += wt; break;
233 	default:
234 	  cm_Fail("illegal state transition %s->%s in traceback",
235 	      CP9Statetype(tr->statetype[tpos]),
236 	      CP9Statetype(tr->statetype[tpos+1]));
237 	}
238 	break;
239       case CSTM:
240 	switch (tr->statetype[tpos+1]) {
241 	case CSTM: hmm->t[tr->nodeidx[tpos]][CTMM] += wt; break;
242 	case CSTI: hmm->t[tr->nodeidx[tpos]][CTMI] += wt; break;
243 	case CSTD: hmm->t[tr->nodeidx[tpos]][CTMD] += wt; break;
244 	case CSTE: hmm->end[tr->nodeidx[tpos]]     += wt; break;
245 	default:
246 	  cm_Fail("illegal state transition %s->%s in traceback",
247 	      CP9Statetype(tr->statetype[tpos]),
248 	      CP9Statetype(tr->statetype[tpos+1]));
249 	}
250 	break;
251       case CSTI:
252 	switch (tr->statetype[tpos+1]) {
253 	case CSTM: hmm->t[tr->nodeidx[tpos]][CTIM] += wt; break;
254 	case CSTI: hmm->t[tr->nodeidx[tpos]][CTII] += wt; break;
255 	case CSTD: hmm->t[tr->nodeidx[tpos]][CTID] += wt; break;
256 	case CSTE:
257 	  /* This should only happen from the final insert (I_M) state */
258 	  if((tpos+1) != (tr->tlen-1))
259 	    cm_Fail("illegal state transition %s->%s (I is not final insert) in traceback",
260 		CP9Statetype(tr->statetype[tpos]),
261 		CP9Statetype(tr->statetype[tpos+1]));
262 	  hmm->t[tr->nodeidx[tpos]][CTIM] += wt; break;
263 	  break;
264 	default:
265 	  cm_Fail("illegal state transition %s->%s in traceback",
266 	      CP9Statetype(tr->statetype[tpos]),
267 	      CP9Statetype(tr->statetype[tpos+1]));
268 	}
269 	break;
270       case CSTD:
271 	switch (tr->statetype[tpos+1]) {
272 	case CSTM: hmm->t[tr->nodeidx[tpos]][CTDM] += wt; break;
273 	case CSTI: hmm->t[tr->nodeidx[tpos]][CTDI] += wt; break;
274 	case CSTD: hmm->t[tr->nodeidx[tpos]][CTDD] += wt; break;
275 	case CSTE:
276 	  /* This should only happen from the final delete (D_M) state */
277 	  if((tpos+1) != (tr->tlen-1))
278 	    cm_Fail("illegal state transition %s->%s (D is not final delete) in traceback",
279 		CP9Statetype(tr->statetype[tpos]),
280 		CP9Statetype(tr->statetype[tpos+1]));
281 	  hmm->t[tr->nodeidx[tpos]][CTDM] += wt; break;
282 	  break;
283 	default:
284 	  cm_Fail("illegal state transition %s->%s in traceback",
285 	      CP9Statetype(tr->statetype[tpos]),
286 	      CP9Statetype(tr->statetype[tpos+1]));
287 	}
288 	break;
289       case CSTEL:
290 	cm_Fail("EL in traceback in CP9TraceCount(), this function is being abused.");
291 	break;
292       case CSTE:
293 	break; /* E is the last. It makes no transitions. */
294 
295       default:
296 	cm_Fail("illegal state %s in traceback",
297 	    CP9Statetype(tr->statetype[tpos]));
298       }
299     }
300 }
301 
302 /* Function: CP9TraceScore()
303  *           based on HMMER 2.3.2's P7TraceScore by SRE
304  *
305  * Purpose:  Score a traceback and return the score in scaled bits.
306  * Incept:   EPN, Wed May 30 06:07:14 2007
307  *
308  * Args:     hmm   - HMM with valid log odds scores.
309  *           dsq   - digitized sequence that traceback aligns to the HMM (1..sq->n)
310  *           tr    - alignment of seq to HMM
311  *
312  * Return:   (void)
313  */
314 float
CP9TraceScore(CP9_t * hmm,ESL_DSQ * dsq,CP9trace_t * tr)315 CP9TraceScore(CP9_t *hmm, ESL_DSQ *dsq, CP9trace_t *tr)
316 {
317   int score;			/* total score as a scaled integer */
318   int tpos;                     /* position in tr */
319   char sym;		        /* digitized symbol in dsq */
320 
321   /* Contract check */
322   if(dsq == NULL)
323     cm_Fail("ERROR in CP9TraceScore, dsq is NULL.");
324 
325   /*CP9PrintTrace(stdout, tr, hmm, sq); */
326   score = 0;
327   for (tpos = 0; tpos < tr->tlen-1; tpos++)
328     {
329       sym = dsq[tr->pos[tpos]];
330 
331       /* Emissions from M and I states.
332        */
333       if (tr->statetype[tpos] == CSTM)
334 	score += hmm->msc[(int) sym][tr->nodeidx[tpos]];
335       else if (tr->statetype[tpos] == CSTI)
336 	score += hmm->isc[(int) sym][tr->nodeidx[tpos]];
337 
338       /* State transitions. Including EL emissions, EL emits on transition
339        */
340       score += CP9TransitionScoreLookup(hmm,
341 					tr->statetype[tpos], tr->nodeidx[tpos],
342 					tr->statetype[tpos+1], tr->nodeidx[tpos+1]);
343     }
344   return Scorify(score);
345 }
346 
347 /* Function: CP9Statetype()
348  *
349  * Purpose:  Returns the state type in text.
350  * Example:  CP9Statetype(M) = "M"
351  */
352 char *
CP9Statetype(char st)353 CP9Statetype(char st)
354 {
355   switch (st) {
356   case CSTM: return "M";
357   case CSTD: return "D";
358   case CSTI: return "I";
359   case CSTB: return "B";
360   case CSTE: return "E";
361   case CSTEL: return "L";
362   default: return "BOGUS";
363   }
364 }
365 
366 /* Function: CP9PrintTrace()
367  *           based on HMMER's 2.3.2 P7PrintTrace()
368  *
369  * Purpose:  Print out a traceback structure.
370  *           If hmm is non-NULL, also print transition and emission scores.
371  * Incept:   EPN, Wed May 30 06:07:57 2007
372  *
373  * Args:     fp  - stderr or stdout, often
374  *           tr  - trace structure to print
375  *           hmm - NULL or hmm containing scores to print
376  *           dsq - NULL or digitized sequence trace refers to.
377  */
378 void
CP9PrintTrace(FILE * fp,CP9trace_t * tr,CP9_t * hmm,ESL_DSQ * dsq)379 CP9PrintTrace(FILE *fp, CP9trace_t *tr, CP9_t *hmm, ESL_DSQ *dsq)
380 {
381   /* Contract check */
382   if((dsq != NULL) && (hmm == NULL))
383     cm_Fail("ERROR in CP9PrintTrace, dsq is non-NULL but HMM is NULL.\n");
384 
385   int          tpos;		/* counter for trace position */
386   unsigned int sym;
387   int          sc;
388 
389   if (tr == NULL) {
390     fprintf(fp, " [ trace is NULL ]\n");
391     return;
392   }
393 
394   if (hmm == NULL) {
395     fprintf(fp, "st  node   rpos  - traceback len %d\n", tr->tlen);
396     fprintf(fp, "--  ---- ------\n");
397     for (tpos = 0; tpos < tr->tlen; tpos++) {
398       fprintf(fp, "%1s  %4d %6d\n",
399 	      CP9Statetype(tr->statetype[tpos]),
400 	      tr->nodeidx[tpos],
401 	      tr->pos[tpos]);
402     }
403   } else {
404     if (!(hmm->flags & CPLAN9_HASBITS))
405       cm_Fail("oi, you can't print scores from that hmm, it's not ready.");
406 
407     sc = 0;
408     fprintf(fp, "st  node   rpos  transit emission - traceback len %d\n", tr->tlen);
409     fprintf(fp, "--  ---- ------  ------- --------\n");
410     for (tpos = 0; tpos < tr->tlen; tpos++) {
411       if (dsq != NULL) sym = dsq[tr->pos[tpos]];
412 
413       fprintf(fp, "%1s  %4d %6d  %7d",
414 	      CP9Statetype(tr->statetype[tpos]),
415 	      tr->nodeidx[tpos],
416 	      tr->pos[tpos],
417 	      (tpos < tr->tlen-1) ?
418 	      CP9TransitionScoreLookup(hmm, tr->statetype[tpos], tr->nodeidx[tpos],
419 				    tr->statetype[tpos+1], tr->nodeidx[tpos+1]) : 0);
420 
421       if (tpos < tr->tlen-1)
422 	sc += CP9TransitionScoreLookup(hmm, tr->statetype[tpos], tr->nodeidx[tpos],
423 				       tr->statetype[tpos+1], tr->nodeidx[tpos+1]);
424 
425       if (dsq != NULL) {
426 	if (tr->statetype[tpos] == CSTM)
427 	  {
428 	    fprintf(fp, " %8d %c", hmm->msc[(int) sym][tr->nodeidx[tpos]],
429 		    hmm->abc->sym[(int) sym]);
430 	    sc += hmm->msc[(int) sym][tr->nodeidx[tpos]];
431 	  }
432 	else if (tr->statetype[tpos] == CSTI)
433 	  {
434 	    fprintf(fp, " %8d %c", hmm->isc[(int) sym][tr->nodeidx[tpos]],
435 		    (char) tolower((int) hmm->abc->sym[(int) sym]));
436 	    sc += hmm->isc[(int) sym][tr->nodeidx[tpos]];
437 	  }
438 	else if (tr->statetype[tpos] == CSTEL)
439 	  {
440 	    if(tr->statetype[tpos-1] == CSTEL) /* we will emit on self transit */
441 	      {
442 		fprintf(fp, " %8d %c", 0,
443 			(char) tolower((int) hmm->abc->sym[(int) sym]));
444 	      }
445 	    else /* we just entered EL, no emission */
446 	      {
447 		fprintf(fp, " %8s %c", "-", '-');
448 	      }
449 	  }
450       } else {
451 	fprintf(fp, " %8s %c", "-", '-');
452       }
453 
454 
455       fputs("\n", fp);
456     }
457     fprintf(fp, "                 ------- --------\n");
458     fprintf(fp, "           total: %6d\n\n", sc);
459   }
460 }
461 
462 /* Function: CP9TransitionScoreLookup()
463  *           based on HMMER's 2.3.2 function of same name
464  *
465  * Incept:   EPN, Wed May 30 06:09:04 2007
466  * Purpose:  Convenience function used in CP9PrintTrace() and CP9TraceScore();
467  *           given state types and node indices for a transition,
468  *           return the integer score for that transition.
469  */
470 int
CP9TransitionScoreLookup(CP9_t * hmm,char st1,int k1,char st2,int k2)471 CP9TransitionScoreLookup(CP9_t *hmm, char st1, int k1,
472 			 char st2, int k2)
473 {
474   switch (st1) {
475   case CSTB:
476     switch (st2) {
477     case CSTM: return hmm->bsc[k2];
478     case CSTI: return hmm->tsc[CTMI][0];
479     case CSTD: return hmm->tsc[CTMD][0];
480     default:      cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
481     }
482     break;
483   case CSTM:
484     switch (st2) {
485     case CSTM: return hmm->tsc[CTMM][k1];
486     case CSTI: return hmm->tsc[CTMI][k1];
487     case CSTD: return hmm->tsc[CTMD][k1];
488     case CSTE: return hmm->esc[k1];
489     case CSTEL: return hmm->tsc[CTMEL][k1];
490     default:      cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
491     }
492     break;
493   case CSTI:
494     switch (st2) {
495     case CSTM: return hmm->tsc[CTIM][k1];
496     case CSTI: return hmm->tsc[CTII][k1];
497     case CSTD: return hmm->tsc[CTID][k1];
498     case CSTE: return hmm->tsc[CTIM][k1]; /* This should only happen from the final insert (I_M) state */
499     default:      cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
500     }
501     break;
502   case CSTD:
503     switch (st2) {
504     case CSTM: return hmm->tsc[CTDM][k1];
505     case CSTI: return hmm->tsc[CTDI][k1];
506     case CSTD: return hmm->tsc[CTDD][k1];
507     case CSTE: return hmm->tsc[CTDM][k1]; /* This should only happen from the final delete (D_M) state */
508     default:      cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
509     }
510     break;
511   case CSTEL:
512     switch (st2) {
513     case CSTM: return 0; /* transition to EL penalty incurred when M->EL transition takes place */
514     case CSTE: return 0; /* transition to EL penalty incurred when M->EL transition takes place */
515     case CSTEL: return hmm->el_selfsc; /* penalty for EL->EL self transition loop */
516     default:      cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
517     }
518     break;
519   case CSTE: /* this should never happen, it means we transitioned from E, which is not
520 	      * allowed. */
521     cm_Fail("illegal %s->%s transition", CP9Statetype(st1), CP9Statetype(st2));
522     break;
523   default:        cm_Fail("illegal state %s in traceback", CP9Statetype(st1));
524   }
525   /*NOTREACHED*/
526   return 0;
527 }
528 
529 
530 /* Function: CP9ViterbiTrace()
531  * Date:     EPN, Wed May 30 17:32:05 2007
532  *           based on HMMER 2.3.2's P7ViterbiTrace()
533  *
534  * Purpose:  Traceback of a Viterbi matrix: i.e. retrieval
535  *           of optimum alignment.
536  *
537  * Args:     hmm    - hmm, log odds form, used to make mx
538  *           dsq    - sequence aligned to (digital form) 1..L
539  *           i0     - first residue of sequence, often 1
540  *           j0     - last residue of sequence, often L
541  *           mx     - the matrix to trace back in, L x hmm->M
542  *           ret_tr - RETURN: traceback.
543  *
544  * Return:   (void)
545  *           ret_tr is allocated here. Free using CP9FreeTrace().
546  */
547 void
CP9ViterbiTrace(CP9_t * hmm,ESL_DSQ * dsq,int i0,int j0,CP9_MX * mx,CP9trace_t ** ret_tr)548 CP9ViterbiTrace(CP9_t *hmm, ESL_DSQ *dsq, int i0, int j0,
549 		CP9_MX *mx, CP9trace_t **ret_tr)
550 {
551   /* contract check */
552   if(dsq == NULL)
553     cm_Fail("ERROR in CP9ViterbiTrace(), dsq is NULL.");
554 
555   CP9trace_t *tr;
556   int curralloc;		/* current allocated length of trace */
557   int tpos;			/* position in trace */
558   int i;			/* position in seq (1..N) */
559   int k;			/* position in model (1..M) */
560   int *erow, **mmx, **imx, **dmx, **elmx;
561   int sc;			/* temp var for pre-emission score */
562   int error_flag;
563   int c;                        /* counter over possible EL states */
564 
565   /* Overallocate for the trace.
566    * B- ... - E  : 2 states + N is minimum trace;
567    * add N more as buffer.
568    */
569   curralloc = (j0-i0+1) * 2 + 2;
570   CP9AllocTrace(curralloc, &tr);
571 
572   mmx = mx->mmx;
573   imx = mx->imx;
574   dmx = mx->dmx;
575   elmx= mx->elmx;
576   erow= mx->erow;
577 
578   /* Initialization of trace
579    * We do it back to front; ReverseTrace() is called later.
580    */
581   tr->statetype[0] = CSTE;
582   tr->nodeidx[0]   = 0;
583   tr->pos[0]       = 0;
584   tpos = 1;
585   i    = j0;			/* current i (seq pos) we're trying to assign */
586 
587   /* Traceback
588    */
589   while (tr->statetype[tpos-1] != CSTB) {
590     error_flag = FALSE;
591     switch (tr->statetype[tpos-1]) {
592     case CSTM:			/* M connects from i-1,k-1, B or an EL*/
593       /*printf("CSTM k: %d i:%d \n", k, i);*/
594       sc = mmx[i+1][k+1] - hmm->msc[dsq[i+1]][k+1];
595       if (sc <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
596       else if (sc == mmx[i][k] + hmm->tsc[CTMM][k])
597 	{
598 	  tr->statetype[tpos] = CSTM;
599 	  tr->nodeidx[tpos]   = k--;
600 	  tr->pos[tpos]       = i--;
601 	}
602       else if (sc == imx[i][k] + hmm->tsc[CTIM][k])
603 	{
604 	  tr->statetype[tpos] = CSTI;
605 	  tr->nodeidx[tpos]   = k;
606 	  tr->pos[tpos]       = i--;
607 	}
608       else if (sc == dmx[i][k] + hmm->tsc[CTDM][k])
609 	{
610 	  tr->statetype[tpos] = CSTD;
611 	  tr->nodeidx[tpos]   = k--;
612 	  tr->pos[tpos]       = 0;
613 	}
614       else /* Check if we came from an EL state (could be more than 1 choice) */
615 	{
616 	  error_flag = TRUE;
617 	  /* note we look at el_from_ct[k+1] not k for same reason we look
618 	   * at bsc[k+1] above, we're going backwards, this is a tricky off-by-one */
619 	  for(c = 0; c < hmm->el_from_ct[k+1]; c++) /* el_from_ct[k+1] is >= 0 */
620 	    {
621 	      /* transition penalty to EL incurred when EL was entered */
622 	      if(sc == elmx[i][hmm->el_from_idx[k+1][c]])
623 		{
624 		  tr->statetype[tpos] = CSTEL;
625 		  k = hmm->el_from_idx[(k+1)][c];
626 		  tr->nodeidx[tpos]   = k;
627 		  tr->pos[tpos]       = i--;
628 		  error_flag = FALSE;
629 		  break;
630 		}
631 	    }
632 	}
633       if(error_flag)
634 	{
635 	  /* one last possibility, we came from B, check this last, in
636 	   * case hmm->bsc[k+1] happens to be identical to sc but
637 	   * we're not done the parse yet (i.e. one of the cases
638 	   * above equaled sc). */
639 	  if (sc == hmm->bsc[k+1])
640 	    {
641 	      tr->statetype[tpos] = CSTB;
642 	      tr->nodeidx[tpos]   = 0;
643 	      tr->pos[tpos]       = 0;
644 	      if(tr->pos[tpos-1] != 1)
645 		cm_Fail("traceback failed: premature begin");
646 	      error_flag = FALSE;
647 	    }
648 	}
649       if(error_flag)
650 	cm_Fail("traceback failed");
651       break;
652 
653     case CSTD:			/* D connects from M,D,I, (D_1 also connects from B (M_0) */
654       /*printf("CSTD k: %d i:%d \n", k, i);*/
655        sc = dmx[i][k+1];
656        if (sc <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
657        else if(k == 0) /* D_1 connects from B(M_0), and I_0, when k == 0, we're dealing with D_1, a confusing off-by-one */
658 	{
659 	  if(sc == mmx[i][k] + hmm->tsc[CTMD][k])
660 	    {
661 	      tr->statetype[tpos] = CSTB;
662 	      tr->nodeidx[tpos]   = 0;
663 	      tr->pos[tpos]       = 0;
664 	    }
665 	  else if (sc == imx[i][k] + hmm->tsc[CTID][k])
666 	    {
667 	      tr->statetype[tpos] = CSTI;
668 	      tr->nodeidx[tpos]   = k;
669 	      tr->pos[tpos]       = i--;
670 	    }
671 	  else cm_Fail("traceback failed");
672 	} /* else k != 0 */
673       else if (sc == mmx[i][k] + hmm->tsc[CTMD][k])
674 	{
675 	  tr->statetype[tpos] = CSTM;
676 	  tr->nodeidx[tpos]   = k--;
677 	  tr->pos[tpos]       = i--;
678 	}
679       else if (sc == imx[i][k] + hmm->tsc[CTID][k])
680 	{
681 	  tr->statetype[tpos] = CSTI;
682 	  tr->nodeidx[tpos]   = k;
683 	  tr->pos[tpos]       = i--;
684 	}
685       else if (sc == dmx[i][k] + hmm->tsc[CTDD][k])
686 	{
687 	  tr->statetype[tpos] = CSTD;
688 	  tr->nodeidx[tpos]   = k--;
689 	  tr->pos[tpos]       = 0;
690 	}
691       else cm_Fail("traceback failed");
692       break;
693 
694     case CSTI:			/* I connects from M,I,D, (I_0 connects from B also(*/
695       /*printf("CSTI k: %d i:%d \n", k, i);*/
696       sc = imx[i+1][k] - hmm->isc[dsq[i+1]][k];
697       if (sc <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
698       else if(k == 0) /* I_0 connects from B(M_0), and I_0 */
699 	{
700 	  if(sc == mmx[i][k] + hmm->tsc[CTMI][k])
701 	    {
702 	      tr->statetype[tpos] = CSTB;
703 	      tr->nodeidx[tpos]   = 0;
704 	      tr->pos[tpos]       = 0;
705 	    }
706 	  else if (sc == imx[i][k] + hmm->tsc[CTII][k])
707 	    {
708 	      tr->statetype[tpos] = CSTI;
709 	      tr->nodeidx[tpos]   = k;
710 	      tr->pos[tpos]       = i--;
711 	    }
712 	}
713       /* else k != 0 */
714       else if (sc == mmx[i][k] + hmm->tsc[CTMI][k])
715 	{
716 	  tr->statetype[tpos] = CSTM;
717 	  tr->nodeidx[tpos]   = k--;
718 	  tr->pos[tpos]       = i--;
719 	}
720 
721       else if (sc == imx[i][k] + hmm->tsc[CTII][k])
722 	{
723 	  tr->statetype[tpos] = CSTI;
724 	  tr->nodeidx[tpos]   = k;
725 	  tr->pos[tpos]       = i--;
726 	}
727       else if (sc == dmx[i][k] + hmm->tsc[CTDI][k])
728 	{
729 	  tr->statetype[tpos] = CSTD;
730 	  tr->nodeidx[tpos]   = k--;
731 	  tr->pos[tpos]       = 0;
732 	}
733       else cm_Fail("traceback failed");
734       break;
735 
736     case CSTE:			/* E connects from any M state. k set here
737 				 * also can connect from I_M or D_M (diff from p7)
738 				 * or even EL_M if it exists */
739       if (erow[i] <= -INFTY) { CP9FreeTrace(tr); *ret_tr = NULL; return; }
740       if (erow[i] == imx[i][hmm->M] + hmm->tsc[CTIM][hmm->M])
741 	{
742 	  k = hmm->M;
743 	  tr->statetype[tpos] = CSTI;
744 	  tr->nodeidx[tpos]   = k;
745 	  tr->pos[tpos]       = i--;
746 	}
747       else if (erow[i] == dmx[i][hmm->M] + hmm->tsc[CTDM][hmm->M])
748 	{
749 	  k = hmm->M;
750 	  tr->statetype[tpos] = CSTD;
751 	  tr->nodeidx[tpos]   = k--;
752 	  tr->pos[tpos]       = 0;
753 	}
754       else
755 	{
756 	  error_flag = TRUE;
757 	  for (k = hmm->M; k >= 1; k--)
758 	    if (erow[i] == mmx[i][k] + hmm->esc[k])
759 	      {
760 		tr->statetype[tpos] = CSTM;
761 		tr->nodeidx[tpos]   = k--;
762 		tr->pos[tpos]       = i--;
763 		error_flag = FALSE;
764 		break;
765 	      }
766 	  if(error_flag)
767 	    {
768 	      /* Check if we came from an EL state (could be more than 1 choice) */
769 	      /* hmm->el_from-ct[hmm->M+1] is # of ELs that can transit to E (END) */
770 	      for(c = hmm->el_from_ct[hmm->M+1]-1; c >= 0; c--) /* el_from_ct[] is >= 0 */
771 		{
772 		  /* transition penalty to EL incurred when EL was entered */
773 		  if(erow[i] == elmx[i][hmm->el_from_idx[hmm->M+1][c]])
774 		    {
775 		      tr->statetype[tpos] = CSTEL;
776 		      k = hmm->el_from_idx[(hmm->M+1)][c];
777 		      tr->nodeidx[tpos]   = k;
778 		      tr->pos[tpos]       = i--;
779 		      error_flag = FALSE;
780 		      break;
781 		    }
782 		}
783 	    }
784 	}
785       if (k < 0 || error_flag) cm_Fail("traceback failed");
786       break;
787 
788     case CSTEL:			/* EL connects from certain M states and itself */
789       /*printf("CSTEL k: %d i:%d \n", k, i);*/
790       /* check if we are staying in the EL */
791       sc = elmx[i+1][k];
792       if (sc == elmx[i][k] + hmm->el_selfsc) /* i >= 2, first residue must be emitted by a match, not an EL */
793 	{
794 	  tr->statetype[tpos] = CSTEL;
795 	  tr->nodeidx[tpos]   = k;
796 	  tr->pos[tpos]       = i--;
797 	}
798       else if(sc  == mmx[i+1][k]   + hmm->tsc[CTMEL][k])    /* M->EL->M with 0 self loops in EL */
799 	{
800 	  tr->statetype[tpos] = CSTM;
801 	  tr->nodeidx[tpos]   = k--;
802 	  tr->pos[tpos]       = i+1; /* special case, we decremented i prematurely b/c we
803 				      * had no way of knowing it was our last visit to EL, before
804 				      * we went to M (since we're working backwards this is actually
805 				      * the first visit to EL).
806 				      */
807 	}
808       else cm_Fail("traceback failed");
809       break;
810 
811     default:
812       cm_Fail("traceback failed");
813 
814     } /* end switch over statetype[tpos-1] */
815 
816     tpos++;
817     if (tpos == curralloc)
818       {				/* grow trace if necessary  */
819 	curralloc += (j0-i0+1);
820 	CP9ReallocTrace(tr, curralloc);
821       }
822 
823   } /* end traceback, at S state; tpos == tlen now */
824   tr->tlen = tpos;
825   CP9ReverseTrace(tr);
826 
827   *ret_tr = tr;
828 }
829 
830 /* Function: CP9ReverseTrace()
831  * Date:     EPN, Wed May 30 17:52:18 2007
832  *           identical to SRE's P7ReverseTrace() from HMMER 2.3.2
833  *
834  * Purpose:  Reverse the arrays in a traceback structure.
835  *           Tracebacks from Forward() and Viterbi() are
836  *           collected backwards, and call this function
837  *           when they're done.
838  *
839  *           It's possible to reverse the arrays in place
840  *           more efficiently; but the realloc/copy strategy
841  *           has the advantage of reallocating the trace
842  *           into the right size of memory. (Tracebacks
843  *           overallocate.)
844  *
845  * Args:     tr - the traceback to reverse. tr->tlen must be set.
846  *
847  * Return:   (void)
848  *           tr is modified.
849  */
850 void
CP9ReverseTrace(CP9trace_t * tr)851 CP9ReverseTrace(CP9trace_t *tr)
852 {
853   int    status;
854   char  *statetype;
855   int   *nodeidx;
856   int   *pos;
857   int    opos, npos;
858 
859   /* Allocate
860    */
861   ESL_ALLOC(statetype, sizeof(char)* tr->tlen);
862   ESL_ALLOC(nodeidx,   sizeof(int) * tr->tlen);
863   ESL_ALLOC(pos,       sizeof(int) * tr->tlen);
864 
865   /* Reverse the trace.
866    */
867   for (opos = tr->tlen-1, npos = 0; npos < tr->tlen; npos++, opos--)
868     {
869       statetype[npos] = tr->statetype[opos];
870       nodeidx[npos]   = tr->nodeidx[opos];
871       pos[npos]       = tr->pos[opos];
872     }
873 
874   /* Swap old, new arrays.
875    */
876   free(tr->statetype);
877   free(tr->nodeidx);
878   free(tr->pos);
879   tr->statetype = statetype;
880   tr->nodeidx   = nodeidx;
881   tr->pos       = pos;
882   return;
883 
884  ERROR:
885   cm_Fail("Memory allocation error.");
886 }
887 
888 
889 
890 /* Function: CP9Traces2Alignment()
891  *           based on SRE's P7Traces2Alignment() from HMMER 2.3.2
892  *
893  * Purpose:  Convert an array of traceback structures for a set
894  *           of sequences into a new multiple alignment. Modified
895  *           from HMMER to account for possible EL local-end
896  *           insertions (which don't exist in P7). Including EL
897  *           insertions requires an emit map from the CM.
898  *
899  *           Insertions/ELs are put into lower case and
900  *           are not aligned; instead, Nterm is right-justified,
901  *           Cterm is left-justified, and internal insertions
902  *           are split in half and the halves are justified in
903  *           each direction (the objective being to increase
904  *           the chances of getting insertions aligned well enough
905  *           for them to become a match). SAM gap char conventions
906  *           are used: - in match columns, . in insert columns
907  *
908  * Args:     cm         - the CM the CP9 was built from, needed to get emitmap,
909  *                        so we know where to put EL transitions
910  *           cp9        - the CP9 used to determine the traces (cm->cp9loc or cm->cp9glb)
911  *           abc        - alphabet to use to create the return MSA
912  *           sq         - sequences
913  *           wgt        - weights for seqs, NULL for none
914  *           nseq       - number of sequences
915  *           tr         - array of tracebacks
916  *           do_full    - TRUE to always include all match columns in alignment
917  *           do_matchonly - TRUE to ONLY include match columns
918  *           ret_msa    - MSA, alloc'ed created here
919  *
920  * Return:   eslOK on succes, eslEMEM on memory error.
921  *           MSA structure in ret_msa, caller responsible for freeing.
922  */
923 int
CP9Traces2Alignment(CM_t * cm,CP9_t * cp9,const ESL_ALPHABET * abc,ESL_SQ ** sq,float * wgt,int nseq,CP9trace_t ** tr,int do_full,int do_matchonly,ESL_MSA ** ret_msa)924 CP9Traces2Alignment(CM_t *cm, CP9_t *cp9, const ESL_ALPHABET *abc, ESL_SQ **sq, float *wgt,
925 		    int nseq, CP9trace_t **tr, int do_full, int do_matchonly,
926 		    ESL_MSA **ret_msa)
927 {
928   int status;                   /* easel status flag */
929   ESL_MSA   *msa = NULL;        /* RETURN: new alignment */
930   int    idx;                   /* counter for sequences */
931   int    alen;                  /* width of alignment */
932   int   *maxins = NULL;         /* array of max inserts between aligned columns */
933   int   *maxels = NULL;         /* array of max ELs emissions between aligned columns */
934   int   *matmap = NULL;         /* matmap[k] = apos of match k [1..M] */
935   int    nins;                  /* counter for inserts */
936   int    cpos;                  /* HMM node, consensus position */
937   int    apos;                  /* position in aligned sequence (0..alen-1)*/
938   int    rpos;                  /* position in raw digital sequence (1..L)*/
939   int    tpos;                  /* position counter in traceback */
940   int    epos;                  /* position ctr for EL insertions */
941   int    statetype;		/* type of current state, e.g. STM */
942   CMEmitMap_t *emap = NULL;     /* consensus emit map for the CM */
943   int         *imap = NULL;     /* first apos for an insert following a cpos */
944   int         *elmap = NULL;    /* first apos for an EL following a cpos */
945   int         *matuse = NULL;   /* TRUE if we need a cpos in mult alignment */
946   int         *eluse = NULL;    /* TRUE if we have an EL after cpos in alignment */
947   int        **eposmap = NULL;  /* [seq idx][CP9 node idx] where each EL should emit to */
948   int         *iuse = NULL;     /* TRUE if we have an I after cpos in alignment */
949   CMConsensus_t *con = NULL;    /* consensus information for the CM */
950   int          next_match;      /* used for filling eposmap */
951   int          c;               /* counter over possible EL froms */
952   int         *insleft;         /* [0..cpos..clen] TRUE if inserts *following* cpos should be flush right */
953   int          nd;              /* counter over nodes */
954   int          max_ins_or_el[2];/* for regularizing (splitting) inserts */
955   int          pass_offset[2];  /* for regularizing (splitting) inserts */
956   int          pass;            /* for regularizing (splitting) inserts */
957   char         errbuf[eslERRBUFSIZE];
958 
959   /* Contract checks */
960   if(cp9 == NULL)
961     cm_Fail("ERROR in CP9Traces2Alignment, cp9 is NULL.\n");
962   if(cm->cp9map == NULL)
963     cm_Fail("ERROR in CP9Traces2Alignment, cm->cp9map is NULL.\n");
964   /* We allow the caller to specify the alphabet they want the
965    * resulting MSA in, but it has to make sense (see next few lines). */
966   if(cm->abc->type == eslRNA)
967     {
968       if(abc->type != eslRNA && abc->type != eslDNA)
969 	cm_Fail("ERROR in CP9Traces2Alignment(), cm alphabet is RNA, but requested output alphabet is neither DNA nor RNA.");
970     }
971   else if(cm->abc->K != abc->K)
972     cm_Fail("ERROR in CP9Traces2Alignment(), cm alphabet size is %d, but requested output alphabet size is %d.", cm->abc->K, abc->K);
973 
974   /* create the emit map */
975   emap = CreateEmitMap(cm);
976 
977   /* Determine which direction we emit to for each consensus column,
978    * IL's emit left, IR's emit right, but this info isn't indexed by
979    * consensus column, so we use an emitmap to get it.  This is used
980    * to determine if we go IL before EL or EL before IR when inserting
981    * both regular IL/IR inserts and EL inserts in same place. This is
982    * also used if we have enabled (cm->align_opts &
983    * CM_ALIGN_FLUSHINSERTS) which overides default behavior to split
984    * the inserts and adopts 'flush left for IL / flush right for IR'
985    * behavior (which older versions of Infernal used).
986    */
987   ESL_ALLOC(insleft, sizeof(int) * (emap->clen+1));
988   esl_vec_ISet(insleft, (cm->clen+1), -1);
989   for(nd = 0; nd < cm->nodes; nd++)
990     {
991       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == BEGR_nd || cm->ndtype[nd] == ROOT_nd)
992 	if(insleft[emap->lpos[nd]] == -1) /* deal with sole CM grammar ambiguity */
993 	  insleft[emap->lpos[nd]] = TRUE;
994       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd)
995 	insleft[emap->rpos[nd]-1] = FALSE;
996     }
997   if(insleft[emap->clen] == -1) insleft[emap->clen] = FALSE; /* special case, insleft[emap->clen] == -1 IFF cpos==emap->clen is modelled by MATR or MATP. */
998   /* check we've constructed insleft properly, TEMPORARY */
999   for(cpos = 0; cpos <= emap->clen; cpos++)
1000     ESL_DASSERT1((insleft[cpos] != -1));
1001 
1002   /* Here's the problem. We want to align the match states in columns,
1003    * but some sequences have inserted symbols in them; we need some
1004    * sort of overall knowledge of where the inserts are and how long
1005    * they are in order to create the alignment.
1006    *
1007    * Here's our trick. maxins[] and maxels[] are 0..hmm->M arrays;
1008    * maxins[i] stores the maximum number of times insert substate
1009    * i was used. maxels[i] stores the max number of times an EL insertion
1010    * occurs after insert substate i. maxins[i] + maxels[i]
1011    * is the maximum number of gaps to insert between canonical
1012    * column i and i+1.  maxins[0], maxels[0] is the N-term tail;
1013    * maxins[M], maxels[0] is the C-term tail.
1014    */
1015   ESL_ALLOC(matuse, sizeof(int) * (emap->clen+1));
1016   ESL_ALLOC(eluse,  sizeof(int) * (emap->clen+1));
1017   ESL_ALLOC(iuse,   sizeof(int) * (emap->clen+1));
1018   ESL_ALLOC(maxins, sizeof(int) * (emap->clen+1));
1019   ESL_ALLOC(maxels, sizeof(int) * (emap->clen+1));
1020   ESL_ALLOC(matmap, sizeof(int) * (emap->clen+1));
1021   ESL_ALLOC(imap,   sizeof(int) * (emap->clen+1));
1022   ESL_ALLOC(elmap,  sizeof(int) * (emap->clen+1));
1023   ESL_ALLOC(eposmap,sizeof(int *) * (nseq));
1024 
1025   /* eposmap is 2D b/c different traces can have different epos
1026    * (position where EL inserts) for the same EL state, for example:
1027    * an EL state for node 9 may reconnect at node 25 in one parse
1028    * and node 50 in another if there's a CM MATL node with subtree
1029    * lpos=9 rpos=50, and a CM BEGL node with subtree lpos=9 rpos=25,
1030    * i.e. there are 2 CM EL states being mirrored by 1 HMM EL state.
1031    */
1032   for (cpos = 0; cpos <= emap->clen; cpos++)
1033     {
1034       if(!do_full || cpos == 0)
1035 	matuse[cpos] = 0;
1036       else
1037 	matuse[cpos] = 1;
1038       maxins[cpos] = maxels[cpos] = 0;
1039       iuse[cpos] = eluse[cpos] = imap[cpos] = elmap[cpos] = 0;
1040     }
1041 
1042   /* Look at all the traces; find maximum length of
1043    * insert needed at each of the clen+1 possible gap
1044    * points. (There are two types of insert, I/EL)
1045    * Also find whether we don't need some of the match
1046    * (consensus) columns.
1047    */
1048   for (idx = 0; idx < nseq; idx++)
1049     {
1050       ESL_ALLOC(eposmap[idx], sizeof(int) * (emap->clen+1));
1051       for (cpos = 0; cpos <= emap->clen; cpos++)
1052 	{
1053 	  iuse[cpos] = eluse[cpos] = 0;
1054 	  eposmap[idx][cpos] = -1;
1055 	}
1056 
1057       /* Determine the eposmap, the cpos EL's go into for each cpos for each seq.
1058        * This depends on the first match state entered after each EL, so we go bottom up */
1059       next_match = -1;
1060       for (tpos = tr[idx]->tlen - 1; tpos >= 0; tpos--)
1061 	{
1062 	  statetype = tr[idx]->statetype[tpos]; /* just for clarity */
1063 	  cpos      = tr[idx]->nodeidx[tpos];
1064 	  if(statetype == CSTM) next_match = cpos;
1065 	  if(statetype == CSTE) next_match = cp9->M+1;
1066 	  if(statetype == CSTEL) eposmap[idx][cpos] = next_match; /* this will be overwritten below */
1067 	}
1068       for (cpos = 0; cpos <= emap->clen; cpos++)
1069 	{
1070 	  if(eposmap[idx][cpos] != -1)
1071 	    {
1072 	      /*printf("cpos: %d eposmap[idx][cpos]: %d ct: %d\n", cpos, eposmap[idx][cpos], cp9->el_from_ct[eposmap[idx][cpos]]);*/
1073 	      /* determine the epos based on the CM emit map and cp9->el* data structures */
1074 	      for(c = 0; c < cp9->el_from_ct[eposmap[idx][cpos]]; c++)
1075 		{
1076 		  if(cp9->el_from_idx[eposmap[idx][cpos]][c] == cpos)
1077 		    {
1078 		      eposmap[idx][cpos] = emap->epos[cp9->el_from_cmnd[eposmap[idx][cpos]][c]];
1079 		      break;
1080 		    }
1081 		  if(c == (cp9->el_from_ct[eposmap[idx][cpos]] - 1))
1082 		    cm_Fail("Couldn't determine epos for cpos: %d\n", cpos);
1083 		}
1084 	    }
1085 	}
1086 
1087       for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
1088 	{
1089 	  cpos = tr[idx]->nodeidx[tpos];
1090 	  switch (tr[idx]->statetype[tpos]) {
1091 	    case CSTI: iuse[cpos]++; break;
1092 	  case CSTM: matuse[tr[idx]->nodeidx[tpos]] = 1; break;
1093 	  case CSTEL:
1094 	    eluse[eposmap[idx][cpos]]++;
1095 	    break;
1096 	  case CSTD:
1097 	  case CSTE:
1098 	  case CSTB:
1099 	    break;
1100 	  default:
1101 	    cm_Fail("CP9Traces2Alignment reports unrecognized statetype %c",
1102 		CP9Statetype(tr[idx]->statetype[tpos]));
1103 	  }
1104 	} /* end looking at trace i */
1105       for (cpos = 0; cpos <= emap->clen; cpos++)
1106 	{
1107 	  if (iuse[cpos]  > maxins[cpos]) maxins[cpos]  = iuse[cpos];
1108 	  if (eluse[cpos] > maxels[cpos]) maxels[cpos]  = eluse[cpos]-1; /* EL only emits on self loops */
1109 	}
1110     } /* end calculating lengths used by all traces */
1111 
1112   /***********************************************
1113    * Construct the alignment
1114    ***********************************************/
1115 
1116   /* Now we can calculate the total length of the multiple alignment, alen;
1117    * and the maps imap and  elmap that turn a cpos into an apos
1118    * in the multiple alignment: e.g. for an insert that follows consensus
1119    * position cpos, put it at or after apos = imap[cpos] in aseq[][].
1120    */
1121 
1122   matmap[0] = -1; /* M_0 is B state, non-emitter */
1123   alen = 0;
1124   for (cpos = 0; cpos <= emap->clen; cpos++)
1125     {
1126       if (matuse[cpos])
1127 	{
1128 	  matmap[cpos] = alen;
1129 	  alen++;
1130 	}
1131       else
1132 	matmap[cpos] = -1;
1133 
1134       if(insleft[cpos]) { /* IL state inserts here, IL's go before EL's */
1135 	imap[cpos]  = alen; alen += maxins[cpos];
1136 	elmap[cpos] = alen; alen += maxels[cpos];
1137       }
1138       else { /* IR state inserts here, IR's go after EL's */
1139 	elmap[cpos] = alen; alen += maxels[cpos];
1140 	imap[cpos]  = alen; alen += maxins[cpos];
1141       }
1142     }
1143                                 /* allocation for new alignment */
1144   if ((msa = esl_msa_Create(nseq, alen)) == NULL) { status = eslEMEM; goto ERROR; }
1145   msa->nseq = nseq;
1146   msa->alen = alen;
1147   msa->abc  = (ESL_ALPHABET *) abc;
1148 
1149   for (idx = 0; idx < nseq; idx++)
1150     {
1151       if(sq[idx]->dsq == NULL) cm_Fail("ERROR in CP9Traces2Alignment(), sq's should be digitized.\n");
1152 
1153       for (cpos = 0; cpos <= emap->clen; cpos++)
1154 	iuse[cpos] = eluse[cpos] = 0;
1155       /* blank an aseq */
1156       for (apos = 0; apos < alen; apos++)
1157 	msa->aseq[idx][apos] = '.';
1158       for (cpos = 0; cpos <= emap->clen; cpos++)
1159 	if (matmap[cpos] != -1) msa->aseq[idx][matmap[cpos]] = '-';
1160       msa->aseq[idx][alen] = '\0';
1161 
1162       /* align the sequence */
1163       apos = 0;
1164       for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
1165 	{
1166 	  statetype = tr[idx]->statetype[tpos]; /* just for clarity */
1167 	  rpos      = tr[idx]->pos[tpos];
1168 	  cpos      = tr[idx]->nodeidx[tpos];
1169 
1170 	  if (statetype == CSTM)
1171 	    {
1172 	      apos = matmap[cpos];
1173 	      msa->aseq[idx][apos] = abc->sym[sq[idx]->dsq[rpos]];
1174 	    }
1175 	  else if (statetype == CSTD)
1176 	    apos = matmap[cpos]+1;	/* need for handling D->I; xref STL6/p.117 */
1177 	  else if (statetype == CSTI)
1178 	    {
1179 	      /* flush all inserts left for now, we'll split or flush-right after we're done with all seqs */
1180 	      apos = imap[cpos] + iuse[cpos];
1181 	      msa->aseq[idx][apos] = (char) tolower((int) abc->sym[sq[idx]->dsq[rpos]]);
1182 	      iuse[cpos]++;
1183 	    }
1184 	  else if (statetype == CSTEL)
1185 	    {
1186 	      /*printf("CSTEL cpos: %d rpos: %d epos: %d\n", cpos, rpos);*/
1187 	      epos = eposmap[idx][cpos];
1188 	      if(tr[idx]->statetype[tpos-1] == CSTEL) /* we don't emit on first EL visit */
1189 		{
1190 		  apos = elmap[epos] + eluse[epos];
1191 		  msa->aseq[idx][apos] = (char) tolower((int) abc->sym[sq[idx]->dsq[rpos]]);
1192 		  eluse[epos]++;
1193 		}
1194 	    }
1195 	  else if (statetype == CSTE)
1196 	    apos = matmap[emap->clen]+1;	/* set position for C-term tail */
1197 	}
1198 
1199       /* All insertions (IL/IR/EL) are currently flush-left, but they won't all remain so.
1200        * Two options for what to do:
1201        * 1. Split insertions (this is default):
1202        *    5' extension (ROOT_IL: prior to cpos 1) is right-justified.
1203        *    Internal inserts are split in half
1204        *    3' extension (ROOT_IR: after final cpos) remains left-justified.
1205        * 2. Flush IL's left, IR's right only ON if cm->align_opts & CM_ALIGN_FLUSHINSERTS (this was Infernal pre-1.0 default)
1206        *    use insleft array to determine which type of insert (IL,IR) emits after each
1207        *    consensus column, and flush inserts appropriately.
1208        *
1209        * We have to be careful about EL's. We don't want to group IL/IR's and EL's together and then split them
1210        * because we need to annotate IL/IR's as '.'s in the consensus structure and EL's as '~'. So we split
1211        * each of the 2 group of inserts separately (IL or IR's (their can only be one per position)) and EL's.
1212        * This is done somewhat confusingly (but without repeating too much code) with the
1213        * for (pass = 0; pass <= 1; pass++) loop, and the max_ins_or_el[] and pass_offset[] arrays.
1214        */
1215 
1216       /* deal with inserts before cpos 1, don't think they're can be EL's here, but we leave it in case I'm forgetting.
1217        * if there are EL's they would come after any ROOT_ILs */
1218       if(!(cm->align_opts & CM_ALIGN_FLUSHINSERTS)) /* default behavior, flush ROOT_IL right, else leave ROOT_IL flush left */
1219 	rightjustify(msa->abc, msa->aseq[idx], maxins[0]);
1220       if(!(cm->align_opts & CM_ALIGN_FLUSHINSERTS)) /* default behavior, flush pre-cpos=1 ELs right, else leave them flush left */
1221 	rightjustify(msa->abc, msa->aseq[idx]+maxins[0], maxels[0]);
1222 
1223       for (cpos = 1; cpos < emap->clen; cpos++)
1224 	{
1225 	  if(insleft[cpos]) { /* ILs then ELs */
1226 	    max_ins_or_el[0] = maxins[cpos];
1227 	    max_ins_or_el[1] = maxels[cpos];
1228 	    pass_offset[0]   = 0;
1229 	    pass_offset[1]   = maxins[cpos]; /* we'll have to add this to get to appropriate alignment position when pass==1 */
1230 	  }
1231 	  else { /* ELs then IRs */
1232 	    max_ins_or_el[0] = maxels[cpos];
1233 	    max_ins_or_el[1] = maxins[cpos];
1234 	    pass_offset[0]   = 0;
1235 	    pass_offset[1]   = maxels[cpos]; /* we'll have to add this to get to appropriate alignment position when pass==1 */
1236 	  }
1237 	  for(pass = 0; pass <= 1; pass++)
1238 	    {
1239 	      if (max_ins_or_el[pass]  > 1)
1240 		{
1241 		  apos = matmap[cpos]+1 + pass_offset[pass];
1242 		  if(! (cm->align_opts & CM_ALIGN_FLUSHINSERTS)) /* default behavior, split insert in half */
1243 		    {
1244 		      for (nins = 0; islower((int) (msa->aseq[idx][apos])); apos++)
1245 			nins++;
1246 		      nins /= 2;		/* split the insertion in half */
1247 		      rightjustify(msa->abc, msa->aseq[idx]+matmap[cpos]+1 + pass_offset[pass] + nins, max_ins_or_el[pass]-nins);
1248 		    }
1249 		  /* else revert to pre-1.0 infernal behavior, flush IL's left, and flush IR's right */
1250 		  else if(!(insleft[cpos])) /* only insert right if next consensus column doesn't insert left */
1251 		    rightjustify(msa->abc, msa->aseq[idx] + apos, max_ins_or_el[pass]);
1252 		}
1253 	    }
1254 	}
1255       /* deal with inserts after final cpos 1,
1256        * if there are EL's they would come before any ROOT_IRs */
1257       if(cm->align_opts & CM_ALIGN_FLUSHINSERTS) /* old behavior, flush ROOT_IR right, else (default) leave ROOT_IR flush left */
1258 	  rightjustify(msa->abc, msa->aseq[idx]+matmap[emap->clen]+1, maxels[emap->clen]);
1259       if(cm->align_opts & CM_ALIGN_FLUSHINSERTS) /* old behavior, flush ROOT_IR right, else (default) leave ROOT_IR flush left */
1260 	  rightjustify(msa->abc, msa->aseq[idx]+matmap[emap->clen]+1+maxels[emap->clen], maxins[emap->clen]);
1261     }
1262   /***********************************************
1263    * Build the rest of the MSA annotation.
1264    ***********************************************/
1265 
1266   msa->nseq = nseq;
1267   msa->alen = alen;
1268   ESL_ALLOC(msa->au, sizeof(char) * (strlen(INFERNAL_VERSION)+10));
1269   sprintf(msa->au, "Infernal %s", INFERNAL_VERSION);
1270 
1271   /* copy names and weights */
1272   for (idx = 0; idx < nseq; idx++)
1273     {
1274       if((status = esl_strdup(sq[idx]->name, -1, &(msa->sqname[idx]))) != eslOK) goto ERROR;
1275       if (wgt == NULL) msa->wgt[idx] = 1.0;
1276       else             msa->wgt[idx] = wgt[idx];
1277     }
1278 
1279   /* Construct the secondary structure consensus line, msa->ss_cons:
1280    *       IL, IR are annotated as .
1281    *       EL is annotated as ~
1282    *       and match columns use the structure code.
1283    * Also the primary sequence consensus/reference coordinate system line,
1284    * msa->rf.
1285    */
1286   ESL_ALLOC(msa->ss_cons, (sizeof(char) * (alen+1)));
1287   ESL_ALLOC(msa->rf,      (sizeof(char) * (alen+1)));
1288   con = CreateCMConsensus(cm, abc);
1289 
1290   for (cpos = 0; cpos <= emap->clen; cpos++)
1291     {
1292       if (matuse[cpos])
1293 	{ /* CMConsensus is off-by-one right now, 0..clen-1 relative to cpos's 1..clen */
1294 	  if (con->ct[cpos-1] != -1 && matuse[con->ct[cpos-1]+1] == 0) {
1295 	    msa->ss_cons[matmap[cpos]] = '.';
1296 	    msa->rf[matmap[cpos]]      = con->cseq[cpos-1];
1297 	  } else {
1298 	    msa->ss_cons[matmap[cpos]] = con->cstr[cpos-1];
1299 	    msa->rf[matmap[cpos]]      = con->cseq[cpos-1];
1300 	  }
1301 	}
1302       if (maxins[cpos] > 0)
1303 	for (apos = imap[cpos]; apos < imap[cpos] + maxins[cpos]; apos++)
1304 	  {
1305 	    msa->ss_cons[apos] = '.';
1306 	    msa->rf[apos] = '.';
1307 	  }
1308       if (maxels[cpos] > 0)
1309 	{
1310 	  for (apos = elmap[cpos]; apos < elmap[cpos] + maxels[cpos]; apos++)
1311 	  {
1312 	    msa->ss_cons[apos] = '~';
1313 	    msa->rf[apos] = '~';
1314 	  }
1315 	}
1316     }
1317   msa->ss_cons[alen] = '\0';
1318   msa->rf[alen] = '\0';
1319 
1320   /* If we only want the match columns, shorten the alignment
1321    * by getting rid of the inserts. (Alternatively we could probably
1322    * simplify the building of the alignment, but all that pretty code
1323    * above already existed, so we do this post-msa-building shortening).
1324    */
1325   if(do_matchonly)
1326     {
1327       int *useme;
1328       ESL_ALLOC(useme, sizeof(int) * (msa->alen));
1329       esl_vec_ISet(useme, msa->alen, FALSE);
1330       for(cpos = 0; cpos <= emap->clen; cpos++)
1331 	if(matmap[cpos] != -1) useme[matmap[cpos]] = TRUE;
1332       if((status = esl_msa_ColumnSubset(msa, errbuf, useme)) != eslOK) return status;
1333       free(useme);
1334     }
1335 
1336   /* Free and return */
1337   FreeCMConsensus(con);
1338   FreeEmitMap(emap);
1339   free(eluse);
1340   free(iuse);
1341   free(matuse);
1342   free(maxins);
1343   free(maxels);
1344   free(matmap);
1345   free(imap);
1346   free(elmap);
1347   free(insleft);
1348   esl_Free2D((void **) eposmap, nseq);
1349   *ret_msa = msa;
1350   return eslOK;
1351 
1352  ERROR:
1353   if(con   != NULL)  FreeCMConsensus(con);
1354   if(emap  != NULL)  FreeEmitMap(emap);
1355   if(matuse!= NULL)  free(matuse);
1356   if(iuse != NULL)   free(iuse);
1357   if(elmap != NULL)  free(elmap);
1358   if(maxels!= NULL)  free(maxels);
1359   if(matmap!= NULL)  free(matmap);
1360   esl_Free2D((void **) eposmap, nseq);
1361   if(msa   != NULL)  esl_msa_Destroy(msa);
1362   return status;
1363 }
1364 
1365 
1366 /* Function: CP9TraceScoreCorrectionNull2()
1367  * Based on HMMER2's TraceScoreCorrection()
1368  * Date:     Sun Dec 21 12:05:47 1997 [StL]
1369  *
1370  * Purpose:  Calculate a correction (in integer log_2 odds) to be
1371  *           applied to a sequence, using a second null model,
1372  *           based on a traceback. M/I emissions are corrected;
1373  *           The null model is constructed /post hoc/ as the
1374  *           average over all the M,I distributions used by the trace.
1375  *
1376  * Return:   the log_2-odds score correction.
1377  */
1378 int
CP9TraceScoreCorrectionNull2(CP9_t * hmm,char * errbuf,CP9trace_t * tr,ESL_DSQ * dsq,int start,float omega,float * ret_sc)1379 CP9TraceScoreCorrectionNull2(CP9_t *hmm, char *errbuf, CP9trace_t *tr, ESL_DSQ *dsq, int start, float omega, float *ret_sc)
1380 {
1381   int status;
1382   float *p;		/* null2 model distribution */
1383   int *sc;	        /* null2 model scores       */
1384   int   a;              /* residue index counters */
1385   int   tpos;
1386   int score;
1387 
1388   /* Rarely, the alignment was totally impossible, and tr is NULL.
1389    */
1390   if (tr == NULL) return 0.0;
1391 
1392   /* Set up model: average over the emission distributions of
1393    * all M, I states that appear in the trace. Ad hoc? Sure, you betcha.
1394    */
1395   /* trivial preorder traverse, since we're already numbered that way */
1396   ESL_ALLOC(p, sizeof(float) * hmm->abc->K);
1397   esl_vec_FSet(p, hmm->abc->K, 0.0);
1398   for (tpos = 0; tpos < tr->tlen; tpos++) {
1399      if      (tr->statetype[tpos] == CSTM) esl_vec_FAdd(p, hmm->mat[tr->nodeidx[tpos]], hmm->abc->K);
1400      else if (tr->statetype[tpos] == CSTI) esl_vec_FAdd(p, hmm->ins[tr->nodeidx[tpos]], hmm->abc->K);
1401   }
1402   esl_vec_FNorm(p, hmm->abc->K);
1403 
1404   ESL_ALLOC(sc,  sizeof(int) * (hmm->abc->Kp));
1405   /* calculate null2 scores of each possible emission, first the base alphabet */
1406   for (a = 0; a < hmm->abc->K; a++)     sc[a] = Prob2Score(p[a], hmm->null[a]);
1407   /* the ambiguities */
1408   for (a = hmm->abc->K+1; a < hmm->abc->Kp-1; a++) sc[a] = esl_abc_IAvgScore(hmm->abc, a, sc);
1409 
1410   /* Score all the M,I state emissions that appear in the trace.
1411    */
1412    score = 0;
1413    for (tpos = 0; tpos < tr->tlen; tpos++)
1414      if (tr->statetype[tpos] == CSTM || tr->statetype[tpos] == CSTI) score += sc[dsq[tr->pos[tpos]]];
1415 
1416    /* Apply an ad hoc 8 bit fudge factor penalty;
1417     * interpreted as a prior, saying that the second null model is
1418     * 1/2^8 (1/256) as likely as the standard null model
1419     */
1420    score += ((int) (sreLOG2(omega))) * INTSCALE;
1421 
1422    /* Return the correction to the bit score.
1423     */
1424    printf("CP9TraceScoreCorrectionNull2() returning %.3f bits\n", Scorify(ILogsum(0, score)));
1425    *ret_sc = Scorify(ILogsum(0, score));
1426    return eslOK;
1427 
1428  ERROR:
1429   ESL_FAIL(status, errbuf, "CP9TraceScoreCorrectionNull2(): memory allocation error.");
1430   return status; /* NEVERREACHED*/
1431 }
1432