1 /* display.c
2  * SRE, Thu May 23 08:18:05 2002 [St. Louis]
3  *
4  * Routines for formatting and displaying parse trees
5  * for output.
6  */
7 
8 #include "esl_config.h"
9 #include "p7_config.h"
10 #include "config.h"
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <math.h>
17 #include <time.h>
18 
19 #include "easel.h"
20 #include "esl_getopts.h"
21 #include "esl_stack.h"
22 #include "esl_vectorops.h"
23 
24 #include "hmmer.h"
25 
26 #include "infernal.h"
27 
28 static int *createMultifurcationOrderChart(CM_t *cm);
29 static int  bp_is_canonical(char lseq, char rseq);
30 static void createFaceCharts(CM_t *cm, int **ret_inface, int **ret_outface);
31 
32 /* Function:  CreateFancyAli()
33  * Incept:    SRE, Thu May 23 13:46:09 2002 [St. Louis]
34  *
35  * Purpose:   Given a trace (and the model and sequence it corresponds
36  *            to), create a pairwise alignment for display; return in a Fancyali_t
37  *            structure.
38  *
39  * Args:      abc   - alphabet to create alignment with (often cm->abc)
40  *            tr    - parsetree for cm aligned to dsq
41  *            cm    - model
42  *            cons  - consensus information for cm; see CreateCMConsensus()
43  *            dsq   - digitized sequence
44  *            do_noncanonical - mark half-bps and negative scoring bps that are non-canonicals in top line with 'v'
45  *                              (by default, all negative scoring and half-bps are marked with 'x')
46  *            pcode - posterior code
47  *
48  *
49  * Returns:   fancy alignment structure.
50  *            Caller frees, with FreeFancyAli(ali).
51  *
52  * Xref:      STL6 p.58
53  */
54 Fancyali_t *
CreateFancyAli(const ESL_ALPHABET * abc,Parsetree_t * tr,CM_t * cm,CMConsensus_t * cons,ESL_DSQ * dsq,int do_noncanonical,char * pcode)55 CreateFancyAli(const ESL_ALPHABET *abc, Parsetree_t *tr, CM_t *cm, CMConsensus_t *cons, ESL_DSQ *dsq, int do_noncanonical, char *pcode)
56 {
57   int         status;
58   Fancyali_t *ali;              /* alignment structure we're building        */
59   ESL_STACK  *pda;              /* pushdown automaton used to traverse trace */
60   int         type;		/* type of pda move: PDA_RESIDUE, PDA_STATE  */
61   int         v;		/* state index       */
62   int         nd;		/* node index        */
63   int         ti;		/* position in trace */
64   int         qinset, tinset;	/* # consensus nt skipped by an EL, or in an EL */
65   int         ninset;		/* max # nt in an EL     */
66   int         pos;		/* position in growing ali */
67   int         lc, rc;		/* indices for left, right pos in consensus */
68   int         symi, symj;
69   int         d;
70   char        mode;
71   int         lrf, rrf;         /* chars in reference line; left, right     */
72   int         lstr, rstr;	/* chars in structure line; left, right      */
73   int         lcons, rcons;	/* chars in consensus line; left, right      */
74   int         lmid, rmid;	/* chars in ali quality line; left, right    */
75   int         ltop, rtop;	/* chars in optional noncompensatory line; left, right */
76   int         lnegnc, rnegnc;	/* chars in optional noncanonical line; left, right */
77   int         lseq, rseq;	/* chars in aligned target line; left, right */
78   int         lpost, rpost;	/* chars in aligned posteriors, left, right  */
79   int         do_left, do_right;/* flags to generate left, right             */
80   int cpos_l, cpos_r;   	/* positions in consensus (1..clen)          */
81   int spos_l, spos_r;		/* positions in dsq (1..L)                   */
82   int have_pcodes;
83 
84   /* Contract check. We allow the caller to specify the alphabet they want the
85    * resulting MSA in, but it has to make sense (see next few lines). */
86   if(cm->abc->type == eslRNA)
87     {
88       if(abc->type != eslRNA && abc->type != eslDNA)
89 	cm_Fail("ERROR in CreateFancyAli(), cm alphabet is RNA, but requested output alphabet is neither DNA nor RNA.");
90     }
91   else if(cm->abc->K != abc->K)
92     cm_Fail("ERROR in CreateFancyAli(), cm alphabet size is %d, but requested output alphabet size is %d.", cm->abc->K, abc->K);
93 
94   ESL_ALLOC(ali, sizeof(Fancyali_t));
95   have_pcodes = (pcode != NULL) ? TRUE : FALSE;
96 
97   /* Calculate length of the alignment display.
98    *   MATP node        : +2
99    *   MATL, MATR node  : +1
100    *   IL, IR state     : +1
101    *   EL:              : 4 + width of length display : "*[nn]*"
102    *   anything else    : 0.
103    */
104   ali->len = 0;
105   for (ti = 0; ti < tr->n; ti++)
106     {
107       v  = tr->state[ti];
108       if (v == cm->M) {  /* special case: local exit into EL */
109 	nd = cm->ndidx[tr->state[ti-1]]; /* calculate node that EL replaced */
110 	qinset     = cons->rpos[nd] - cons->lpos[nd] + 1;
111 	tinset     = tr->emitr[ti]  - tr->emitl[ti]  + 1;
112 	ninset     = ESL_MAX(qinset,tinset);
113 	ali->len += 4;
114 	do { ali->len++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
115 	continue;
116       } else {
117 	nd = cm->ndidx[v];
118 	if      (cm->sttype[v]  == IL_st   || cm->sttype[v]  == IR_st)
119 	  ali->len += 1;
120 	else if (cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd)
121 	  ali->len += 1;
122 	else if (cm->ndtype[nd] == MATP_nd)
123 	  ali->len += 2;
124       }
125       /* Catch marginal-type local ends and treat them like EL for output */
126       if ((tr->nxtl[ti] == -1) && (cm->sttype[v] != E_st)) {
127 	nd = cm->ndidx[tr->state[ti]];
128 	qinset     = cons->rpos[nd] - cons->lpos[nd] + 1;
129 	tinset     = tr->emitr[ti]  - tr->emitl[ti]  + 1;
130         if (tinset > 0) tinset--;
131 	ninset     = ESL_MAX(qinset,tinset);
132 	ali->len += 4;
133 	do { ali->len++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
134       }
135     }
136 
137   /* Allocate and initialize.
138    * Blank the reference lines (memset calls) - only needed
139    * because of the way we deal w/ EL.
140    */
141   if (cm->rf != NULL )
142     ESL_ALLOC(ali->rf, sizeof(char) * (ali->len+1));
143   else
144     ali->rf = NULL;
145   ESL_ALLOC(ali->cstr, sizeof(char) * (ali->len+1));
146   ESL_ALLOC(ali->cseq, sizeof(char) * (ali->len+1));
147   ESL_ALLOC(ali->mid,  sizeof(char) * (ali->len+1));
148   ESL_ALLOC(ali->top,  sizeof(char) * (ali->len+1));
149   ESL_ALLOC(ali->aseq, sizeof(char) * (ali->len+1));
150   if(have_pcodes) {
151     ESL_ALLOC(ali->pcode, sizeof(char) * (ali->len+1));
152   }
153   else {
154     ali->pcode = NULL;
155   }
156   ESL_ALLOC(ali->scoord, sizeof(int)  * ali->len);
157   ESL_ALLOC(ali->ccoord, sizeof(int)  * ali->len);
158 
159   if (cm->rf != NULL) memset(ali->rf, ' ', ali->len);
160   memset(ali->cstr, ' ', ali->len);
161   memset(ali->cseq, ' ', ali->len);
162   memset(ali->mid,  ' ', ali->len);
163   memset(ali->top,  ' ', ali->len);
164   memset(ali->aseq, ' ', ali->len);
165   if(have_pcodes) {
166     memset(ali->pcode, ' ', ali->len);
167   }
168   for (pos = 0; pos < ali->len; pos++)
169     ali->ccoord[pos] = ali->scoord[pos] = 0;
170 
171   /* Fill in the lines: traverse the traceback.
172    */
173   pos = 0;
174   if((pda = esl_stack_ICreate()) == NULL) goto ERROR;
175   if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
176   if((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
177 
178   while (esl_stack_IPop(pda, &type) != eslEOD)
179     {
180       if (type == PDA_RESIDUE) {
181 	if (cm->rf != NULL) {
182 	  esl_stack_IPop(pda, &rrf);
183 	  ali->rf[pos] = rrf;
184 	}
185 	esl_stack_IPop(pda, &rstr); 	  ali->cstr[pos]   = rstr;
186 	esl_stack_IPop(pda, &rcons);	  ali->cseq[pos]   = rcons;
187 	esl_stack_IPop(pda, &rmid);	  ali->mid[pos]    = rmid;
188 	esl_stack_IPop(pda, &rtop);	  ali->top[pos]    = rtop;
189 	esl_stack_IPop(pda, &rseq);       ali->aseq[pos]   = rseq;
190 	if(have_pcodes) {
191 	  esl_stack_IPop(pda, &rpost);    ali->pcode[pos] = rpost;
192 	}
193 	esl_stack_IPop(pda, &cpos_r);     ali->ccoord[pos] = cpos_r;
194 	esl_stack_IPop(pda, &spos_r);     ali->scoord[pos] = spos_r;
195 	pos++;
196 	continue;
197       }
198 
199       /* Else, we're PDA_STATE - e.g. dealing with a trace node.
200        */
201       esl_stack_IPop(pda, &ti);
202       v = tr->state[ti];
203 
204       /* Deal with EL (local ends, state M) as a special case.
205        * We get away with only writing into aseq because we've
206        * memset() the display strings to blank.
207        */
208       if (v == cm->M) {
209 	int numwidth;		/* number of chars to leave for displaying width numbers */
210 
211 	nd = 1 + cm->ndidx[tr->state[ti-1]]; /* calculate node that EL replaced */
212 	qinset     = cons->rpos[nd] - cons->lpos[nd] + 1;
213 	tinset     = tr->emitr[ti]  - tr->emitl[ti]  + 1;
214 	ninset     = ESL_MAX(qinset,tinset);
215 	numwidth = 0; do { numwidth++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
216 	memset(ali->cstr+pos,  '~', numwidth+4);
217 	sprintf(ali->cseq+pos, "*[%*d]*", numwidth, qinset);
218 	sprintf(ali->aseq+pos, "*[%*d]*", numwidth, tinset);
219 	/* do nothing for posteriors here, they'll stay as they were init'ed, as ' ' */
220 	pos += 4 + numwidth;
221 	continue;
222       }
223 
224       /* Fetch some info into tmp variables, for "clarity"
225        */
226       nd = cm->ndidx[v];	  /* what CM node we're in */
227       lc   = cons->lpos[nd];	  /* where CM node aligns to in consensus */
228       rc   = cons->rpos[nd];
229       symi = dsq[tr->emitl[ti]];  /* residue indices that node is aligned to */
230       symj = dsq[tr->emitr[ti]];
231       if(have_pcodes) { /* posterior codes are indexed 0..alen-1, off-by-one w.r.t dsq */
232 	lpost = '.'; /* init to gap, if it corresponds to a residue, we'll reset it below */
233 	rpost = '.'; /* init to gap, if it corresponds to a residue, we'll reset it below */
234       }
235       d = tr->emitr[ti] - tr->emitl[ti] + 1;
236       mode = tr->mode[ti];
237 
238       /* Calculate four of the five lines: rf, str, cons, and seq.
239        */
240       do_left = do_right = FALSE;
241       if (cm->sttype[v] == IL_st) {
242 	do_left = TRUE;
243 	if (cm->rf != NULL) lrf = '.';
244 	lstr    = '.';
245 	lcons   = '.';
246 	if (mode == TRMODE_J || mode == TRMODE_L) lseq = tolower((int) abc->sym[symi]);
247         else                        lseq = '~';
248 	cpos_l  = 0;
249 	spos_l  = tr->emitl[ti];
250 	if(pcode != NULL) { lpost = pcode[tr->emitl[ti]-1]; } /* watch off-by-one w.r.t. dsq */
251       } else if (cm->sttype[v] == IR_st) {
252 	do_right = TRUE;
253 	if (cm->rf != NULL) rrf = '.';
254 	rstr    = '.';
255 	rcons   = '.';
256 	if (mode == TRMODE_J || TRMODE_R) rseq = tolower((int) abc->sym[symj]);
257         else                        rseq = '~';
258 	cpos_r  = 0;
259 	spos_r  = tr->emitr[ti];
260 	if(pcode != NULL) { rpost = pcode[tr->emitr[ti]-1]; } /* watch off-by-one w.r.t. dsq */
261       } else {
262 	if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) {
263 	  do_left = TRUE;
264 	  if (cm->rf != NULL) lrf = cm->rf[lc];
265 	  lstr   = cons->cstr[lc];
266 	  lcons  = cons->cseq[lc];
267 	  cpos_l = lc+1;
268 	  if (cm->sttype[v] == MP_st || cm->sttype[v] == ML_st) {
269 	    if      (mode == TRMODE_J)         lseq = abc->sym[symi];
270             else if (mode == TRMODE_L && d>0 ) lseq = abc->sym[symi];
271             else                        lseq = '~';
272 	    spos_l = tr->emitl[ti];
273 	    if(pcode != NULL) { lpost = pcode[tr->emitl[ti]-1]; } /* watch off-by-one w.r.t. dsq */
274 	  } else {
275 	    if (mode == TRMODE_J || mode == TRMODE_L) lseq = '-';
276             else                        lseq = '~';
277 	    spos_l = 0;
278 	    /* lpost remains as it was init'ed as a gap '.' */
279 	  }
280 	}
281 	if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) {
282 	  do_right = TRUE;
283 	  if (cm->rf != NULL) rrf = cm->rf[rc];
284 	  rstr   = cons->cstr[rc];
285 	  rcons  = cons->cseq[rc];
286 	  cpos_r = rc+1;
287 	  if (cm->sttype[v] == MP_st || cm->sttype[v] == MR_st) {
288 	    if      (mode == TRMODE_J)         rseq = abc->sym[symj];
289             else if (TRMODE_R && d>0 ) rseq = abc->sym[symj];
290             else                        rseq = '~';
291 	    spos_r = tr->emitr[ti];
292 	    if(pcode != NULL) { rpost = pcode[tr->emitr[ti]-1]; } /* watch off-by-one w.r.t. dsq */
293 	  } else {
294 	    if (mode == TRMODE_J || TRMODE_R) rseq = '-';
295             else                        rseq = '~';
296 	    spos_r = 0;
297 	    /* rpost remains as it was init'ed as a gap '.' */
298 	  }
299 	}
300       }
301 
302       /* Use emission p and score to set lmid, rmid line for emitting states.
303        */
304       lmid = rmid = ' ';
305       ltop = rtop = ' ';
306       lnegnc = rnegnc = ' ';
307       if (cm->sttype[v] == MP_st) {
308 	if (lseq == toupper(lcons) && rseq == toupper(rcons))
309 	  {
310 	    lmid = lseq;
311 	    rmid = rseq;
312 	  }
313         else if (mode != 3)
314           {
315             if (mode == TRMODE_L && lseq == toupper(lcons)) lmid = lseq;
316             if (TRMODE_R && rseq == toupper(rcons)) rmid = rseq;
317           }
318 	else if (DegeneratePairScore(cm->abc, cm->esc[v], symi, symj) >= 0)
319 	  lmid = rmid = ':';
320 
321 	/* determine ltop, rtop for optional noncompensatory annotation, they are 'x' if lmid, rmid are ' ', and ' ' otherwise */
322 	if (lmid == ' ' && rmid == ' ')
323 	  ltop = rtop = 'x';
324 
325 	/* determine lnegnc, rnegnc for optional negative scoring non-canonical annotation, they are 'v' if lseq and rseq are a negative scoring non-canonical (not a AU,UA,GC,CG,GU,UG) pair */
326 	if ((mode == TRMODE_J) && (DegeneratePairScore(cm->abc, cm->esc[v], symi, symj) < 0) && (! bp_is_canonical(lseq, rseq))) {
327 	  lnegnc = rnegnc = 'v';
328 	}
329       } else if (cm->sttype[v] == ML_st || cm->sttype[v] == IL_st) {
330 	if (lseq == toupper(lcons))
331 	  lmid = lseq;
332         else if ( (mode != 3) && (mode != 2) )
333           ;
334 	else if(esl_abc_FAvgScore(cm->abc, symi, cm->esc[v]) > 0)
335 	  lmid = '+';
336       } else if (cm->sttype[v] == MR_st || cm->sttype[v] == IR_st) {
337 	if (rseq == toupper(rcons))
338 	  rmid = rseq;
339         else if ( (mode != 3) && (mode != 1) )
340           ;
341 	else if(esl_abc_FAvgScore(cm->abc, symj, cm->esc[v]) > 0)
342 	  rmid = '+';
343       }
344       if(cm->stid[v] == MATP_ML || cm->stid[v] == MATP_MR) {
345 	if(mode == TRMODE_J) {
346 	  ltop = rtop = 'x';     /* mark non-truncated half base-pairs (MATP_ML or MATP_MR) with 'x' */
347 	  lnegnc = rnegnc = 'v'; /* mark non-truncated half base-pairs (MATP_ML or MATP_MR) with 'v' */
348 	}
349       }
350       /* If we're storing a residue leftwise - just do it.
351        * If rightwise - push it onto stack.
352        */
353       if (do_left) {
354 	if (cm->rf != NULL) ali->rf[pos] = lrf;
355 	ali->cstr[pos]   = lstr;
356 	ali->cseq[pos]   = lcons;
357 	ali->mid[pos]    = lmid;
358 	ali->top[pos]    = (do_noncanonical) ? lnegnc : ltop;
359 	ali->aseq[pos]   = lseq;
360 	if(have_pcodes) {
361 	  ali->pcode[pos] = lpost;
362 	}
363 	ali->ccoord[pos] = cpos_l;
364 	ali->scoord[pos] = spos_l;
365 	pos++;
366       }
367       if (do_right) {
368 	if ((status = esl_stack_IPush(pda, spos_r)) != eslOK) goto ERROR;
369 	if ((status = esl_stack_IPush(pda, cpos_r)) != eslOK) goto ERROR;
370 	if(have_pcodes) {
371 	  if ((status = esl_stack_IPush(pda, (int) rpost)) != eslOK) goto ERROR;
372 	}
373 	if ((status = esl_stack_IPush(pda, (int) rseq)) != eslOK) goto ERROR;
374 	if (do_noncanonical) { if ((status = esl_stack_IPush(pda, (int) rnegnc)) != eslOK) goto ERROR; }
375 	else                 { if ((status = esl_stack_IPush(pda, (int) rtop))   != eslOK) goto ERROR; }
376 	if ((status = esl_stack_IPush(pda, (int) rmid)) != eslOK) goto ERROR;
377 	if ((status = esl_stack_IPush(pda, (int) rcons)) != eslOK) goto ERROR;
378 	if ((status = esl_stack_IPush(pda, (int) rstr)) != eslOK) goto ERROR;
379 	if (cm->rf != NULL) if ((status = esl_stack_IPush(pda, (int) rrf)) != eslOK) goto ERROR;
380 	if ((status = esl_stack_IPush(pda, PDA_RESIDUE)) != eslOK) goto ERROR;
381       }
382 
383       /* Push the child trace nodes onto the PDA;
384        * right first, so it pops last.
385        */
386       if (tr->nxtr[ti] != -1) {
387 	if ((status = esl_stack_IPush(pda, tr->nxtr[ti])) != eslOK) goto ERROR;
388 	if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
389       }
390       if (tr->nxtl[ti] != -1) {
391 	if ((status = esl_stack_IPush(pda, tr->nxtl[ti])) != eslOK) goto ERROR;
392 	if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
393       }
394       else if (cm->sttype[v] != E_st) {
395         /* Catch marginal-type local ends, treat like EL for output */
396 	int numwidth;		/* number of chars to leave for displaying width numbers */
397 
398 	nd = 1 + cm->ndidx[tr->state[ti]]; /* calculate node that EL replaced */
399 	qinset     = cons->rpos[nd] - cons->lpos[nd] + 1;
400 	tinset     = tr->emitr[ti]  - tr->emitl[ti]  + 1;
401         if (tinset > 0) tinset--;
402 	ninset     = ESL_MAX(qinset,tinset);
403 	numwidth = 0; do { numwidth++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
404 	memset(ali->cstr+pos,  '~', numwidth+4);
405 	sprintf(ali->cseq+pos, "*[%*d]*", numwidth, qinset);
406 	sprintf(ali->aseq+pos, "*[%*d]*", numwidth, tinset);
407 	/* do nothing for posteriors here, they'll stay as they were init'ed, as ' ' */
408 	pos += 4 + numwidth;
409       }
410     } /* end loop over the PDA; PDA now empty */
411 
412   if (cm->rf != NULL) ali->rf[ali->len] = '\0';
413   ali->cstr[ali->len] = '\0';
414   ali->cseq[ali->len] = '\0';
415   ali->mid[ali->len]  = '\0';
416   ali->top[ali->len]  = '\0';
417   ali->aseq[ali->len] = '\0';
418   if(have_pcodes) {
419     ali->pcode[ali->len] = '\0';
420   }
421   /* Laboriously determine the maximum bounds.
422    */
423   ali->sqfrom = 0;
424   for (pos = 0; pos < ali->len; pos++)
425     if (ali->scoord[pos] != 0) {
426       ali->sqfrom = ali->scoord[pos];
427       break;
428     }
429   ali->sqto = 0;
430   for (pos = 0; pos < ali->len; pos++)
431     if (ali->scoord[pos] != 0) ali->sqto = ali->scoord[pos];
432   ali->cfrom = 0;
433   for (pos = 0; pos < ali->len; pos++)
434     if (ali->ccoord[pos] != 0) {
435       ali->cfrom = ali->ccoord[pos];
436       break;
437     }
438   ali->cto = 0;
439   for (pos = 0; pos < ali->len; pos++)
440     if (ali->ccoord[pos] != 0) ali->cto = ali->ccoord[pos];
441 
442   esl_stack_Destroy(pda);
443   return ali;
444 
445  ERROR:
446   cm_Fail("Memory allocation error.\n");
447   return NULL; /* not reached */
448 }
449 
450 /* Function: PrintFancyAli()
451  * Date:     SRE, Thu Jun 13 02:23:18 2002 [Aula Magna, Stockholm]
452  *
453  * Purpose:  Write a CM/sequence alignment to a stream, from a
454  *           Fancyali_t structure. Line length currently hardcoded
455  *           but this could be changed. Modeled on HMMER's
456  *           eponymous function.
457  *
458  *            Put at least <min_aliwidth> alignment characters per
459  *            line; try to make lines no longer than <linewidth>
460  *            characters, including name, coords, and spacing.  The
461  *            width of lines may exceed <linewidth>, if that's what it
462  *            takes to put a name, coords, and <min_aliwidth>
463  *            characters of alignment on a line.
464  *
465  *            As a special case, if <linewidth> is negative or 0, then
466  *            alignments are formatted in a single block of unlimited
467  *            line length.
468  *
469  *
470  * Args:     fp              - where to print it (stdout or open FILE)
471  *           ali             - alignment structure to print.
472  *           offset          - number of residues to add to target seq index,
473  *                             to ease MPI search, all target hits start at posn 1
474  *           in_revcomp      - TRUE if hit we're printing an alignment for a
475  *                             cmsearch hit on reverse complement strand.
476  *           do_top          - TRUE to turn on optional 'top-line' annotation.
477  *                             This was set in CreateFancyAli() as marking either:
478  *                             negative scoring (non-compensatories) and half-bps:
479  *                             negative scoring *non-canonical* bps and half bps
480  *                             (in this 2nd case negative scoring canonicals are unmarked)
481  *           min_aliwidth    - min length for alignment block (see Purpose above)
482  *           linewidth       - preferred line length (see Purpose above)
483  *           show_accessions - TRUE to print seq/query accessions (if avail.), not names
484  * Returns:  (void)
485  */
486 void
PrintFancyAli(FILE * fp,Fancyali_t * ali,int64_t offset,int in_revcomp,int do_top,int linewidth)487 PrintFancyAli(FILE *fp, Fancyali_t *ali, int64_t offset, int in_revcomp, int do_top, int linewidth)
488 {
489   int   status;
490   char *buf;
491   int   pos;
492   int   ci,  cj;		/* positions in CM consensus 1..clen */
493   int   sqi, sqj;		/* positions in target seq 1..L      */
494   int   i;
495   int   i2print, j2print; /* i,j indices we'll print, used to deal with case of reverse complement */
496   int   have_pcodes;      /* TRUE if posterior codes are valid */
497 
498   printf("in PrintFancyAli sqfrom..sqto %d..%d in_revcomp: %d offset: %" PRId64 "\n", ali->sqfrom, ali->sqto, in_revcomp, offset);
499 
500   have_pcodes = (ali->pcode != NULL) ? TRUE : FALSE;
501 
502   ESL_ALLOC(buf, sizeof(char) * (linewidth + 1));
503   buf[linewidth] = '\0';
504   for (pos = 0; pos < ali->len; pos += linewidth)
505     {
506       /* Laboriously determine our coord bounds on dsq
507        * and consensus line for this alignment section.
508        */
509       sqi = 0;
510       for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
511 	if (ali->scoord[i] != 0) {
512 	  sqi = ali->scoord[i];
513 	  break;
514 	}
515       }
516       sqj = 0;
517       for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
518 	if (ali->scoord[i] != 0) sqj = ali->scoord[i];
519       }
520       ci = 0;
521       for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
522 	if (ali->ccoord[i] != 0) {
523 	  ci = ali->ccoord[i];
524 	  break;
525 	}
526       }
527       cj = 0;
528       for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
529 	if (ali->ccoord[i] != 0) cj = ali->ccoord[i];
530       }
531 
532       /* Formats and print the alignment section.
533        */
534       if (ali->rf != NULL) {
535 	strncpy(buf, ali->rf+pos, linewidth);
536 	fprintf(fp, "  %8s %s\n", " ", buf);
537       }
538       if (do_top && ali->top != NULL) {
539 	strncpy(buf, ali->top+pos, linewidth);
540 	fprintf(fp, "  %8s %s\n", " ", buf);
541       }
542       if (ali->cstr != NULL) {
543 	strncpy(buf, ali->cstr+pos, linewidth);
544 	fprintf(fp, "  %8s %s\n", " ", buf);
545       }
546       if (ali->cseq != NULL) {
547 	strncpy(buf, ali->cseq+pos, linewidth);
548 	if (ci && cj)
549 	  fprintf(fp, "  %8d %s %-8d\n", ci, buf, cj);
550 	else
551 	  fprintf(fp, "  %8s %s %-8s\n", "-", buf, "-");
552       }
553       if (ali->mid != NULL) {
554 	strncpy(buf, ali->mid+pos,  linewidth);
555 	fprintf(fp, "  %8s %s\n", " ", buf);
556       }
557       if (ali->aseq != NULL) {
558 	strncpy(buf, ali->aseq+pos, linewidth);
559 	if (sqj && sqi) {
560 	  if(in_revcomp) {
561 	    i2print = offset - (sqi-1)    + 1;
562 	    j2print = i2print - (sqj-sqi);
563 	  }
564 	  else {
565 	    i2print = sqi + offset;
566 	    j2print = sqj + offset;
567 	  }
568 	  fprintf(fp, "  %8d %s %-8d\n", i2print, buf, j2print);
569 	}
570 	else {
571 	  fprintf(fp, "  %8s %s %-8s\n", "-", buf, "-");
572 	}
573       }
574       if (have_pcodes && ali->pcode != NULL) {
575 	strncpy(buf, ali->pcode+pos, linewidth);
576 	fprintf(fp, "  %8s %s\n", "PP", buf);
577       }
578       fprintf(fp, "\n");
579     }
580   free(buf);
581   fflush(fp);
582   return;
583 
584  ERROR:
585   cm_Fail("Memory allocation error.\n");
586 }
587 
588 
589 /* Function:  FreeFancyAli()
590  * Incept:    SRE, Fri May 24 15:37:30 2002 [St. Louis]
591  */
592 void
FreeFancyAli(Fancyali_t * ali)593 FreeFancyAli(Fancyali_t *ali)
594 {
595   if (ali->rf != NULL) free(ali->rf);
596   if (ali->cstr   != NULL) free(ali->cstr);
597   if (ali->cseq   != NULL) free(ali->cseq);
598   if (ali->mid    != NULL) free(ali->mid);
599   if (ali->top    != NULL) free(ali->top);
600   if (ali->aseq   != NULL) free(ali->aseq);
601   if (ali->pcode  != NULL) free(ali->pcode);
602   if (ali->ccoord != NULL) free(ali->ccoord);
603   if (ali->scoord != NULL) free(ali->scoord);
604   free(ali);
605 }
606 
607 
608 /* Function:  CreateCMConsensus()
609  * Incept:    SRE, Thu May 23 10:39:39 2002 [St. Louis]
610  *
611  * Purpose:   Create displayable strings for consensus sequence
612  *            and consensus structure; also create map of
613  *            nodes -> right and left consensus positions.
614  *
615  *            Consensus sequence shows maximum scoring residue(s)
616  *            for each emitting node. If score < pthresh (for pairs)
617  *            or < sthresh (for singlets), the residue is shown
618  *            in lower case. (That is, "strong" consensus residues
619  *            are in upper case.)
620  *
621  *            Currently, pthresh is hard-coded as 3.0 and sthresh as
622  *            1.0 (previously v1.0.2 and earlier had these passed in,
623  *            but they were always set as 3.0/1.0 so I modified the
624  *            function prototype - feel free to change it back if you
625  *            want different values, or want to let caller (user) set
626  *            them)).
627  *
628  *            Consensus structure annotates
629  *            base pairs according to "multifurcation order" (how
630  *            many multifurcation loops are beneath this pair).
631  *               terminal stems:  <>
632  *               order 1:         ()
633  *               order 2:         []
634  *               order >2:        {}
635  *            Singlets are annotated : if external, _ if hairpin,
636  *            - if bulge or interior loop, and , for multifurcation loop.
637  *
638  *            Example:
639  *                ::(((,,<<<__>>>,<<<__>>->,,)))::
640  *                AAGGGAACCCTTGGGTGGGTTCCACAACCCAA
641  *
642  * Args:      cm  - the model
643  *            abc - alphabet to create con->cseq with (often cm->abc)
644  *
645  * Returns:   Newly created CMConsensus_t.
646  *            Caller frees w/ FreeCMConsensus().
647  *            Returns NULL on any error.
648  *
649  * Xref:      STL6 p.58.
650  */
651 CMConsensus_t *
CreateCMConsensus(CM_t * cm,const ESL_ALPHABET * abc)652 CreateCMConsensus(CM_t *cm, const ESL_ALPHABET *abc)
653 {
654   int       status;
655   CMConsensus_t *con = NULL;    /* growing consensus info */
656   char     *cseq = NULL;        /* growing consensus sequence display string   */
657   char     *cstr = NULL;        /* growing consensus structure display string  */
658   int      *ct = NULL;	        /* growing ct Zuker pairing partnet string     */
659   int      *lpos = NULL;        /* maps node->left consensus position, [0..nodes-1] */
660   int      *rpos = NULL;        /* maps node->right consensus position, [0..nodes-1] */
661   int       cpos;		/* current position in cseq, cstr              */
662   int       nalloc;		/* current allocated length of cseq, cstr      */
663   ESL_STACK *pda;               /* pushdown automaton used to traverse model   */
664   int      *multiorder;         /* "height" of each node (multifurcation order), [0..nodes-1] */
665   int      *inface;             /* face count for each node, inside */
666   int      *outface;            /* face count for each node, outside */
667   int       v, nd;
668   int       type;
669   char      lchar, rchar;
670   char      lstruc, rstruc;
671   int       x;
672   int       pairpartner;	/* coord of left pairing partner of a right base */
673   void     *tmp;                /* for ESL_RALLOC */
674   /* bit score thresholds for base pairs (pthresh) and singlets
675    * (sthresh) to be lowercased. These are hard-coded. If you want
676    * to change them, consider changing the function prototype so they're
677    * chosen by caller and passed in.
678    */
679   float     pthresh = 3.0;
680   float     sthresh = 1.0;
681 
682   /* Contract check. CM must have log odds scores. Alphabet must make sense. */
683   if(! (cm->flags & CMH_BITS)) return NULL;
684   if(cm->abc->type == eslRNA) {
685     if(abc->type != eslRNA && abc->type != eslDNA) return NULL;
686   }
687   else if(cm->abc->K != abc->K) return NULL;
688 
689   ESL_ALLOC(lpos, sizeof(int) * cm->nodes);
690   ESL_ALLOC(rpos, sizeof(int) * cm->nodes);
691   ESL_ALLOC(cseq, sizeof(char) * 100);
692   ESL_ALLOC(cstr, sizeof(char) * 100);
693   ESL_ALLOC(ct,   sizeof(int)  * 100);
694   nalloc  = 100;
695   cpos    = 0;
696 
697   for (nd = 0; nd < cm->nodes; nd++)
698     lpos[nd] = rpos[nd] = -1;
699 
700   multiorder = createMultifurcationOrderChart(cm);
701   createFaceCharts(cm, &inface, &outface);
702 
703   if((pda = esl_stack_ICreate()) == NULL) goto ERROR;
704   if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
705   if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
706   while (esl_stack_IPop(pda, &type) != eslEOD) {
707     if (type == PDA_RESIDUE)
708       {
709 	esl_stack_IPop(pda, &x); rchar  = (char) x;
710 	esl_stack_IPop(pda, &x); rstruc = (char) x;
711 	esl_stack_IPop(pda, &pairpartner);
712 	esl_stack_IPop(pda, &nd);
713 	rpos[nd]   = cpos;
714 	cseq[cpos] = rchar;
715 	cstr[cpos] = rstruc;
716 	ct[cpos]   = pairpartner;
717 	if (pairpartner != -1) ct[pairpartner] = cpos;
718 	cpos++;
719       }
720     else if (type == PDA_MARKER)
721       {
722 	esl_stack_IPop(pda, &nd);
723 	rpos[nd]   = cpos-1;
724       }
725     else if (type == PDA_STATE)
726       {
727 	esl_stack_IPop(pda, &v);
728 	nd    = cm->ndidx[v];
729 	lchar = rchar = lstruc = rstruc = 0;
730 
731 	/* Determine what we emit:
732 	 * MATP, MATL, MATR consensus states only.
733 	 */
734 	if (cm->stid[v] == MATP_MP)
735 	  {
736 	    x = esl_vec_FArgMax(cm->esc[v], abc->K*abc->K);
737 	    lchar = abc->sym[x / abc->K];
738 	    rchar = abc->sym[x % abc->K];
739 	    if (cm->esc[v][x] < pthresh) {
740 	      lchar = tolower((int) lchar);
741 	      rchar = tolower((int) rchar);
742 	    }
743 	    switch (multiorder[nd]) {
744 	    case 0:  lstruc = '<'; rstruc = '>'; break;
745 	    case 1:  lstruc = '('; rstruc = ')'; break;
746 	    case 2:  lstruc = '['; rstruc = ']'; break;
747 	    default: lstruc = '{'; rstruc = '}'; break;
748 	    }
749 	} else if (cm->stid[v] == MATL_ML) {
750 	  x = esl_vec_FArgMax(cm->esc[v], cm->abc->K);
751 	  lchar = abc->sym[x];
752 	  if (cm->esc[v][x] < sthresh) lchar = tolower((int) lchar);
753 	  if      (outface[nd] == 0)                    lstruc = ':'; /* external ss */
754 	  else if (inface[nd] == 0 && outface[nd] == 1) lstruc = '_'; /* hairpin loop */
755 	  else if (inface[nd] == 1 && outface[nd] == 1) lstruc = '-'; /* bulge/interior */
756 	  else                                          lstruc = ','; /* multiloop */
757 	  rstruc = ' ';
758 	} else if (cm->stid[v] == MATR_MR) {
759 	  x = esl_vec_FArgMax(cm->esc[v], cm->abc->K);
760 	  rchar = abc->sym[x];
761 	  if (cm->esc[v][x] < sthresh) rchar = tolower((int) rchar);
762 	  if      (outface[nd] == 0)                    rstruc = ':'; /* external ss */
763 	  else if (inface[nd] == 0 && outface[nd] == 1) rstruc = '?'; /* doesn't happen */
764 	  else if (inface[nd] == 1 && outface[nd] == 1) rstruc = '-'; /* bulge/interior */
765 	  else                                          rstruc = ','; /* multiloop */
766 	  lstruc = ' ';
767 	}
768 
769 	/* Emit. A left base, we can do now;
770 	 * a right base, we defer onto PDA.
771 	 */
772 	lpos[nd]   = cpos;	/* we always set lpos, rpos even for nonemitters */
773 	if (lchar) {
774 	  cseq[cpos] = lchar;
775 	  cstr[cpos] = lstruc;
776 	  ct[cpos]   = -1;	/* will be overwritten, if needed, when right guy is processed */
777 	  cpos++;
778 	}
779 	if (rchar) {
780 	  if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
781 	  if (lchar) { if ((status = esl_stack_IPush(pda, cpos-1)) != eslOK) goto ERROR; }
782 	  else       { if ((status = esl_stack_IPush(pda, -1)) != eslOK) goto ERROR; }
783 	  if ((status = esl_stack_IPush(pda, rstruc)) != eslOK) goto ERROR;
784 	  if ((status = esl_stack_IPush(pda, rchar)) != eslOK) goto ERROR;
785 	  if ((status = esl_stack_IPush(pda, PDA_RESIDUE)) != eslOK) goto ERROR;
786 	} else {
787 	  if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
788 	  if ((status = esl_stack_IPush(pda, PDA_MARKER)) != eslOK) goto ERROR;
789 	}
790 
791 	/* Transit - to consensus states only.
792 	 * The obfuscated idiom finds the index of the next consensus
793 	 * state without making assumptions about numbering or connectivity.
794 	 */
795 	if (cm->sttype[v] == B_st) {
796 	  if ((status = esl_stack_IPush(pda, cm->cnum[v])) != eslOK) goto ERROR;     /* right S  */
797 	  if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
798 	  if ((status = esl_stack_IPush(pda, cm->cfirst[v])) != eslOK) goto ERROR;   /* left S */
799 	  if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
800 	} else if (cm->sttype[v] != E_st) {
801 	  v = cm->nodemap[cm->ndidx[cm->cfirst[v] + cm->cnum[v] - 1]];
802 	  if ((status = esl_stack_IPush(pda, v)) != eslOK) goto ERROR;
803 	  if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
804 	}
805       } /*end PDA_STATE block*/
806 
807     if (cpos == nalloc) {
808       nalloc += 100;
809       ESL_RALLOC(cseq, tmp, sizeof(char) * nalloc);
810       ESL_RALLOC(cstr, tmp, sizeof(char) * nalloc);
811       ESL_RALLOC(ct,   tmp, sizeof(int)  * nalloc);
812     }
813   }/* PDA now empty... done generating cseq, cstr, and node->consensus residue map */
814   cseq[cpos] = '\0';
815   cstr[cpos] = '\0';
816 
817   esl_stack_Destroy(pda);
818   free(multiorder);
819   free(inface);
820   free(outface);
821 
822   ESL_ALLOC(con, sizeof(CMConsensus_t));
823   con->cseq = cseq;
824   con->cstr = cstr;
825   con->ct   = ct;
826   con->lpos = lpos;
827   con->rpos = rpos;
828   con->clen = cpos;
829   return con;
830 
831  ERROR:
832   if(cseq != NULL) free(cseq);
833   if(cstr != NULL) free(cstr);
834   if(ct   != NULL) free(ct);
835   if(lpos != NULL) free(lpos);
836   if(rpos != NULL) free(rpos);
837   if(con  != NULL) free(con);
838   return NULL;
839 }
840 
841 void
FreeCMConsensus(CMConsensus_t * con)842 FreeCMConsensus(CMConsensus_t *con)
843 {
844   if (con->cseq != NULL) free(con->cseq);
845   if (con->cstr != NULL) free(con->cstr);
846   if (con->ct   != NULL) free(con->ct);
847   if (con->lpos != NULL) free(con->lpos);
848   if (con->rpos != NULL) free(con->rpos);
849   free(con);
850 }
851 
852 /* Function:  createMultifurcationOrderChart()
853  * Incept:    SRE, Thu May 23 09:48:33 2002 [St. Louis]
854  *
855  * Purpose:   Calculates the degree of multifurcation beneath
856  *            the master subtree rooted at every node n.
857  *            Returns [0..nodes-1] array of these values.
858  *
859  *            Terminal stems have value 0. All nodes n starting with
860  *            the BEG node for a terminal stem have height[n] = 0.
861  *
862  *            A stem "above" a multifurcation into all terminal stems
863  *            has value 1; all nodes n starting with BEG and ending
864  *            with BIF have height[n] = 1.
865  *
866  *            And so on, for "higher order" (deeper) multifurcations.
867  *
868  *            Used for figuring out what characters we'll display a
869  *            consensus pair with.
870  *
871  *            THIS FUNCTION IS BUGGY (Sat Jun  1 12:24:23 2002)
872  *
873  * Args:      cm - the model.
874  *
875  * Returns:   [0..cm->nodes-1] array of multifurcation orders, for each node.
876  *            This array is allocated here; caller free's w/ free().
877  *
878  * xref:     STL6 p.58.
879  */
880 static int *
createMultifurcationOrderChart(CM_t * cm)881 createMultifurcationOrderChart(CM_t *cm)
882 {
883   int status;
884   int  v, nd, left, right;
885   int *height;
886   int *seg_has_pairs;
887 
888   ESL_ALLOC(height,        sizeof(int) * cm->nodes);
889   ESL_ALLOC(seg_has_pairs, sizeof(int) * cm->nodes);
890   for (nd = cm->nodes-1; nd >= 0; nd--)
891     {
892       v = cm->nodemap[nd];
893 
894       if       (cm->stid[v] == MATP_MP) seg_has_pairs[nd] = TRUE;
895       else if  (cm->stid[v] == END_E)   seg_has_pairs[nd] = FALSE;
896       else if  (cm->stid[v] == BIF_B)   seg_has_pairs[nd] = FALSE;
897       else                              seg_has_pairs[nd] = seg_has_pairs[nd+1];
898 
899       if (cm->stid[v] == END_E)
900 	height[nd]        = 0;
901       else if (cm->stid[v] == BIF_B)
902 	{
903 	  left  = cm->ndidx[cm->cfirst[v]];
904 	  right = cm->ndidx[cm->cnum[v]];
905 	  height[nd] = ESL_MAX(height[left] + seg_has_pairs[left],
906 			       height[right] + seg_has_pairs[right]);
907 	}
908       else
909 	height[nd] = height[nd+1];
910     }
911   free(seg_has_pairs);
912   return height;
913 
914  ERROR:
915   cm_Fail("Memory allocation error.\n");
916   return 0; /* never reached */
917 }
918 
919 
920 /* Function:  createFaceCharts()
921  * Incept:    SRE, Thu May 23 12:40:04 2002 [St. Louis]
922  *
923  * Purpose:   Calculate "inface" and "outface" for each node
924  *            in the consensus (master) structure of the CM.
925  *            These can be used to label nodes:
926  *                                inface       outface
927  *                             ------------   ----------
928  *             external ss         any           0
929  *             hairpin loop         0            1
930  *             bulge/interior       1            1
931  *             multifurc           >1            1
932  *             multifurc            1           >1
933  *             doesn't happen       0           >1
934  *
935  *             hairpin closing bp   0            1
936  *             extern closing bp    1            0
937  *             stem bp              1            1
938  *             multifurc close bp  >1            1
939  *             multifurc close bp   1           >1
940  *             doesn't happen       0           >1
941  *
942  * Args:       cm          - the model
943  *             ret_inface  - RETURN: inface[0..nodes-1]
944  *             ret_outface - RETURN: outface[0..nodes-1]
945  *
946  * Returns:    inface, outface;
947  *             they're alloc'ed here. Caller free's with free()
948  *
949  * Xref:       STL6 p.58
950  */
951 static void
createFaceCharts(CM_t * cm,int ** ret_inface,int ** ret_outface)952 createFaceCharts(CM_t *cm, int **ret_inface, int **ret_outface)
953 {
954   int  status;
955   int *inface;
956   int *outface;
957   int  nd, left, right, parent;
958   int  v,y;
959 
960   ESL_ALLOC(inface,  sizeof(int) * cm->nodes);
961   ESL_ALLOC(outface, sizeof(int) * cm->nodes);
962 
963   /* inface - the number of faces below us in descendant
964    *          subtrees. if 0, we're either external, or
965    *          a closing basepair, or we're in a hairpin loop.
966    *          inface is exclusive of current pair - so we
967    *          can easily detect closing base pairs.
968    */
969   for (nd = cm->nodes-1; nd >= 0; nd--)
970     {
971       v = cm->nodemap[nd];
972       if      (cm->ndtype[nd] == END_nd) inface[nd] = 0;
973       else if (cm->ndtype[nd] == BIF_nd) {
974 	left  = cm->ndidx[cm->cfirst[v]];
975 	right = cm->ndidx[cm->cnum[v]];
976 	inface[nd] = inface[left] + inface[right];
977       } else {
978 	if (cm->ndtype[nd+1] == MATP_nd) inface[nd] = 1;
979 	else                             inface[nd] = inface[nd+1];
980       }
981     }
982 
983   /* outface - the number of faces above us in the tree
984    *           excluding our subtree. if 0, we're external.
985    *           Like inface, outface is exclusive of current
986    *           pair.
987    */
988   for (nd = 0; nd < cm->nodes; nd++)
989     {
990       v = cm->nodemap[nd];
991       if      (cm->ndtype[nd] == ROOT_nd) outface[nd] = 0;
992       else if (cm->ndtype[nd] == BEGL_nd)
993 	{
994 	  parent = cm->ndidx[cm->plast[v]];
995 	  y      = cm->nodemap[parent];
996 	  right  = cm->ndidx[cm->cnum[y]];
997 	  outface[nd] = outface[parent] + inface[right];
998 	}
999       else if (cm->ndtype[nd] == BEGR_nd)
1000 	{
1001 	  parent = cm->ndidx[cm->plast[v]];
1002 	  left   = cm->ndidx[cm->cfirst[y]];
1003 	  outface[nd] = outface[parent] + inface[left];
1004 	}
1005       else
1006 	{
1007 	  parent = nd-1;
1008 	  if (cm->ndtype[parent] == MATP_nd) outface[nd] = 1;
1009 	  else                               outface[nd] = outface[parent];
1010 	}
1011     }
1012 
1013   *ret_inface  = inface;
1014   *ret_outface = outface;
1015   return;
1016 
1017  ERROR:
1018   cm_Fail("Memory allocation error.\n");
1019 }
1020 
1021 /* Function: IsCompensatory()
1022  * Date:     SRE, Sun Jun  2 10:16:59 2002 [Madison]
1023  *
1024  * Purpose:  Returns TRUE if log[pij/(pi*pj)] is >= 0,
1025  *           where pij is the probability of a base pair,
1026  *           pi and pj are the marginal probabilities
1027  *           for the symbols at i and j.
1028  *
1029  *           Currently returns FALSE if symi or symj
1030  *           are ambiguous IUPAC nucs. Could do a more
1031  *           sophisticated marginalization - prob not
1032  *           worth it right now.
1033  *
1034  * Args:     pij  - joint emission probability vector [0..15]
1035  *                  indexed symi*4 + symj.
1036  *           symi - symbol index at i [0..3], equiv to [a..u]
1037  *           symj - symbol index at j [0..3], equiv to [a..u]
1038  *
1039  * Returns:  TRUE or FALSE.
1040  */
1041 int
IsCompensatory(const ESL_ALPHABET * abc,float * pij,int symi,int symj)1042 IsCompensatory(const ESL_ALPHABET *abc, float *pij, int symi, int symj)
1043 {
1044   int   x;
1045   float pi, pj;
1046 
1047   if (symi >= abc->K || symj >= abc->K)
1048     return FALSE;
1049 
1050   pi = pj = 0.;
1051   for (x = 0; x < abc->K; x++)
1052     {
1053       pi += pij[symi*abc->K + x];
1054       pj += pij[x*abc->K    + symi];
1055     }
1056   if (log(pij[symi*abc->K+symj]) - log(pi) - log(pj) >= 0)
1057     return TRUE;
1058   else
1059     return FALSE;
1060 }
1061 
1062 /* Implementation of CMEmitMap_t structure:
1063  * map of a CM's nodes onto consensus sequence positions.
1064  * Structure is declared in infernal.h.
1065  *
1066  * Used for constructing multiple alignments.
1067  *
1068  *   clen              : consensus length.
1069  *       clen is 2* n(MATP) + n(MATL) + n(MATR).
1070  *       The consensus sequence is indexed 1..clen.
1071  *       0 and clen+1 are also used, as boundary conditions.
1072  *
1073  *   lpos[0..nodes-1]  : has values 0 to clen+1.
1074  *       Any left match emission from node nd is placed in lpos[nd].
1075  *       Any left insert emission from node nd *follows* lpos[nd].
1076  *
1077  *   rpos[0..nodes-1]  : has values 0..clen+1
1078  *       Any right match emission from node nd is placed in rpos[nd].
1079  *       Any right insert emission from node nd *precedes* rpos[nd]
1080  *
1081  *   epos[0..nodes-1]  : has values 0..clen+1.
1082  *       Any EL insertion from a nd->EL transition *follows* epos[nd].
1083  *
1084  * There are no dummy values; all lpos, rpos, epos are valid coords
1085  * 0..clen+1, as described above, even for END_nd's.
1086  *
1087  * For nonemitting nodes, rpos and lpos give a noninclusive bound:
1088  * for example, lpos[0] = 0 and rpos[0] = clen+1 by definition.
1089  *
1090  * Insertions occur between consensus positions. An inter-consensus-position
1091  * space may contain more than one type of insertion: an IL insertion and
1092  * an EL insertion, an IR insertion and an EL insertion; or (in
1093  * a single absurd case of a model with a consensus length of 0)
1094  * all three insertion types. Insertions are placed in order IL/EL/IR.
1095  *
1096  */
1097 
1098 /* Function: CreateEmitMap()
1099  * Date:     ? (SRE pre Infernal version 0.55).
1100  *
1101  * Purpose:  Create and fill an emit map for a given CM <cm>
1102  *           and return it.
1103  *           If we run out of memory or we have a problem creating
1104  *           the map, we return NULL.
1105  *
1106  */
1107 
1108 CMEmitMap_t *
CreateEmitMap(CM_t * cm)1109 CreateEmitMap(CM_t *cm)
1110 {
1111   int          status;
1112   CMEmitMap_t *map = NULL;
1113   ESL_STACK   *pda;
1114   int          cpos;
1115   int          nd;
1116   int          on_right;
1117 
1118   ESL_ALLOC(map,       sizeof(CMEmitMap_t));
1119   ESL_ALLOC(map->lpos, sizeof(int) * cm->nodes);
1120   ESL_ALLOC(map->rpos, sizeof(int) * cm->nodes);
1121   ESL_ALLOC(map->epos, sizeof(int) * cm->nodes);
1122 
1123   for (nd = 0; nd < cm->nodes; nd++)
1124     map->lpos[nd] = map->rpos[nd] = map->epos[nd] = -1;
1125 
1126   cpos = 0;
1127   nd   = 0;
1128   if ((pda  = esl_stack_ICreate()) == NULL) goto ERROR;
1129   if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;		/* 0 = left side. 1 would = right side. */
1130   if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
1131   while (esl_stack_IPop(pda, &nd) != eslEOD)
1132     {
1133       esl_stack_IPop(pda, &on_right);
1134 
1135       if (on_right)
1136 	{
1137 	  map->rpos[nd] = cpos+1;
1138 	  if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cpos++;
1139 	}
1140       else
1141 	{
1142 	  if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cpos++;
1143 	  map->lpos[nd] = cpos;
1144 
1145 	  if (cm->ndtype[nd] == BIF_nd)
1146 	    {
1147 				/* push the BIF back on for its right side  */
1148 	      if ((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
1149 	      if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
1150                             /* push node index for right child */
1151 	      if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
1152 	      if ((status = esl_stack_IPush(pda, cm->ndidx[cm->cnum[cm->nodemap[nd]]])) != eslOK) goto ERROR;
1153                             /* push node index for left child */
1154 	      if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
1155 	      if ((status = esl_stack_IPush(pda, cm->ndidx[cm->cfirst[cm->nodemap[nd]]])) != eslOK) goto ERROR;
1156 	    }
1157 	  else
1158 	    {
1159 				/* push the node back on for right side */
1160 	      if ((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
1161 	      if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
1162 				/* push child node on */
1163 	      if (cm->ndtype[nd] != END_nd) {
1164 		if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
1165 		if ((status = esl_stack_IPush(pda, nd+1)) != eslOK) goto ERROR;
1166 	      }
1167 	    }
1168 	}
1169     }
1170 
1171   /* Construct the epos map: if we do a v->EL transition,
1172    * the EL follows what consensus position (and its IL insertions,
1173    * if any)
1174    */
1175   for (nd = cm->nodes-1; nd >= 0; nd--) {
1176     if (cm->ndtype[nd] == END_nd)
1177       cpos = map->lpos[nd];
1178     else if (cm->ndtype[nd] == BIF_nd) /* propagate epos for *right* branch. */
1179       cpos = map->epos[cm->ndidx[cm->cnum[cm->nodemap[nd]]]];
1180 
1181     map->epos[nd] = cpos;
1182   }
1183 
1184   map->clen = map->rpos[0]-1;
1185   esl_stack_Destroy(pda);
1186 
1187   /* ensure we've filled in the map correctly */
1188   for (nd = 0; nd < cm->nodes; nd++) {
1189     if(map->lpos[nd] == -1) goto ERROR;
1190     if(map->rpos[nd] == -1) goto ERROR;
1191     if(map->epos[nd] == -1) goto ERROR;
1192   }
1193 
1194   return map;
1195 
1196  ERROR:
1197   if(map != NULL) FreeEmitMap(map);
1198   return NULL;
1199 }
1200 
1201 
1202 /* Function: SizeofEmitMap()
1203  * Date:     EPN, Wed Jan 18 10:15:46 2012
1204  *
1205  * Purpose:  Calculate and return the size of an emit map
1206  *           in Mb.
1207  */
1208 
1209 float
SizeofEmitMap(CM_t * cm,CMEmitMap_t * emap)1210 SizeofEmitMap(CM_t *cm, CMEmitMap_t *emap)
1211 {
1212   float bytes = 0.;
1213 
1214   if(emap == NULL) return 0.;
1215 
1216   bytes = sizeof(CMEmitMap_t);
1217   bytes += sizeof(int) * cm->nodes; /* lpos */
1218   bytes += sizeof(int) * cm->nodes; /* rpos */
1219   bytes += sizeof(int) * cm->nodes; /* epos */
1220 
1221   return bytes / 1000000.;
1222 }
1223 
1224 void
DumpEmitMap(FILE * fp,CMEmitMap_t * map,CM_t * cm)1225 DumpEmitMap(FILE *fp, CMEmitMap_t *map, CM_t *cm)
1226 {
1227   int nd;
1228 
1229   fprintf(fp, "CM to consensus emit map; consensus length = %d \n",
1230 	  map->clen);
1231   fprintf(fp, "%4s %7s %9s %4s %4s %4s\n",
1232 	  "Node", "State 1", "Node type", "lpos", "rpos", "epos");
1233   fprintf(fp, "%4s %7s %9s %4s %4s %4s\n",
1234 	  "----", "-------", "---------", "----", "----", "----");
1235   for (nd = 0; nd < cm->nodes; nd++)
1236     fprintf(fp, "%4d %7d %9s %4d %4d %4d\n",
1237 	    nd, cm->nodemap[nd], Nodetype(cm->ndtype[nd]),
1238 	    map->lpos[nd], map->rpos[nd], map->epos[nd]);
1239 }
1240 
1241 void
FreeEmitMap(CMEmitMap_t * map)1242 FreeEmitMap(CMEmitMap_t *map)
1243 {
1244   free(map->lpos);
1245   free(map->rpos);
1246   free(map->epos);
1247   free(map);
1248 }
1249 
1250 /* format_time_string()
1251  * Date:     SRE, Fri Nov 26 15:06:28 1999 [St. Louis]
1252  *
1253  * Purpose:  Given a number of seconds, format into
1254  *           hh:mm:ss.xx in a provided buffer.
1255  *
1256  * Args:     buf     - allocated space (128 is plenty!)
1257  *           sec     - number of seconds
1258  *           do_frac - TRUE (1) to include hundredths of a sec
1259  */
1260 void
FormatTimeString(char * buf,double sec,int do_frac)1261 FormatTimeString(char *buf, double sec, int do_frac)
1262 {
1263   int h, m, s, hs;
1264 
1265   h  = (int) (sec / 3600.);
1266   m  = (int) (sec / 60.) - h * 60;
1267   s  = (int) (sec) - h * 3600 - m * 60;
1268   if (do_frac) {
1269     hs = (int) (sec * 100.) - h * 360000 - m * 6000 - s * 100;
1270     sprintf(buf, "%02d:%02d:%02d.%02d", h,m,s,hs);
1271   } else {
1272     sprintf(buf, "%02d:%02d:%02d", h,m,s);
1273   }
1274 }
1275 
1276 
1277 /* Function: GetCommand
1278  * Date:     EPN, Fri Jan 25 13:56:10 2008
1279  *
1280  * Purpose:  Return the command used to call an infernal executable
1281  *           in <ret_command>.
1282  *
1283  * Returns:  eslOK on success; eslEMEM on allocation failure.
1284  */
1285 int
GetCommand(const ESL_GETOPTS * go,char * errbuf,char ** ret_command)1286 GetCommand(const ESL_GETOPTS *go, char *errbuf, char **ret_command)
1287 {
1288   int status;
1289   int i;
1290   char *command = NULL;
1291 
1292   for (i = 0; i < go->argc; i++) { /* copy all command line options and args */
1293     if((status = esl_strcat(&(command),  -1, go->argv[i], -1)) != eslOK) goto ERROR;
1294     if(i < (go->argc-1)) if((status = esl_strcat(&(command), -1, " ", 1)) != eslOK) goto ERROR;
1295   }
1296   *ret_command = command;
1297 
1298   return eslOK;
1299 
1300  ERROR:
1301   ESL_FAIL(status, errbuf, "GetCommand(): memory allocation error.");
1302   return status;
1303 }
1304 
1305 /* Function: GetDate
1306  * Date:     EPN, Fri Jan 25 13:59:22 2008
1307  *
1308  * Purpose:  Return a string that gives the current date.
1309  *
1310  * Returns:  eslOK on success; eslEMEM on allocation failure.
1311  */
1312 int
GetDate(char * errbuf,char ** ret_date)1313 GetDate(char *errbuf, char **ret_date)
1314 {
1315   int    status;
1316   time_t date = time(NULL);
1317   char  *sdate = NULL;
1318 
1319   if((status = esl_strdup(ctime(&date), -1, &sdate)) != eslOK) goto ERROR;
1320   esl_strchop(sdate, -1); /* doesn't return anything but eslOK */
1321 
1322   *ret_date = sdate;
1323   return eslOK;
1324 
1325  ERROR:
1326   ESL_FAIL(status, errbuf, "get_date() error status: %d, probably out of memory.", status);
1327   return status;
1328 }
1329 
1330 
1331 /* Function: bp_is_canonical
1332  * Date:     EPN, Wed Oct 14 06:17:27 2009
1333  *
1334  * Purpose:  Determine if two residues form a canonical base pair or not.
1335  *           Works for RNA or DNA (because for some reason cmsearch allows
1336  *           the user to format output as DNA (with --dna)).
1337  *
1338  * Returns:  TRUE if:
1339  *           lseq  rseq
1340  *           ----  ----
1341  *            A     U
1342  *            U     A
1343  *            C     G
1344  *            G     C
1345  *            G     U
1346  *            U     G
1347  *            A     T
1348  *            T     A
1349  *            G     T
1350  *            T     G
1351  *            Else, return FALSE.
1352  */
1353 int
bp_is_canonical(char lseq,char rseq)1354 bp_is_canonical(char lseq, char rseq)
1355 {
1356   switch (toupper(lseq)) {
1357   case 'A':
1358     switch (toupper(rseq)) {
1359     case 'U': return TRUE; break;
1360     case 'T': return TRUE; break;
1361     default: break;
1362     }
1363     break;
1364   case 'C':
1365     switch (toupper(rseq)) {
1366     case 'G': return TRUE; break;
1367     default: break;
1368     }
1369     break;
1370   case 'G':
1371     switch (toupper(rseq)) {
1372     case 'C': return TRUE; break;
1373     case 'U': return TRUE; break;
1374     case 'T': return TRUE; break;
1375     default: break;
1376     }
1377     break;
1378   case 'U':
1379     switch (toupper(rseq)) {
1380     case 'A': return TRUE; break;
1381     case 'G': return TRUE; break;
1382     default: break;
1383     }
1384     break;
1385   case 'T':
1386     switch (toupper(rseq)) {
1387     case 'A': return TRUE; break;
1388     case 'G': return TRUE; break;
1389     default: break;
1390     }
1391     break;
1392   default: break;
1393   }
1394 
1395   return FALSE;
1396 }
1397 
1398