1 /* display.c
2 * SRE, Thu May 23 08:18:05 2002 [St. Louis]
3 *
4 * Routines for formatting and displaying parse trees
5 * for output.
6 */
7
8 #include "esl_config.h"
9 #include "p7_config.h"
10 #include "config.h"
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <math.h>
17 #include <time.h>
18
19 #include "easel.h"
20 #include "esl_getopts.h"
21 #include "esl_stack.h"
22 #include "esl_vectorops.h"
23
24 #include "hmmer.h"
25
26 #include "infernal.h"
27
28 static int *createMultifurcationOrderChart(CM_t *cm);
29 static int bp_is_canonical(char lseq, char rseq);
30 static void createFaceCharts(CM_t *cm, int **ret_inface, int **ret_outface);
31
32 /* Function: CreateFancyAli()
33 * Incept: SRE, Thu May 23 13:46:09 2002 [St. Louis]
34 *
35 * Purpose: Given a trace (and the model and sequence it corresponds
36 * to), create a pairwise alignment for display; return in a Fancyali_t
37 * structure.
38 *
39 * Args: abc - alphabet to create alignment with (often cm->abc)
40 * tr - parsetree for cm aligned to dsq
41 * cm - model
42 * cons - consensus information for cm; see CreateCMConsensus()
43 * dsq - digitized sequence
44 * do_noncanonical - mark half-bps and negative scoring bps that are non-canonicals in top line with 'v'
45 * (by default, all negative scoring and half-bps are marked with 'x')
46 * pcode - posterior code
47 *
48 *
49 * Returns: fancy alignment structure.
50 * Caller frees, with FreeFancyAli(ali).
51 *
52 * Xref: STL6 p.58
53 */
54 Fancyali_t *
CreateFancyAli(const ESL_ALPHABET * abc,Parsetree_t * tr,CM_t * cm,CMConsensus_t * cons,ESL_DSQ * dsq,int do_noncanonical,char * pcode)55 CreateFancyAli(const ESL_ALPHABET *abc, Parsetree_t *tr, CM_t *cm, CMConsensus_t *cons, ESL_DSQ *dsq, int do_noncanonical, char *pcode)
56 {
57 int status;
58 Fancyali_t *ali; /* alignment structure we're building */
59 ESL_STACK *pda; /* pushdown automaton used to traverse trace */
60 int type; /* type of pda move: PDA_RESIDUE, PDA_STATE */
61 int v; /* state index */
62 int nd; /* node index */
63 int ti; /* position in trace */
64 int qinset, tinset; /* # consensus nt skipped by an EL, or in an EL */
65 int ninset; /* max # nt in an EL */
66 int pos; /* position in growing ali */
67 int lc, rc; /* indices for left, right pos in consensus */
68 int symi, symj;
69 int d;
70 char mode;
71 int lrf, rrf; /* chars in reference line; left, right */
72 int lstr, rstr; /* chars in structure line; left, right */
73 int lcons, rcons; /* chars in consensus line; left, right */
74 int lmid, rmid; /* chars in ali quality line; left, right */
75 int ltop, rtop; /* chars in optional noncompensatory line; left, right */
76 int lnegnc, rnegnc; /* chars in optional noncanonical line; left, right */
77 int lseq, rseq; /* chars in aligned target line; left, right */
78 int lpost, rpost; /* chars in aligned posteriors, left, right */
79 int do_left, do_right;/* flags to generate left, right */
80 int cpos_l, cpos_r; /* positions in consensus (1..clen) */
81 int spos_l, spos_r; /* positions in dsq (1..L) */
82 int have_pcodes;
83
84 /* Contract check. We allow the caller to specify the alphabet they want the
85 * resulting MSA in, but it has to make sense (see next few lines). */
86 if(cm->abc->type == eslRNA)
87 {
88 if(abc->type != eslRNA && abc->type != eslDNA)
89 cm_Fail("ERROR in CreateFancyAli(), cm alphabet is RNA, but requested output alphabet is neither DNA nor RNA.");
90 }
91 else if(cm->abc->K != abc->K)
92 cm_Fail("ERROR in CreateFancyAli(), cm alphabet size is %d, but requested output alphabet size is %d.", cm->abc->K, abc->K);
93
94 ESL_ALLOC(ali, sizeof(Fancyali_t));
95 have_pcodes = (pcode != NULL) ? TRUE : FALSE;
96
97 /* Calculate length of the alignment display.
98 * MATP node : +2
99 * MATL, MATR node : +1
100 * IL, IR state : +1
101 * EL: : 4 + width of length display : "*[nn]*"
102 * anything else : 0.
103 */
104 ali->len = 0;
105 for (ti = 0; ti < tr->n; ti++)
106 {
107 v = tr->state[ti];
108 if (v == cm->M) { /* special case: local exit into EL */
109 nd = cm->ndidx[tr->state[ti-1]]; /* calculate node that EL replaced */
110 qinset = cons->rpos[nd] - cons->lpos[nd] + 1;
111 tinset = tr->emitr[ti] - tr->emitl[ti] + 1;
112 ninset = ESL_MAX(qinset,tinset);
113 ali->len += 4;
114 do { ali->len++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
115 continue;
116 } else {
117 nd = cm->ndidx[v];
118 if (cm->sttype[v] == IL_st || cm->sttype[v] == IR_st)
119 ali->len += 1;
120 else if (cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd)
121 ali->len += 1;
122 else if (cm->ndtype[nd] == MATP_nd)
123 ali->len += 2;
124 }
125 /* Catch marginal-type local ends and treat them like EL for output */
126 if ((tr->nxtl[ti] == -1) && (cm->sttype[v] != E_st)) {
127 nd = cm->ndidx[tr->state[ti]];
128 qinset = cons->rpos[nd] - cons->lpos[nd] + 1;
129 tinset = tr->emitr[ti] - tr->emitl[ti] + 1;
130 if (tinset > 0) tinset--;
131 ninset = ESL_MAX(qinset,tinset);
132 ali->len += 4;
133 do { ali->len++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
134 }
135 }
136
137 /* Allocate and initialize.
138 * Blank the reference lines (memset calls) - only needed
139 * because of the way we deal w/ EL.
140 */
141 if (cm->rf != NULL )
142 ESL_ALLOC(ali->rf, sizeof(char) * (ali->len+1));
143 else
144 ali->rf = NULL;
145 ESL_ALLOC(ali->cstr, sizeof(char) * (ali->len+1));
146 ESL_ALLOC(ali->cseq, sizeof(char) * (ali->len+1));
147 ESL_ALLOC(ali->mid, sizeof(char) * (ali->len+1));
148 ESL_ALLOC(ali->top, sizeof(char) * (ali->len+1));
149 ESL_ALLOC(ali->aseq, sizeof(char) * (ali->len+1));
150 if(have_pcodes) {
151 ESL_ALLOC(ali->pcode, sizeof(char) * (ali->len+1));
152 }
153 else {
154 ali->pcode = NULL;
155 }
156 ESL_ALLOC(ali->scoord, sizeof(int) * ali->len);
157 ESL_ALLOC(ali->ccoord, sizeof(int) * ali->len);
158
159 if (cm->rf != NULL) memset(ali->rf, ' ', ali->len);
160 memset(ali->cstr, ' ', ali->len);
161 memset(ali->cseq, ' ', ali->len);
162 memset(ali->mid, ' ', ali->len);
163 memset(ali->top, ' ', ali->len);
164 memset(ali->aseq, ' ', ali->len);
165 if(have_pcodes) {
166 memset(ali->pcode, ' ', ali->len);
167 }
168 for (pos = 0; pos < ali->len; pos++)
169 ali->ccoord[pos] = ali->scoord[pos] = 0;
170
171 /* Fill in the lines: traverse the traceback.
172 */
173 pos = 0;
174 if((pda = esl_stack_ICreate()) == NULL) goto ERROR;
175 if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
176 if((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
177
178 while (esl_stack_IPop(pda, &type) != eslEOD)
179 {
180 if (type == PDA_RESIDUE) {
181 if (cm->rf != NULL) {
182 esl_stack_IPop(pda, &rrf);
183 ali->rf[pos] = rrf;
184 }
185 esl_stack_IPop(pda, &rstr); ali->cstr[pos] = rstr;
186 esl_stack_IPop(pda, &rcons); ali->cseq[pos] = rcons;
187 esl_stack_IPop(pda, &rmid); ali->mid[pos] = rmid;
188 esl_stack_IPop(pda, &rtop); ali->top[pos] = rtop;
189 esl_stack_IPop(pda, &rseq); ali->aseq[pos] = rseq;
190 if(have_pcodes) {
191 esl_stack_IPop(pda, &rpost); ali->pcode[pos] = rpost;
192 }
193 esl_stack_IPop(pda, &cpos_r); ali->ccoord[pos] = cpos_r;
194 esl_stack_IPop(pda, &spos_r); ali->scoord[pos] = spos_r;
195 pos++;
196 continue;
197 }
198
199 /* Else, we're PDA_STATE - e.g. dealing with a trace node.
200 */
201 esl_stack_IPop(pda, &ti);
202 v = tr->state[ti];
203
204 /* Deal with EL (local ends, state M) as a special case.
205 * We get away with only writing into aseq because we've
206 * memset() the display strings to blank.
207 */
208 if (v == cm->M) {
209 int numwidth; /* number of chars to leave for displaying width numbers */
210
211 nd = 1 + cm->ndidx[tr->state[ti-1]]; /* calculate node that EL replaced */
212 qinset = cons->rpos[nd] - cons->lpos[nd] + 1;
213 tinset = tr->emitr[ti] - tr->emitl[ti] + 1;
214 ninset = ESL_MAX(qinset,tinset);
215 numwidth = 0; do { numwidth++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
216 memset(ali->cstr+pos, '~', numwidth+4);
217 sprintf(ali->cseq+pos, "*[%*d]*", numwidth, qinset);
218 sprintf(ali->aseq+pos, "*[%*d]*", numwidth, tinset);
219 /* do nothing for posteriors here, they'll stay as they were init'ed, as ' ' */
220 pos += 4 + numwidth;
221 continue;
222 }
223
224 /* Fetch some info into tmp variables, for "clarity"
225 */
226 nd = cm->ndidx[v]; /* what CM node we're in */
227 lc = cons->lpos[nd]; /* where CM node aligns to in consensus */
228 rc = cons->rpos[nd];
229 symi = dsq[tr->emitl[ti]]; /* residue indices that node is aligned to */
230 symj = dsq[tr->emitr[ti]];
231 if(have_pcodes) { /* posterior codes are indexed 0..alen-1, off-by-one w.r.t dsq */
232 lpost = '.'; /* init to gap, if it corresponds to a residue, we'll reset it below */
233 rpost = '.'; /* init to gap, if it corresponds to a residue, we'll reset it below */
234 }
235 d = tr->emitr[ti] - tr->emitl[ti] + 1;
236 mode = tr->mode[ti];
237
238 /* Calculate four of the five lines: rf, str, cons, and seq.
239 */
240 do_left = do_right = FALSE;
241 if (cm->sttype[v] == IL_st) {
242 do_left = TRUE;
243 if (cm->rf != NULL) lrf = '.';
244 lstr = '.';
245 lcons = '.';
246 if (mode == TRMODE_J || mode == TRMODE_L) lseq = tolower((int) abc->sym[symi]);
247 else lseq = '~';
248 cpos_l = 0;
249 spos_l = tr->emitl[ti];
250 if(pcode != NULL) { lpost = pcode[tr->emitl[ti]-1]; } /* watch off-by-one w.r.t. dsq */
251 } else if (cm->sttype[v] == IR_st) {
252 do_right = TRUE;
253 if (cm->rf != NULL) rrf = '.';
254 rstr = '.';
255 rcons = '.';
256 if (mode == TRMODE_J || TRMODE_R) rseq = tolower((int) abc->sym[symj]);
257 else rseq = '~';
258 cpos_r = 0;
259 spos_r = tr->emitr[ti];
260 if(pcode != NULL) { rpost = pcode[tr->emitr[ti]-1]; } /* watch off-by-one w.r.t. dsq */
261 } else {
262 if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) {
263 do_left = TRUE;
264 if (cm->rf != NULL) lrf = cm->rf[lc];
265 lstr = cons->cstr[lc];
266 lcons = cons->cseq[lc];
267 cpos_l = lc+1;
268 if (cm->sttype[v] == MP_st || cm->sttype[v] == ML_st) {
269 if (mode == TRMODE_J) lseq = abc->sym[symi];
270 else if (mode == TRMODE_L && d>0 ) lseq = abc->sym[symi];
271 else lseq = '~';
272 spos_l = tr->emitl[ti];
273 if(pcode != NULL) { lpost = pcode[tr->emitl[ti]-1]; } /* watch off-by-one w.r.t. dsq */
274 } else {
275 if (mode == TRMODE_J || mode == TRMODE_L) lseq = '-';
276 else lseq = '~';
277 spos_l = 0;
278 /* lpost remains as it was init'ed as a gap '.' */
279 }
280 }
281 if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) {
282 do_right = TRUE;
283 if (cm->rf != NULL) rrf = cm->rf[rc];
284 rstr = cons->cstr[rc];
285 rcons = cons->cseq[rc];
286 cpos_r = rc+1;
287 if (cm->sttype[v] == MP_st || cm->sttype[v] == MR_st) {
288 if (mode == TRMODE_J) rseq = abc->sym[symj];
289 else if (TRMODE_R && d>0 ) rseq = abc->sym[symj];
290 else rseq = '~';
291 spos_r = tr->emitr[ti];
292 if(pcode != NULL) { rpost = pcode[tr->emitr[ti]-1]; } /* watch off-by-one w.r.t. dsq */
293 } else {
294 if (mode == TRMODE_J || TRMODE_R) rseq = '-';
295 else rseq = '~';
296 spos_r = 0;
297 /* rpost remains as it was init'ed as a gap '.' */
298 }
299 }
300 }
301
302 /* Use emission p and score to set lmid, rmid line for emitting states.
303 */
304 lmid = rmid = ' ';
305 ltop = rtop = ' ';
306 lnegnc = rnegnc = ' ';
307 if (cm->sttype[v] == MP_st) {
308 if (lseq == toupper(lcons) && rseq == toupper(rcons))
309 {
310 lmid = lseq;
311 rmid = rseq;
312 }
313 else if (mode != 3)
314 {
315 if (mode == TRMODE_L && lseq == toupper(lcons)) lmid = lseq;
316 if (TRMODE_R && rseq == toupper(rcons)) rmid = rseq;
317 }
318 else if (DegeneratePairScore(cm->abc, cm->esc[v], symi, symj) >= 0)
319 lmid = rmid = ':';
320
321 /* determine ltop, rtop for optional noncompensatory annotation, they are 'x' if lmid, rmid are ' ', and ' ' otherwise */
322 if (lmid == ' ' && rmid == ' ')
323 ltop = rtop = 'x';
324
325 /* determine lnegnc, rnegnc for optional negative scoring non-canonical annotation, they are 'v' if lseq and rseq are a negative scoring non-canonical (not a AU,UA,GC,CG,GU,UG) pair */
326 if ((mode == TRMODE_J) && (DegeneratePairScore(cm->abc, cm->esc[v], symi, symj) < 0) && (! bp_is_canonical(lseq, rseq))) {
327 lnegnc = rnegnc = 'v';
328 }
329 } else if (cm->sttype[v] == ML_st || cm->sttype[v] == IL_st) {
330 if (lseq == toupper(lcons))
331 lmid = lseq;
332 else if ( (mode != 3) && (mode != 2) )
333 ;
334 else if(esl_abc_FAvgScore(cm->abc, symi, cm->esc[v]) > 0)
335 lmid = '+';
336 } else if (cm->sttype[v] == MR_st || cm->sttype[v] == IR_st) {
337 if (rseq == toupper(rcons))
338 rmid = rseq;
339 else if ( (mode != 3) && (mode != 1) )
340 ;
341 else if(esl_abc_FAvgScore(cm->abc, symj, cm->esc[v]) > 0)
342 rmid = '+';
343 }
344 if(cm->stid[v] == MATP_ML || cm->stid[v] == MATP_MR) {
345 if(mode == TRMODE_J) {
346 ltop = rtop = 'x'; /* mark non-truncated half base-pairs (MATP_ML or MATP_MR) with 'x' */
347 lnegnc = rnegnc = 'v'; /* mark non-truncated half base-pairs (MATP_ML or MATP_MR) with 'v' */
348 }
349 }
350 /* If we're storing a residue leftwise - just do it.
351 * If rightwise - push it onto stack.
352 */
353 if (do_left) {
354 if (cm->rf != NULL) ali->rf[pos] = lrf;
355 ali->cstr[pos] = lstr;
356 ali->cseq[pos] = lcons;
357 ali->mid[pos] = lmid;
358 ali->top[pos] = (do_noncanonical) ? lnegnc : ltop;
359 ali->aseq[pos] = lseq;
360 if(have_pcodes) {
361 ali->pcode[pos] = lpost;
362 }
363 ali->ccoord[pos] = cpos_l;
364 ali->scoord[pos] = spos_l;
365 pos++;
366 }
367 if (do_right) {
368 if ((status = esl_stack_IPush(pda, spos_r)) != eslOK) goto ERROR;
369 if ((status = esl_stack_IPush(pda, cpos_r)) != eslOK) goto ERROR;
370 if(have_pcodes) {
371 if ((status = esl_stack_IPush(pda, (int) rpost)) != eslOK) goto ERROR;
372 }
373 if ((status = esl_stack_IPush(pda, (int) rseq)) != eslOK) goto ERROR;
374 if (do_noncanonical) { if ((status = esl_stack_IPush(pda, (int) rnegnc)) != eslOK) goto ERROR; }
375 else { if ((status = esl_stack_IPush(pda, (int) rtop)) != eslOK) goto ERROR; }
376 if ((status = esl_stack_IPush(pda, (int) rmid)) != eslOK) goto ERROR;
377 if ((status = esl_stack_IPush(pda, (int) rcons)) != eslOK) goto ERROR;
378 if ((status = esl_stack_IPush(pda, (int) rstr)) != eslOK) goto ERROR;
379 if (cm->rf != NULL) if ((status = esl_stack_IPush(pda, (int) rrf)) != eslOK) goto ERROR;
380 if ((status = esl_stack_IPush(pda, PDA_RESIDUE)) != eslOK) goto ERROR;
381 }
382
383 /* Push the child trace nodes onto the PDA;
384 * right first, so it pops last.
385 */
386 if (tr->nxtr[ti] != -1) {
387 if ((status = esl_stack_IPush(pda, tr->nxtr[ti])) != eslOK) goto ERROR;
388 if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
389 }
390 if (tr->nxtl[ti] != -1) {
391 if ((status = esl_stack_IPush(pda, tr->nxtl[ti])) != eslOK) goto ERROR;
392 if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
393 }
394 else if (cm->sttype[v] != E_st) {
395 /* Catch marginal-type local ends, treat like EL for output */
396 int numwidth; /* number of chars to leave for displaying width numbers */
397
398 nd = 1 + cm->ndidx[tr->state[ti]]; /* calculate node that EL replaced */
399 qinset = cons->rpos[nd] - cons->lpos[nd] + 1;
400 tinset = tr->emitr[ti] - tr->emitl[ti] + 1;
401 if (tinset > 0) tinset--;
402 ninset = ESL_MAX(qinset,tinset);
403 numwidth = 0; do { numwidth++; ninset/=10; } while (ninset); /* poor man's (int)log_10(ninset)+1 */
404 memset(ali->cstr+pos, '~', numwidth+4);
405 sprintf(ali->cseq+pos, "*[%*d]*", numwidth, qinset);
406 sprintf(ali->aseq+pos, "*[%*d]*", numwidth, tinset);
407 /* do nothing for posteriors here, they'll stay as they were init'ed, as ' ' */
408 pos += 4 + numwidth;
409 }
410 } /* end loop over the PDA; PDA now empty */
411
412 if (cm->rf != NULL) ali->rf[ali->len] = '\0';
413 ali->cstr[ali->len] = '\0';
414 ali->cseq[ali->len] = '\0';
415 ali->mid[ali->len] = '\0';
416 ali->top[ali->len] = '\0';
417 ali->aseq[ali->len] = '\0';
418 if(have_pcodes) {
419 ali->pcode[ali->len] = '\0';
420 }
421 /* Laboriously determine the maximum bounds.
422 */
423 ali->sqfrom = 0;
424 for (pos = 0; pos < ali->len; pos++)
425 if (ali->scoord[pos] != 0) {
426 ali->sqfrom = ali->scoord[pos];
427 break;
428 }
429 ali->sqto = 0;
430 for (pos = 0; pos < ali->len; pos++)
431 if (ali->scoord[pos] != 0) ali->sqto = ali->scoord[pos];
432 ali->cfrom = 0;
433 for (pos = 0; pos < ali->len; pos++)
434 if (ali->ccoord[pos] != 0) {
435 ali->cfrom = ali->ccoord[pos];
436 break;
437 }
438 ali->cto = 0;
439 for (pos = 0; pos < ali->len; pos++)
440 if (ali->ccoord[pos] != 0) ali->cto = ali->ccoord[pos];
441
442 esl_stack_Destroy(pda);
443 return ali;
444
445 ERROR:
446 cm_Fail("Memory allocation error.\n");
447 return NULL; /* not reached */
448 }
449
450 /* Function: PrintFancyAli()
451 * Date: SRE, Thu Jun 13 02:23:18 2002 [Aula Magna, Stockholm]
452 *
453 * Purpose: Write a CM/sequence alignment to a stream, from a
454 * Fancyali_t structure. Line length currently hardcoded
455 * but this could be changed. Modeled on HMMER's
456 * eponymous function.
457 *
458 * Put at least <min_aliwidth> alignment characters per
459 * line; try to make lines no longer than <linewidth>
460 * characters, including name, coords, and spacing. The
461 * width of lines may exceed <linewidth>, if that's what it
462 * takes to put a name, coords, and <min_aliwidth>
463 * characters of alignment on a line.
464 *
465 * As a special case, if <linewidth> is negative or 0, then
466 * alignments are formatted in a single block of unlimited
467 * line length.
468 *
469 *
470 * Args: fp - where to print it (stdout or open FILE)
471 * ali - alignment structure to print.
472 * offset - number of residues to add to target seq index,
473 * to ease MPI search, all target hits start at posn 1
474 * in_revcomp - TRUE if hit we're printing an alignment for a
475 * cmsearch hit on reverse complement strand.
476 * do_top - TRUE to turn on optional 'top-line' annotation.
477 * This was set in CreateFancyAli() as marking either:
478 * negative scoring (non-compensatories) and half-bps:
479 * negative scoring *non-canonical* bps and half bps
480 * (in this 2nd case negative scoring canonicals are unmarked)
481 * min_aliwidth - min length for alignment block (see Purpose above)
482 * linewidth - preferred line length (see Purpose above)
483 * show_accessions - TRUE to print seq/query accessions (if avail.), not names
484 * Returns: (void)
485 */
486 void
PrintFancyAli(FILE * fp,Fancyali_t * ali,int64_t offset,int in_revcomp,int do_top,int linewidth)487 PrintFancyAli(FILE *fp, Fancyali_t *ali, int64_t offset, int in_revcomp, int do_top, int linewidth)
488 {
489 int status;
490 char *buf;
491 int pos;
492 int ci, cj; /* positions in CM consensus 1..clen */
493 int sqi, sqj; /* positions in target seq 1..L */
494 int i;
495 int i2print, j2print; /* i,j indices we'll print, used to deal with case of reverse complement */
496 int have_pcodes; /* TRUE if posterior codes are valid */
497
498 printf("in PrintFancyAli sqfrom..sqto %d..%d in_revcomp: %d offset: %" PRId64 "\n", ali->sqfrom, ali->sqto, in_revcomp, offset);
499
500 have_pcodes = (ali->pcode != NULL) ? TRUE : FALSE;
501
502 ESL_ALLOC(buf, sizeof(char) * (linewidth + 1));
503 buf[linewidth] = '\0';
504 for (pos = 0; pos < ali->len; pos += linewidth)
505 {
506 /* Laboriously determine our coord bounds on dsq
507 * and consensus line for this alignment section.
508 */
509 sqi = 0;
510 for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
511 if (ali->scoord[i] != 0) {
512 sqi = ali->scoord[i];
513 break;
514 }
515 }
516 sqj = 0;
517 for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
518 if (ali->scoord[i] != 0) sqj = ali->scoord[i];
519 }
520 ci = 0;
521 for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
522 if (ali->ccoord[i] != 0) {
523 ci = ali->ccoord[i];
524 break;
525 }
526 }
527 cj = 0;
528 for (i = pos; ali->aseq[i] != '\0' && i < pos + linewidth; i++) {
529 if (ali->ccoord[i] != 0) cj = ali->ccoord[i];
530 }
531
532 /* Formats and print the alignment section.
533 */
534 if (ali->rf != NULL) {
535 strncpy(buf, ali->rf+pos, linewidth);
536 fprintf(fp, " %8s %s\n", " ", buf);
537 }
538 if (do_top && ali->top != NULL) {
539 strncpy(buf, ali->top+pos, linewidth);
540 fprintf(fp, " %8s %s\n", " ", buf);
541 }
542 if (ali->cstr != NULL) {
543 strncpy(buf, ali->cstr+pos, linewidth);
544 fprintf(fp, " %8s %s\n", " ", buf);
545 }
546 if (ali->cseq != NULL) {
547 strncpy(buf, ali->cseq+pos, linewidth);
548 if (ci && cj)
549 fprintf(fp, " %8d %s %-8d\n", ci, buf, cj);
550 else
551 fprintf(fp, " %8s %s %-8s\n", "-", buf, "-");
552 }
553 if (ali->mid != NULL) {
554 strncpy(buf, ali->mid+pos, linewidth);
555 fprintf(fp, " %8s %s\n", " ", buf);
556 }
557 if (ali->aseq != NULL) {
558 strncpy(buf, ali->aseq+pos, linewidth);
559 if (sqj && sqi) {
560 if(in_revcomp) {
561 i2print = offset - (sqi-1) + 1;
562 j2print = i2print - (sqj-sqi);
563 }
564 else {
565 i2print = sqi + offset;
566 j2print = sqj + offset;
567 }
568 fprintf(fp, " %8d %s %-8d\n", i2print, buf, j2print);
569 }
570 else {
571 fprintf(fp, " %8s %s %-8s\n", "-", buf, "-");
572 }
573 }
574 if (have_pcodes && ali->pcode != NULL) {
575 strncpy(buf, ali->pcode+pos, linewidth);
576 fprintf(fp, " %8s %s\n", "PP", buf);
577 }
578 fprintf(fp, "\n");
579 }
580 free(buf);
581 fflush(fp);
582 return;
583
584 ERROR:
585 cm_Fail("Memory allocation error.\n");
586 }
587
588
589 /* Function: FreeFancyAli()
590 * Incept: SRE, Fri May 24 15:37:30 2002 [St. Louis]
591 */
592 void
FreeFancyAli(Fancyali_t * ali)593 FreeFancyAli(Fancyali_t *ali)
594 {
595 if (ali->rf != NULL) free(ali->rf);
596 if (ali->cstr != NULL) free(ali->cstr);
597 if (ali->cseq != NULL) free(ali->cseq);
598 if (ali->mid != NULL) free(ali->mid);
599 if (ali->top != NULL) free(ali->top);
600 if (ali->aseq != NULL) free(ali->aseq);
601 if (ali->pcode != NULL) free(ali->pcode);
602 if (ali->ccoord != NULL) free(ali->ccoord);
603 if (ali->scoord != NULL) free(ali->scoord);
604 free(ali);
605 }
606
607
608 /* Function: CreateCMConsensus()
609 * Incept: SRE, Thu May 23 10:39:39 2002 [St. Louis]
610 *
611 * Purpose: Create displayable strings for consensus sequence
612 * and consensus structure; also create map of
613 * nodes -> right and left consensus positions.
614 *
615 * Consensus sequence shows maximum scoring residue(s)
616 * for each emitting node. If score < pthresh (for pairs)
617 * or < sthresh (for singlets), the residue is shown
618 * in lower case. (That is, "strong" consensus residues
619 * are in upper case.)
620 *
621 * Currently, pthresh is hard-coded as 3.0 and sthresh as
622 * 1.0 (previously v1.0.2 and earlier had these passed in,
623 * but they were always set as 3.0/1.0 so I modified the
624 * function prototype - feel free to change it back if you
625 * want different values, or want to let caller (user) set
626 * them)).
627 *
628 * Consensus structure annotates
629 * base pairs according to "multifurcation order" (how
630 * many multifurcation loops are beneath this pair).
631 * terminal stems: <>
632 * order 1: ()
633 * order 2: []
634 * order >2: {}
635 * Singlets are annotated : if external, _ if hairpin,
636 * - if bulge or interior loop, and , for multifurcation loop.
637 *
638 * Example:
639 * ::(((,,<<<__>>>,<<<__>>->,,)))::
640 * AAGGGAACCCTTGGGTGGGTTCCACAACCCAA
641 *
642 * Args: cm - the model
643 * abc - alphabet to create con->cseq with (often cm->abc)
644 *
645 * Returns: Newly created CMConsensus_t.
646 * Caller frees w/ FreeCMConsensus().
647 * Returns NULL on any error.
648 *
649 * Xref: STL6 p.58.
650 */
651 CMConsensus_t *
CreateCMConsensus(CM_t * cm,const ESL_ALPHABET * abc)652 CreateCMConsensus(CM_t *cm, const ESL_ALPHABET *abc)
653 {
654 int status;
655 CMConsensus_t *con = NULL; /* growing consensus info */
656 char *cseq = NULL; /* growing consensus sequence display string */
657 char *cstr = NULL; /* growing consensus structure display string */
658 int *ct = NULL; /* growing ct Zuker pairing partnet string */
659 int *lpos = NULL; /* maps node->left consensus position, [0..nodes-1] */
660 int *rpos = NULL; /* maps node->right consensus position, [0..nodes-1] */
661 int cpos; /* current position in cseq, cstr */
662 int nalloc; /* current allocated length of cseq, cstr */
663 ESL_STACK *pda; /* pushdown automaton used to traverse model */
664 int *multiorder; /* "height" of each node (multifurcation order), [0..nodes-1] */
665 int *inface; /* face count for each node, inside */
666 int *outface; /* face count for each node, outside */
667 int v, nd;
668 int type;
669 char lchar, rchar;
670 char lstruc, rstruc;
671 int x;
672 int pairpartner; /* coord of left pairing partner of a right base */
673 void *tmp; /* for ESL_RALLOC */
674 /* bit score thresholds for base pairs (pthresh) and singlets
675 * (sthresh) to be lowercased. These are hard-coded. If you want
676 * to change them, consider changing the function prototype so they're
677 * chosen by caller and passed in.
678 */
679 float pthresh = 3.0;
680 float sthresh = 1.0;
681
682 /* Contract check. CM must have log odds scores. Alphabet must make sense. */
683 if(! (cm->flags & CMH_BITS)) return NULL;
684 if(cm->abc->type == eslRNA) {
685 if(abc->type != eslRNA && abc->type != eslDNA) return NULL;
686 }
687 else if(cm->abc->K != abc->K) return NULL;
688
689 ESL_ALLOC(lpos, sizeof(int) * cm->nodes);
690 ESL_ALLOC(rpos, sizeof(int) * cm->nodes);
691 ESL_ALLOC(cseq, sizeof(char) * 100);
692 ESL_ALLOC(cstr, sizeof(char) * 100);
693 ESL_ALLOC(ct, sizeof(int) * 100);
694 nalloc = 100;
695 cpos = 0;
696
697 for (nd = 0; nd < cm->nodes; nd++)
698 lpos[nd] = rpos[nd] = -1;
699
700 multiorder = createMultifurcationOrderChart(cm);
701 createFaceCharts(cm, &inface, &outface);
702
703 if((pda = esl_stack_ICreate()) == NULL) goto ERROR;
704 if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
705 if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
706 while (esl_stack_IPop(pda, &type) != eslEOD) {
707 if (type == PDA_RESIDUE)
708 {
709 esl_stack_IPop(pda, &x); rchar = (char) x;
710 esl_stack_IPop(pda, &x); rstruc = (char) x;
711 esl_stack_IPop(pda, &pairpartner);
712 esl_stack_IPop(pda, &nd);
713 rpos[nd] = cpos;
714 cseq[cpos] = rchar;
715 cstr[cpos] = rstruc;
716 ct[cpos] = pairpartner;
717 if (pairpartner != -1) ct[pairpartner] = cpos;
718 cpos++;
719 }
720 else if (type == PDA_MARKER)
721 {
722 esl_stack_IPop(pda, &nd);
723 rpos[nd] = cpos-1;
724 }
725 else if (type == PDA_STATE)
726 {
727 esl_stack_IPop(pda, &v);
728 nd = cm->ndidx[v];
729 lchar = rchar = lstruc = rstruc = 0;
730
731 /* Determine what we emit:
732 * MATP, MATL, MATR consensus states only.
733 */
734 if (cm->stid[v] == MATP_MP)
735 {
736 x = esl_vec_FArgMax(cm->esc[v], abc->K*abc->K);
737 lchar = abc->sym[x / abc->K];
738 rchar = abc->sym[x % abc->K];
739 if (cm->esc[v][x] < pthresh) {
740 lchar = tolower((int) lchar);
741 rchar = tolower((int) rchar);
742 }
743 switch (multiorder[nd]) {
744 case 0: lstruc = '<'; rstruc = '>'; break;
745 case 1: lstruc = '('; rstruc = ')'; break;
746 case 2: lstruc = '['; rstruc = ']'; break;
747 default: lstruc = '{'; rstruc = '}'; break;
748 }
749 } else if (cm->stid[v] == MATL_ML) {
750 x = esl_vec_FArgMax(cm->esc[v], cm->abc->K);
751 lchar = abc->sym[x];
752 if (cm->esc[v][x] < sthresh) lchar = tolower((int) lchar);
753 if (outface[nd] == 0) lstruc = ':'; /* external ss */
754 else if (inface[nd] == 0 && outface[nd] == 1) lstruc = '_'; /* hairpin loop */
755 else if (inface[nd] == 1 && outface[nd] == 1) lstruc = '-'; /* bulge/interior */
756 else lstruc = ','; /* multiloop */
757 rstruc = ' ';
758 } else if (cm->stid[v] == MATR_MR) {
759 x = esl_vec_FArgMax(cm->esc[v], cm->abc->K);
760 rchar = abc->sym[x];
761 if (cm->esc[v][x] < sthresh) rchar = tolower((int) rchar);
762 if (outface[nd] == 0) rstruc = ':'; /* external ss */
763 else if (inface[nd] == 0 && outface[nd] == 1) rstruc = '?'; /* doesn't happen */
764 else if (inface[nd] == 1 && outface[nd] == 1) rstruc = '-'; /* bulge/interior */
765 else rstruc = ','; /* multiloop */
766 lstruc = ' ';
767 }
768
769 /* Emit. A left base, we can do now;
770 * a right base, we defer onto PDA.
771 */
772 lpos[nd] = cpos; /* we always set lpos, rpos even for nonemitters */
773 if (lchar) {
774 cseq[cpos] = lchar;
775 cstr[cpos] = lstruc;
776 ct[cpos] = -1; /* will be overwritten, if needed, when right guy is processed */
777 cpos++;
778 }
779 if (rchar) {
780 if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
781 if (lchar) { if ((status = esl_stack_IPush(pda, cpos-1)) != eslOK) goto ERROR; }
782 else { if ((status = esl_stack_IPush(pda, -1)) != eslOK) goto ERROR; }
783 if ((status = esl_stack_IPush(pda, rstruc)) != eslOK) goto ERROR;
784 if ((status = esl_stack_IPush(pda, rchar)) != eslOK) goto ERROR;
785 if ((status = esl_stack_IPush(pda, PDA_RESIDUE)) != eslOK) goto ERROR;
786 } else {
787 if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
788 if ((status = esl_stack_IPush(pda, PDA_MARKER)) != eslOK) goto ERROR;
789 }
790
791 /* Transit - to consensus states only.
792 * The obfuscated idiom finds the index of the next consensus
793 * state without making assumptions about numbering or connectivity.
794 */
795 if (cm->sttype[v] == B_st) {
796 if ((status = esl_stack_IPush(pda, cm->cnum[v])) != eslOK) goto ERROR; /* right S */
797 if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
798 if ((status = esl_stack_IPush(pda, cm->cfirst[v])) != eslOK) goto ERROR; /* left S */
799 if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
800 } else if (cm->sttype[v] != E_st) {
801 v = cm->nodemap[cm->ndidx[cm->cfirst[v] + cm->cnum[v] - 1]];
802 if ((status = esl_stack_IPush(pda, v)) != eslOK) goto ERROR;
803 if ((status = esl_stack_IPush(pda, PDA_STATE)) != eslOK) goto ERROR;
804 }
805 } /*end PDA_STATE block*/
806
807 if (cpos == nalloc) {
808 nalloc += 100;
809 ESL_RALLOC(cseq, tmp, sizeof(char) * nalloc);
810 ESL_RALLOC(cstr, tmp, sizeof(char) * nalloc);
811 ESL_RALLOC(ct, tmp, sizeof(int) * nalloc);
812 }
813 }/* PDA now empty... done generating cseq, cstr, and node->consensus residue map */
814 cseq[cpos] = '\0';
815 cstr[cpos] = '\0';
816
817 esl_stack_Destroy(pda);
818 free(multiorder);
819 free(inface);
820 free(outface);
821
822 ESL_ALLOC(con, sizeof(CMConsensus_t));
823 con->cseq = cseq;
824 con->cstr = cstr;
825 con->ct = ct;
826 con->lpos = lpos;
827 con->rpos = rpos;
828 con->clen = cpos;
829 return con;
830
831 ERROR:
832 if(cseq != NULL) free(cseq);
833 if(cstr != NULL) free(cstr);
834 if(ct != NULL) free(ct);
835 if(lpos != NULL) free(lpos);
836 if(rpos != NULL) free(rpos);
837 if(con != NULL) free(con);
838 return NULL;
839 }
840
841 void
FreeCMConsensus(CMConsensus_t * con)842 FreeCMConsensus(CMConsensus_t *con)
843 {
844 if (con->cseq != NULL) free(con->cseq);
845 if (con->cstr != NULL) free(con->cstr);
846 if (con->ct != NULL) free(con->ct);
847 if (con->lpos != NULL) free(con->lpos);
848 if (con->rpos != NULL) free(con->rpos);
849 free(con);
850 }
851
852 /* Function: createMultifurcationOrderChart()
853 * Incept: SRE, Thu May 23 09:48:33 2002 [St. Louis]
854 *
855 * Purpose: Calculates the degree of multifurcation beneath
856 * the master subtree rooted at every node n.
857 * Returns [0..nodes-1] array of these values.
858 *
859 * Terminal stems have value 0. All nodes n starting with
860 * the BEG node for a terminal stem have height[n] = 0.
861 *
862 * A stem "above" a multifurcation into all terminal stems
863 * has value 1; all nodes n starting with BEG and ending
864 * with BIF have height[n] = 1.
865 *
866 * And so on, for "higher order" (deeper) multifurcations.
867 *
868 * Used for figuring out what characters we'll display a
869 * consensus pair with.
870 *
871 * THIS FUNCTION IS BUGGY (Sat Jun 1 12:24:23 2002)
872 *
873 * Args: cm - the model.
874 *
875 * Returns: [0..cm->nodes-1] array of multifurcation orders, for each node.
876 * This array is allocated here; caller free's w/ free().
877 *
878 * xref: STL6 p.58.
879 */
880 static int *
createMultifurcationOrderChart(CM_t * cm)881 createMultifurcationOrderChart(CM_t *cm)
882 {
883 int status;
884 int v, nd, left, right;
885 int *height;
886 int *seg_has_pairs;
887
888 ESL_ALLOC(height, sizeof(int) * cm->nodes);
889 ESL_ALLOC(seg_has_pairs, sizeof(int) * cm->nodes);
890 for (nd = cm->nodes-1; nd >= 0; nd--)
891 {
892 v = cm->nodemap[nd];
893
894 if (cm->stid[v] == MATP_MP) seg_has_pairs[nd] = TRUE;
895 else if (cm->stid[v] == END_E) seg_has_pairs[nd] = FALSE;
896 else if (cm->stid[v] == BIF_B) seg_has_pairs[nd] = FALSE;
897 else seg_has_pairs[nd] = seg_has_pairs[nd+1];
898
899 if (cm->stid[v] == END_E)
900 height[nd] = 0;
901 else if (cm->stid[v] == BIF_B)
902 {
903 left = cm->ndidx[cm->cfirst[v]];
904 right = cm->ndidx[cm->cnum[v]];
905 height[nd] = ESL_MAX(height[left] + seg_has_pairs[left],
906 height[right] + seg_has_pairs[right]);
907 }
908 else
909 height[nd] = height[nd+1];
910 }
911 free(seg_has_pairs);
912 return height;
913
914 ERROR:
915 cm_Fail("Memory allocation error.\n");
916 return 0; /* never reached */
917 }
918
919
920 /* Function: createFaceCharts()
921 * Incept: SRE, Thu May 23 12:40:04 2002 [St. Louis]
922 *
923 * Purpose: Calculate "inface" and "outface" for each node
924 * in the consensus (master) structure of the CM.
925 * These can be used to label nodes:
926 * inface outface
927 * ------------ ----------
928 * external ss any 0
929 * hairpin loop 0 1
930 * bulge/interior 1 1
931 * multifurc >1 1
932 * multifurc 1 >1
933 * doesn't happen 0 >1
934 *
935 * hairpin closing bp 0 1
936 * extern closing bp 1 0
937 * stem bp 1 1
938 * multifurc close bp >1 1
939 * multifurc close bp 1 >1
940 * doesn't happen 0 >1
941 *
942 * Args: cm - the model
943 * ret_inface - RETURN: inface[0..nodes-1]
944 * ret_outface - RETURN: outface[0..nodes-1]
945 *
946 * Returns: inface, outface;
947 * they're alloc'ed here. Caller free's with free()
948 *
949 * Xref: STL6 p.58
950 */
951 static void
createFaceCharts(CM_t * cm,int ** ret_inface,int ** ret_outface)952 createFaceCharts(CM_t *cm, int **ret_inface, int **ret_outface)
953 {
954 int status;
955 int *inface;
956 int *outface;
957 int nd, left, right, parent;
958 int v,y;
959
960 ESL_ALLOC(inface, sizeof(int) * cm->nodes);
961 ESL_ALLOC(outface, sizeof(int) * cm->nodes);
962
963 /* inface - the number of faces below us in descendant
964 * subtrees. if 0, we're either external, or
965 * a closing basepair, or we're in a hairpin loop.
966 * inface is exclusive of current pair - so we
967 * can easily detect closing base pairs.
968 */
969 for (nd = cm->nodes-1; nd >= 0; nd--)
970 {
971 v = cm->nodemap[nd];
972 if (cm->ndtype[nd] == END_nd) inface[nd] = 0;
973 else if (cm->ndtype[nd] == BIF_nd) {
974 left = cm->ndidx[cm->cfirst[v]];
975 right = cm->ndidx[cm->cnum[v]];
976 inface[nd] = inface[left] + inface[right];
977 } else {
978 if (cm->ndtype[nd+1] == MATP_nd) inface[nd] = 1;
979 else inface[nd] = inface[nd+1];
980 }
981 }
982
983 /* outface - the number of faces above us in the tree
984 * excluding our subtree. if 0, we're external.
985 * Like inface, outface is exclusive of current
986 * pair.
987 */
988 for (nd = 0; nd < cm->nodes; nd++)
989 {
990 v = cm->nodemap[nd];
991 if (cm->ndtype[nd] == ROOT_nd) outface[nd] = 0;
992 else if (cm->ndtype[nd] == BEGL_nd)
993 {
994 parent = cm->ndidx[cm->plast[v]];
995 y = cm->nodemap[parent];
996 right = cm->ndidx[cm->cnum[y]];
997 outface[nd] = outface[parent] + inface[right];
998 }
999 else if (cm->ndtype[nd] == BEGR_nd)
1000 {
1001 parent = cm->ndidx[cm->plast[v]];
1002 left = cm->ndidx[cm->cfirst[y]];
1003 outface[nd] = outface[parent] + inface[left];
1004 }
1005 else
1006 {
1007 parent = nd-1;
1008 if (cm->ndtype[parent] == MATP_nd) outface[nd] = 1;
1009 else outface[nd] = outface[parent];
1010 }
1011 }
1012
1013 *ret_inface = inface;
1014 *ret_outface = outface;
1015 return;
1016
1017 ERROR:
1018 cm_Fail("Memory allocation error.\n");
1019 }
1020
1021 /* Function: IsCompensatory()
1022 * Date: SRE, Sun Jun 2 10:16:59 2002 [Madison]
1023 *
1024 * Purpose: Returns TRUE if log[pij/(pi*pj)] is >= 0,
1025 * where pij is the probability of a base pair,
1026 * pi and pj are the marginal probabilities
1027 * for the symbols at i and j.
1028 *
1029 * Currently returns FALSE if symi or symj
1030 * are ambiguous IUPAC nucs. Could do a more
1031 * sophisticated marginalization - prob not
1032 * worth it right now.
1033 *
1034 * Args: pij - joint emission probability vector [0..15]
1035 * indexed symi*4 + symj.
1036 * symi - symbol index at i [0..3], equiv to [a..u]
1037 * symj - symbol index at j [0..3], equiv to [a..u]
1038 *
1039 * Returns: TRUE or FALSE.
1040 */
1041 int
IsCompensatory(const ESL_ALPHABET * abc,float * pij,int symi,int symj)1042 IsCompensatory(const ESL_ALPHABET *abc, float *pij, int symi, int symj)
1043 {
1044 int x;
1045 float pi, pj;
1046
1047 if (symi >= abc->K || symj >= abc->K)
1048 return FALSE;
1049
1050 pi = pj = 0.;
1051 for (x = 0; x < abc->K; x++)
1052 {
1053 pi += pij[symi*abc->K + x];
1054 pj += pij[x*abc->K + symi];
1055 }
1056 if (log(pij[symi*abc->K+symj]) - log(pi) - log(pj) >= 0)
1057 return TRUE;
1058 else
1059 return FALSE;
1060 }
1061
1062 /* Implementation of CMEmitMap_t structure:
1063 * map of a CM's nodes onto consensus sequence positions.
1064 * Structure is declared in infernal.h.
1065 *
1066 * Used for constructing multiple alignments.
1067 *
1068 * clen : consensus length.
1069 * clen is 2* n(MATP) + n(MATL) + n(MATR).
1070 * The consensus sequence is indexed 1..clen.
1071 * 0 and clen+1 are also used, as boundary conditions.
1072 *
1073 * lpos[0..nodes-1] : has values 0 to clen+1.
1074 * Any left match emission from node nd is placed in lpos[nd].
1075 * Any left insert emission from node nd *follows* lpos[nd].
1076 *
1077 * rpos[0..nodes-1] : has values 0..clen+1
1078 * Any right match emission from node nd is placed in rpos[nd].
1079 * Any right insert emission from node nd *precedes* rpos[nd]
1080 *
1081 * epos[0..nodes-1] : has values 0..clen+1.
1082 * Any EL insertion from a nd->EL transition *follows* epos[nd].
1083 *
1084 * There are no dummy values; all lpos, rpos, epos are valid coords
1085 * 0..clen+1, as described above, even for END_nd's.
1086 *
1087 * For nonemitting nodes, rpos and lpos give a noninclusive bound:
1088 * for example, lpos[0] = 0 and rpos[0] = clen+1 by definition.
1089 *
1090 * Insertions occur between consensus positions. An inter-consensus-position
1091 * space may contain more than one type of insertion: an IL insertion and
1092 * an EL insertion, an IR insertion and an EL insertion; or (in
1093 * a single absurd case of a model with a consensus length of 0)
1094 * all three insertion types. Insertions are placed in order IL/EL/IR.
1095 *
1096 */
1097
1098 /* Function: CreateEmitMap()
1099 * Date: ? (SRE pre Infernal version 0.55).
1100 *
1101 * Purpose: Create and fill an emit map for a given CM <cm>
1102 * and return it.
1103 * If we run out of memory or we have a problem creating
1104 * the map, we return NULL.
1105 *
1106 */
1107
1108 CMEmitMap_t *
CreateEmitMap(CM_t * cm)1109 CreateEmitMap(CM_t *cm)
1110 {
1111 int status;
1112 CMEmitMap_t *map = NULL;
1113 ESL_STACK *pda;
1114 int cpos;
1115 int nd;
1116 int on_right;
1117
1118 ESL_ALLOC(map, sizeof(CMEmitMap_t));
1119 ESL_ALLOC(map->lpos, sizeof(int) * cm->nodes);
1120 ESL_ALLOC(map->rpos, sizeof(int) * cm->nodes);
1121 ESL_ALLOC(map->epos, sizeof(int) * cm->nodes);
1122
1123 for (nd = 0; nd < cm->nodes; nd++)
1124 map->lpos[nd] = map->rpos[nd] = map->epos[nd] = -1;
1125
1126 cpos = 0;
1127 nd = 0;
1128 if ((pda = esl_stack_ICreate()) == NULL) goto ERROR;
1129 if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR; /* 0 = left side. 1 would = right side. */
1130 if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
1131 while (esl_stack_IPop(pda, &nd) != eslEOD)
1132 {
1133 esl_stack_IPop(pda, &on_right);
1134
1135 if (on_right)
1136 {
1137 map->rpos[nd] = cpos+1;
1138 if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cpos++;
1139 }
1140 else
1141 {
1142 if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cpos++;
1143 map->lpos[nd] = cpos;
1144
1145 if (cm->ndtype[nd] == BIF_nd)
1146 {
1147 /* push the BIF back on for its right side */
1148 if ((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
1149 if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
1150 /* push node index for right child */
1151 if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
1152 if ((status = esl_stack_IPush(pda, cm->ndidx[cm->cnum[cm->nodemap[nd]]])) != eslOK) goto ERROR;
1153 /* push node index for left child */
1154 if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
1155 if ((status = esl_stack_IPush(pda, cm->ndidx[cm->cfirst[cm->nodemap[nd]]])) != eslOK) goto ERROR;
1156 }
1157 else
1158 {
1159 /* push the node back on for right side */
1160 if ((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
1161 if ((status = esl_stack_IPush(pda, nd)) != eslOK) goto ERROR;
1162 /* push child node on */
1163 if (cm->ndtype[nd] != END_nd) {
1164 if ((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
1165 if ((status = esl_stack_IPush(pda, nd+1)) != eslOK) goto ERROR;
1166 }
1167 }
1168 }
1169 }
1170
1171 /* Construct the epos map: if we do a v->EL transition,
1172 * the EL follows what consensus position (and its IL insertions,
1173 * if any)
1174 */
1175 for (nd = cm->nodes-1; nd >= 0; nd--) {
1176 if (cm->ndtype[nd] == END_nd)
1177 cpos = map->lpos[nd];
1178 else if (cm->ndtype[nd] == BIF_nd) /* propagate epos for *right* branch. */
1179 cpos = map->epos[cm->ndidx[cm->cnum[cm->nodemap[nd]]]];
1180
1181 map->epos[nd] = cpos;
1182 }
1183
1184 map->clen = map->rpos[0]-1;
1185 esl_stack_Destroy(pda);
1186
1187 /* ensure we've filled in the map correctly */
1188 for (nd = 0; nd < cm->nodes; nd++) {
1189 if(map->lpos[nd] == -1) goto ERROR;
1190 if(map->rpos[nd] == -1) goto ERROR;
1191 if(map->epos[nd] == -1) goto ERROR;
1192 }
1193
1194 return map;
1195
1196 ERROR:
1197 if(map != NULL) FreeEmitMap(map);
1198 return NULL;
1199 }
1200
1201
1202 /* Function: SizeofEmitMap()
1203 * Date: EPN, Wed Jan 18 10:15:46 2012
1204 *
1205 * Purpose: Calculate and return the size of an emit map
1206 * in Mb.
1207 */
1208
1209 float
SizeofEmitMap(CM_t * cm,CMEmitMap_t * emap)1210 SizeofEmitMap(CM_t *cm, CMEmitMap_t *emap)
1211 {
1212 float bytes = 0.;
1213
1214 if(emap == NULL) return 0.;
1215
1216 bytes = sizeof(CMEmitMap_t);
1217 bytes += sizeof(int) * cm->nodes; /* lpos */
1218 bytes += sizeof(int) * cm->nodes; /* rpos */
1219 bytes += sizeof(int) * cm->nodes; /* epos */
1220
1221 return bytes / 1000000.;
1222 }
1223
1224 void
DumpEmitMap(FILE * fp,CMEmitMap_t * map,CM_t * cm)1225 DumpEmitMap(FILE *fp, CMEmitMap_t *map, CM_t *cm)
1226 {
1227 int nd;
1228
1229 fprintf(fp, "CM to consensus emit map; consensus length = %d \n",
1230 map->clen);
1231 fprintf(fp, "%4s %7s %9s %4s %4s %4s\n",
1232 "Node", "State 1", "Node type", "lpos", "rpos", "epos");
1233 fprintf(fp, "%4s %7s %9s %4s %4s %4s\n",
1234 "----", "-------", "---------", "----", "----", "----");
1235 for (nd = 0; nd < cm->nodes; nd++)
1236 fprintf(fp, "%4d %7d %9s %4d %4d %4d\n",
1237 nd, cm->nodemap[nd], Nodetype(cm->ndtype[nd]),
1238 map->lpos[nd], map->rpos[nd], map->epos[nd]);
1239 }
1240
1241 void
FreeEmitMap(CMEmitMap_t * map)1242 FreeEmitMap(CMEmitMap_t *map)
1243 {
1244 free(map->lpos);
1245 free(map->rpos);
1246 free(map->epos);
1247 free(map);
1248 }
1249
1250 /* format_time_string()
1251 * Date: SRE, Fri Nov 26 15:06:28 1999 [St. Louis]
1252 *
1253 * Purpose: Given a number of seconds, format into
1254 * hh:mm:ss.xx in a provided buffer.
1255 *
1256 * Args: buf - allocated space (128 is plenty!)
1257 * sec - number of seconds
1258 * do_frac - TRUE (1) to include hundredths of a sec
1259 */
1260 void
FormatTimeString(char * buf,double sec,int do_frac)1261 FormatTimeString(char *buf, double sec, int do_frac)
1262 {
1263 int h, m, s, hs;
1264
1265 h = (int) (sec / 3600.);
1266 m = (int) (sec / 60.) - h * 60;
1267 s = (int) (sec) - h * 3600 - m * 60;
1268 if (do_frac) {
1269 hs = (int) (sec * 100.) - h * 360000 - m * 6000 - s * 100;
1270 sprintf(buf, "%02d:%02d:%02d.%02d", h,m,s,hs);
1271 } else {
1272 sprintf(buf, "%02d:%02d:%02d", h,m,s);
1273 }
1274 }
1275
1276
1277 /* Function: GetCommand
1278 * Date: EPN, Fri Jan 25 13:56:10 2008
1279 *
1280 * Purpose: Return the command used to call an infernal executable
1281 * in <ret_command>.
1282 *
1283 * Returns: eslOK on success; eslEMEM on allocation failure.
1284 */
1285 int
GetCommand(const ESL_GETOPTS * go,char * errbuf,char ** ret_command)1286 GetCommand(const ESL_GETOPTS *go, char *errbuf, char **ret_command)
1287 {
1288 int status;
1289 int i;
1290 char *command = NULL;
1291
1292 for (i = 0; i < go->argc; i++) { /* copy all command line options and args */
1293 if((status = esl_strcat(&(command), -1, go->argv[i], -1)) != eslOK) goto ERROR;
1294 if(i < (go->argc-1)) if((status = esl_strcat(&(command), -1, " ", 1)) != eslOK) goto ERROR;
1295 }
1296 *ret_command = command;
1297
1298 return eslOK;
1299
1300 ERROR:
1301 ESL_FAIL(status, errbuf, "GetCommand(): memory allocation error.");
1302 return status;
1303 }
1304
1305 /* Function: GetDate
1306 * Date: EPN, Fri Jan 25 13:59:22 2008
1307 *
1308 * Purpose: Return a string that gives the current date.
1309 *
1310 * Returns: eslOK on success; eslEMEM on allocation failure.
1311 */
1312 int
GetDate(char * errbuf,char ** ret_date)1313 GetDate(char *errbuf, char **ret_date)
1314 {
1315 int status;
1316 time_t date = time(NULL);
1317 char *sdate = NULL;
1318
1319 if((status = esl_strdup(ctime(&date), -1, &sdate)) != eslOK) goto ERROR;
1320 esl_strchop(sdate, -1); /* doesn't return anything but eslOK */
1321
1322 *ret_date = sdate;
1323 return eslOK;
1324
1325 ERROR:
1326 ESL_FAIL(status, errbuf, "get_date() error status: %d, probably out of memory.", status);
1327 return status;
1328 }
1329
1330
1331 /* Function: bp_is_canonical
1332 * Date: EPN, Wed Oct 14 06:17:27 2009
1333 *
1334 * Purpose: Determine if two residues form a canonical base pair or not.
1335 * Works for RNA or DNA (because for some reason cmsearch allows
1336 * the user to format output as DNA (with --dna)).
1337 *
1338 * Returns: TRUE if:
1339 * lseq rseq
1340 * ---- ----
1341 * A U
1342 * U A
1343 * C G
1344 * G C
1345 * G U
1346 * U G
1347 * A T
1348 * T A
1349 * G T
1350 * T G
1351 * Else, return FALSE.
1352 */
1353 int
bp_is_canonical(char lseq,char rseq)1354 bp_is_canonical(char lseq, char rseq)
1355 {
1356 switch (toupper(lseq)) {
1357 case 'A':
1358 switch (toupper(rseq)) {
1359 case 'U': return TRUE; break;
1360 case 'T': return TRUE; break;
1361 default: break;
1362 }
1363 break;
1364 case 'C':
1365 switch (toupper(rseq)) {
1366 case 'G': return TRUE; break;
1367 default: break;
1368 }
1369 break;
1370 case 'G':
1371 switch (toupper(rseq)) {
1372 case 'C': return TRUE; break;
1373 case 'U': return TRUE; break;
1374 case 'T': return TRUE; break;
1375 default: break;
1376 }
1377 break;
1378 case 'U':
1379 switch (toupper(rseq)) {
1380 case 'A': return TRUE; break;
1381 case 'G': return TRUE; break;
1382 default: break;
1383 }
1384 break;
1385 case 'T':
1386 switch (toupper(rseq)) {
1387 case 'A': return TRUE; break;
1388 case 'G': return TRUE; break;
1389 default: break;
1390 }
1391 break;
1392 default: break;
1393 }
1394
1395 return FALSE;
1396 }
1397
1398