1 /* cmemit: generate sequences from a CM.
2  *
3  * EPN, 09.01.06 Janelia Farm
4  * based on HMMER-2.3.2's hmmemit.c from SRE
5  * Easelfied: EPN, Tue Aug 14 07:01:44 2007
6  */
7 
8 #include "esl_config.h"
9 #include "config.h"
10 
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <time.h>
15 
16 #include "easel.h"
17 #include <esl_getopts.h>
18 #include <esl_histogram.h>
19 #include <esl_random.h>
20 #include <esl_randomseq.h>
21 #include <esl_stats.h>
22 #include <esl_vectorops.h>
23 #include <esl_wuss.h>
24 
25 #include "hmmer.h"
26 
27 #include "infernal.h"
28 
29 #define ALPHOPTS "--rna,--dna"  /* Exclusive options for alphabet choice */
30 #define OUTOPTS  "-u,-c,-a"     /* Exclusive options for output */
31 
32 static ESL_OPTIONS options[] = {
33   /* name          type            default       env   range   toggles       reqs  incomp       help                                     docgroup*/
34   { "-h",          eslARG_NONE,    FALSE,        NULL, NULL,   NULL,         NULL, NULL,        "show brief help on version and usage",                                 1 },
35   { "-o",          eslARG_OUTFILE, FALSE,        NULL, NULL,   NULL,         NULL, NULL,        "send sequence output to file <f>, not stdout",                         1 },
36   { "-N",          eslARG_INT,      "10",        NULL, "n>0",  NULL,         NULL, NULL,        "generate <n> sequences",                                               1 },
37   { "-u",          eslARG_NONE, "default",       NULL, NULL,   OUTOPTS,      NULL, NULL,        "write generated sequences as unaligned FASTA [default]",               1 },
38   { "-a",          eslARG_NONE,    FALSE,        NULL, NULL,   OUTOPTS,      NULL, NULL,        "write generated sequences as an alignment",                            1 },
39   { "-c",          eslARG_NONE,    FALSE,        NULL, NULL,   OUTOPTS,      NULL, NULL,        "generate a single \"consensus\" sequence only",                        1, },
40   { "-e",          eslARG_INT,      NULL,        NULL, "n>0",  NULL,         NULL, "-a,-c",     "embed emitted sequences within larger random sequences of length <n>", 1 },
41   { "-l",          eslARG_NONE,    FALSE,        NULL, NULL,   NULL,         NULL, NULL,        "local; emit from a locally configured model [default: global]",        1 },
42   /* options for truncating emitted sequences */
43   { "--u5p",       eslARG_NONE,     NULL,        NULL, NULL,   NULL,         NULL, "-a,-c",     "truncate unaligned sequences 5', choosing a random start posn",      2 },
44   { "--u3p",       eslARG_NONE,     NULL,        NULL, NULL,   NULL,         NULL, "-a,-c",     "truncate unaligned sequences 3', choosing a random end   posn",      2 },
45   { "--a5p",       eslARG_INT,      NULL,        NULL, "n>=0", NULL,   "--a3p,-a", NULL,        "truncate aln 5', start at match column <n> (use 0 for random posn)", 2 },
46   { "--a3p",       eslARG_INT,      NULL,        NULL, "n>=0", NULL,   "--a5p,-a", NULL,        "truncate aln 3', end   at match column <n> (use 0 for random posn)", 2 },
47   /* other options */
48   { "--seed",      eslARG_INT,      "0",         NULL, "n>=0", NULL,         NULL, NULL,        "set RNG seed to <n> [default: one-time arbitrary seed]", 3 },
49   { "--iid",       eslARG_NONE,     FALSE,       NULL, NULL,   NULL,         "-e", NULL,        "with -e, generate larger sequences as 25% ACGU (iid) ",  3 },
50   { "--rna",       eslARG_NONE,     "default",   NULL, NULL,   ALPHOPTS,     NULL, NULL,        "output as RNA sequence data",                            3 },
51   { "--dna",       eslARG_NONE,     FALSE,       NULL, NULL,   ALPHOPTS,     NULL, NULL,        "output as DNA sequence data",                            3 },
52   { "--idx",       eslARG_INT,      "1",         NULL, "n>0",  NULL,         NULL, NULL,        "start sequence numbering at <n>",                        3 },
53   { "--outformat", eslARG_STRING,   "Stockholm", NULL, NULL,   NULL,         "-a", NULL,        "w/-a output alignment in format <s>",                    3 },
54   { "--tfile",     eslARG_OUTFILE,  NULL,        NULL, NULL,   NULL,         NULL, "-c,-e,--u5p,--u3p", "dump parsetrees to file <f>",                            3 },
55   { "--exp",       eslARG_REAL,     NULL,        NULL, "x>0",  NULL,         NULL, NULL,        "exponentiate CM probabilities by <x> before emitting",   3 },
56   { "--hmmonly",   eslARG_NONE,     NULL,        NULL, NULL,   NULL,         NULL, NULL,        "emit from filter HMM, not from CM",                      3 },
57   { "--nohmmonly", eslARG_NONE,     NULL,        NULL, NULL,   NULL,         NULL, "--hmmonly", "always emit from CM, even for models with 0 basepairs", 3 },
58   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
59 };
60 
61 /* struct cfg_s : "Global" application configuration shared by all
62  * threads/processes
63  *
64  * This structure is passed to routines within main.c, as a means of
65  * semi-encapsulation of shared data amongst different parallel
66  * processes (threads or MPI processes).  This strategy is used
67  * despite the fact that neither a MPI nor a threaded version of
68  * cmemit exists.
69  */
70 struct cfg_s {
71   char         *cmfile;	        /* name of input CM file  */
72   CM_FILE      *cmfp;		/* open input CM file stream       */
73   ESL_ALPHABET *abc;		/* digital alphabet for CM */
74   ESL_ALPHABET *abc_out; 	/* digital alphabet for writing */
75   FILE         *ofp;		/* output file (default is stdout) */
76   FILE         *tfp;		/* optional output file for parsetrees */
77   ESL_RANDOMNESS *r;            /* source of randomness */
78   int           ncm;            /* number CM we're at in file */
79   int           use_cm;         /* TRUE to emit from CM for current model, FALSE to emit from filter HMM */
80 };
81 
82 static char usage[]  = "[-options] <cmfile>";
83 static char banner[] = "sample sequences from a covariance model";
84 
85 static int  init_cfg(const ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf);
86 
87 static void master(const ESL_GETOPTS *go, struct cfg_s *cfg);
88 
89 static int initialize_cm(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf);
90 static int emit_unaligned(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf);
91 static int emit_alignment(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf);
92 static int emit_consensus(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm);
93 static int truncate_msa(const ESL_GETOPTS *go, const struct cfg_s *cfg, ESL_MSA *msa, const ESL_ALPHABET *abc, char *errbuf);
94 
95 int
main(int argc,char ** argv)96 main(int argc, char **argv)
97 {
98   ESL_GETOPTS     *go = NULL;   /* command line processing                     */
99   struct cfg_s     cfg;
100 
101   /* setup logsum lookups (could do this only if nec based on options, but this is safer) */
102   init_ilogsum();
103   FLogsumInit();
104 
105   /***********************************************
106    * Parse command line
107    ***********************************************/
108 
109   /* Process command line options.
110    */
111   go = esl_getopts_Create(options);
112   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
113       esl_opt_VerifyConfig(go)               != eslOK) {
114     printf("Failed to parse command line: %s\n", go->errbuf);
115     esl_usage(stdout, argv[0], usage);
116     printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
117     exit(1);
118   }
119   if (esl_opt_GetBoolean(go, "-h")) {
120     cm_banner(stdout, argv[0], banner);
121     esl_usage(stdout, argv[0], usage);
122     puts("\nBasic options:");
123     esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup, 2 = indentation; 80=textwidth*/
124     puts("\nOptions for truncating sequences:");
125     esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
126     puts("\nOther options:");
127     esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
128     puts("\nAlignment output formats (-a) include: Stockholm, Pfam, AFA (aligned FASTA), A2M, Clustal, PHYLIP\n");
129     exit(0);
130   }
131   if (esl_opt_ArgNumber(go) != 1) {
132     puts("Incorrect number of command line arguments.");
133     esl_usage(stdout, argv[0], usage);
134     puts("\n  where basic options are:");
135     esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
136     printf("\nTo see more help on other available options, do %s -h\n\n", argv[0]);
137     exit(1);
138   }
139   /* Initialize what we can in the config structure (without knowing the alphabet yet).
140    * We could assume RNA, but this HMMER3 based approach is more general.
141    */
142   cfg.cmfile     = esl_opt_GetArg(go, 1);
143   cfg.cmfp       = NULL;	           /* opened in init_cfg() */
144   cfg.ofp        = NULL;                   /* opened in init_cfg() */
145   cfg.abc        = NULL;	           /* created in init_cfg() */
146   cfg.abc_out    = NULL;	           /* created in init_cfg() */
147   cfg.tfp        = NULL;	           /* opened in init_cfg() */
148   cfg.r          = NULL;	           /* created in init_cfg() */
149 
150   /* do work */
151   master(go, &cfg);
152 
153   /* Clean up the cfg.
154    */
155   if(esl_opt_IsOn(go, "-o")) {
156     fclose(cfg.ofp);
157   }
158   if (cfg.tfp   != NULL) {
159     fclose(cfg.tfp);
160   }
161   if (cfg.abc     != NULL) { esl_alphabet_Destroy(cfg.abc); cfg.abc = NULL; }
162   if (cfg.abc_out != NULL) esl_alphabet_Destroy(cfg.abc_out);
163   if (cfg.cmfp    != NULL) cm_file_Close(cfg.cmfp);
164   if (cfg.r       != NULL) esl_randomness_Destroy(cfg.r);
165 
166   esl_getopts_Destroy(go);
167   return 0;
168 }
169 
170 /* init_cfg()
171  * Already set:
172  *    cfg->cmfile  - command line arg 1
173  * Sets:
174  *    cfg->cmfp    - open CM file
175  *    cfg->abc_out - digital alphabet for output
176  *    cfg->tfp     - optional output file for parsetrees
177  *    cfg->r       - source of randomness
178  */
179 static int
init_cfg(const ESL_GETOPTS * go,struct cfg_s * cfg,char * errbuf)180 init_cfg(const ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf)
181 {
182   int status;
183 
184   /* open CM file for reading */
185   status = cm_file_Open(cfg->cmfile, NULL, FALSE, &(cfg->cmfp), errbuf);
186   if      (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cfg->cmfile, errbuf);
187   else if (status == eslEFORMAT)   cm_Fail("File format problem in trying to open CM file %s.\n%s\n",                cfg->cmfile, errbuf);
188   else if (status != eslOK)        cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n",               status, cfg->cmfile, errbuf);
189 
190   /* open sequence output file for writing */
191   if ( esl_opt_IsOn(go, "-o") ) {
192     if ((cfg->ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open output file %s", esl_opt_GetString(go, "-o"));
193   } else cfg->ofp = stdout;
194 
195 
196   /* create output alphabet */
197   if      (esl_opt_GetBoolean(go, "--rna"))     cfg->abc_out = esl_alphabet_Create(eslRNA);
198   else if (esl_opt_GetBoolean(go, "--dna"))     cfg->abc_out = esl_alphabet_Create(eslDNA);
199 
200   /* open parsetree output file if necessary */
201   if (esl_opt_GetString(go, "--tfile") != NULL) {
202     if ((cfg->tfp = fopen(esl_opt_GetString(go, "--tfile"), "w")) == NULL)
203       ESL_FAIL(eslFAIL, errbuf, "Failed to open --tfile output file %s\n", esl_opt_GetString(go, "--tfile"));
204   }
205 
206   /* create RNG */
207   cfg->r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "--seed"));
208 
209   if (cfg->abc_out == NULL) ESL_FAIL(eslEINVAL, errbuf, "Output alphabet creation failed.");
210   if (cfg->r       == NULL) ESL_FAIL(eslEINVAL, errbuf, "Failed to create random number generator: probably out of memory");
211 
212   return eslOK;
213 }
214 
215 /* master()
216  * The serial version of cmemit. (There is no parallel version yet).
217  * For each CM, emit sequences/alignment and create output.
218  *
219  * We only return if successful. All errors are handled immediately and fatally with cm_Fail().
220  */
221 static void
master(const ESL_GETOPTS * go,struct cfg_s * cfg)222 master(const ESL_GETOPTS *go, struct cfg_s *cfg)
223 {
224   int      status;
225   char     errbuf[eslERRBUFSIZE];
226   CM_t    *cm  = NULL;
227 
228   if ((status = init_cfg(go, cfg, errbuf)) != eslOK) cm_Fail(errbuf);
229 
230   cfg->ncm = 0;
231 
232   while ((status = cm_file_Read(cfg->cmfp, TRUE, &(cfg->abc), &cm)) == eslOK)
233   {
234     if (cm == NULL) cm_Fail("Failed to read CM from %s -- file corrupt?\n", cfg->cmfile);
235     cfg->ncm++;
236 
237     /* determine if we'll emit from the CM or the filter HMM for this model */
238     if      (esl_opt_GetBoolean(go, "--nohmmonly")) cfg->use_cm = TRUE;
239     else if (esl_opt_GetBoolean(go, "--hmmonly"))   cfg->use_cm = FALSE;
240     else                                            cfg->use_cm = (CMCountNodetype(cm, MATP_nd) > 0) ? TRUE : FALSE;
241 
242     if((status = initialize_cm(go, cfg, cm, errbuf)) != eslOK) cm_Fail(errbuf);
243 
244     /* Pick 1 of 4 exclusive output options. Output is handled within each function. */
245     if(esl_opt_GetBoolean(go, "-c")) {
246       emit_consensus(go, cfg, cm);
247     }
248     else if(esl_opt_GetBoolean(go, "-a")) {
249       if((status = emit_alignment(go, cfg, cm, errbuf)) != eslOK) cm_Fail(errbuf);
250     }
251     else {
252       if((status = emit_unaligned(go, cfg, cm, errbuf)) != eslOK) cm_Fail(errbuf);
253     }
254     FreeCM(cm);
255   }
256   if(status != eslEOF) cm_Fail(cfg->cmfp->errbuf);
257   return;
258 }
259 
260 /* initialize_cm()
261  * Setup the CM based on the command-line options/defaults;
262  * only set flags and a few parameters. cm_Configure() configures
263  * the CM.
264  */
265 static int
initialize_cm(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm,char * errbuf)266 initialize_cm(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf)
267 {
268   int status;
269 
270   /* Update cfg->cm->config_opts and cfg->cm->align_opts based on command line options */
271   if(esl_opt_GetBoolean(go, "-l")) {
272     cm->config_opts |= CM_CONFIG_LOCAL;
273     cm->config_opts |= CM_CONFIG_HMMLOCAL;
274     cm->config_opts |= CM_CONFIG_HMMEL;
275   }
276 
277   /* exponentiate the CM of fHMM, if nec, do this before possibly configuring to local alignment in cm_Configure() */
278   if(esl_opt_IsOn(go, "--exp")) {
279     if(cfg->use_cm) cm_Exponentiate   (cm,      esl_opt_GetReal(go, "--exp"));
280     else            cm_p7_Exponentiate(cm->fp7, esl_opt_GetReal(go, "--exp"));
281   }
282   if((status = cm_Configure(cm, errbuf, -1)) != eslOK) return status;
283 
284   return eslOK;
285 }
286 
287 /* emit_unaligned
288  * Given a configured CM, generate and output unaligned sequences.
289  */
290 static int
emit_unaligned(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm,char * errbuf)291 emit_unaligned(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf)
292 {
293   int status;
294   Parsetree_t *tr  = NULL;  /* emitted parsetree */
295   ESL_SQ *esq      = NULL;  /* sequence emitted from CM */
296   ESL_SQ *tsq      = NULL;  /* truncated esq, if nec, or ptr to esq */
297   ESL_SQ *gsq      = NULL;  /* generated sequence (iid) to embed tsq in, if nec, else ptr to tsq */
298   ESL_SQ *sq2print = NULL;  /* ptr to sequence we'll output */
299   char *name;               /* sequence name, for EmitParsetree() */
300   int namelen;              /* length of name */
301   int i, L, embedL;         /* sequence idx, length, length of sq to embed in */
302   int x;                    /* residue idx */
303   float sc, struct_sc;      /* parsetree score, structure score */
304   int offset = esl_opt_GetInteger(go, "--idx"); /* seq index to start naming at */
305   int start, end, swap;     /* for truncating sequences */
306 
307   /* the HMM that generates background sequences */
308   int      ghmm_nstates = 0; /* number of states in the HMM */
309   double  *ghmm_sA  = NULL;  /* start probabilities [0..ghmm_nstates-1] */
310   double **ghmm_tAA = NULL;  /* transition probabilities [0..nstates-1][0..nstates-1] */
311   double **ghmm_eAA = NULL;  /* emission probabilities   [0..nstates-1][0..abc->K-1] */
312   double  *fq       = NULL;  /* double vec for esl_rsq_XIID, only used if --iid */
313 
314   /* parameters necessary only if cfg->use_cm is FALSE, and we'll be
315    * emitting from the filter p7 HMM.
316    */
317   int         hmm_mode   = esl_opt_GetBoolean(go, "-l") ? p7_UNILOCAL : p7_UNIGLOCAL;
318   P7_TRACE   *p7tr       = NULL;
319   P7_PROFILE *gm         = NULL;
320   P7_BG      *bg         = NULL;
321 
322   /* Contract check, output alphabet must be identical to CM alphabet
323    * with sole exception that CM alphabet can be eslRNA with output
324    * alphabet eslDNA. And -e only works with 0 or 1 of --u5p,
325    * --u3p, not both.
326    */
327   if(cm->abc->type != cfg->abc_out->type) {
328     if(! (cm->abc->type == eslRNA && cfg->abc_out->type == eslDNA)) {
329       ESL_FAIL(eslFAIL, errbuf, "CM alphabet type must match output alphabet type (unless CM=RNA and output=DNA).");
330     }
331   }
332   if(esl_opt_IsOn(go, "-e") && esl_opt_IsOn(go, "--u5p") && esl_opt_IsOn(go, "--u3p")) {
333     ESL_FAIL(eslFAIL, errbuf, "-e works in combination with --u5p or --u3p but not both");
334   }
335 
336   namelen = IntMaxDigits() + strlen("sample") + 1;  /* IntMaxDigits() returns number of digits in INT_MAX */
337   if(cm->name != NULL) namelen += strlen(cm->name) + 1;
338   ESL_ALLOC(name, sizeof(char) * namelen);
339 
340   if(! cfg->use_cm) {
341     if ((gm = p7_profile_Create(cm->fp7->M, cm->fp7->abc))               == NULL)  cm_Fail("failed to create profile for filter HMM");
342     if ((bg = p7_bg_Create(cm->fp7->abc))                                == NULL)  cm_Fail("failed to create null model for filter HMM");
343     if (p7_ProfileConfig(cm->fp7, bg, gm, cm->fp7->max_length, hmm_mode) != eslOK) cm_Fail("failed to configure profile for filter HMM");
344     /* set N->N and C->C transitions to IMPOSSIBLE, we don't want them emitting */
345     gm->xsc[p7P_N][p7P_MOVE] = gm->xsc[p7P_C][p7P_MOVE] = 0.0f;
346     gm->xsc[p7P_N][p7P_LOOP] = gm->xsc[p7P_C][p7P_LOOP] = -eslINFINITY;
347   }
348 
349   if(esl_opt_IsOn(go, "-e")) {
350     embedL = esl_opt_GetInteger(go, "-e");
351     /* if --iid we'll generate iid seqs, else we'll use our generative
352      * HMM for 'realistic' genome like background.
353      */
354     if(esl_opt_GetBoolean(go, "--iid")) {
355       ESL_ALLOC(fq, sizeof(double) * cfg->abc_out->K); /* iid, currently only option */
356       esl_vec_DSet(fq, cfg->abc_out->K, 1.0 / (double) cfg->abc_out->K);
357     }
358     else {
359       if((status = CreateGenomicHMM(cfg->abc_out, errbuf, &ghmm_sA, &ghmm_tAA, &ghmm_eAA, &ghmm_nstates)) != eslOK) ESL_FAIL(status, errbuf, "unable to create generative HMM\n%s", errbuf);
360     }
361   }
362 
363   for(i = 0; i < esl_opt_GetInteger(go, "-N"); i++)
364     {
365       if(cm->name != NULL) sprintf(name, "%s-sample%d", cm->name, i+offset);
366       else                 sprintf(name, "%d-sample%d", cfg->ncm, i+offset);
367       if(cfg->use_cm) {
368 	if((status = EmitParsetree(cm, errbuf, cfg->r, name, TRUE, &tr, &esq, &L)) != eslOK) return status; /* TRUE: make sq digital */
369 	esq->abc = cfg->abc_out;
370       }
371       else {
372 	esq = esl_sq_CreateDigital(cfg->abc_out);
373 	if((p7tr = p7_trace_Create()) == NULL)                                     cm_Fail("failed to allocate p7 trace");
374 	/*if((status = p7_CoreEmit(cfg->r, cm->fp7, esq, p7tr)) != eslOK) cm_Fail("failed to emit sequence from cm->fp7");*/
375 	if((status = p7_ProfileEmit(cfg->r, cm->fp7, gm, bg, esq, p7tr)) != eslOK) cm_Fail("failed to emit sequence from cm->fp7");
376 	if((status = esl_sq_SetName(esq, name)) != eslOK)                          cm_Fail("failed to name emitted sequence from cm->fp7");
377       }
378       sq2print = esq; /* we may change what sq2print points to below */
379 
380       /* truncate esq if nec */
381       if(esl_opt_IsOn(go, "--u5p") || esl_opt_IsOn(go, "--u3p")) {
382 	start = esl_opt_IsOn(go, "--u5p") ? esl_rnd_Roll(cfg->r, sq2print->n) + 1 : 1;
383 	end   = esl_opt_IsOn(go, "--u3p") ? esl_rnd_Roll(cfg->r, sq2print->n) + 1 : sq2print->n;
384 	if(start > end) { swap = start; start = end; end = swap; }
385 	if((tsq = esl_sq_CreateDigitalFrom(sq2print->abc, sq2print->name, (sq2print->dsq+start-1), (end-start+1),
386 					   sq2print->desc, sq2print->acc, sq2print->ss)) == NULL) ESL_FAIL(status, errbuf, "out of memory");
387 	tsq->dsq[0] = tsq->dsq[tsq->n+1] = eslDSQ_SENTINEL;
388 	/* destroy sq2print (which points at esq), and point it at tsq */
389 	esl_sq_Destroy(sq2print);
390 	sq2print = tsq;
391       }
392 
393       /* generate sequence to embed in, and embed in it if nec  */
394       if(esl_opt_IsOn(go, "-e")) {
395 	if(sq2print->n > embedL) ESL_FAIL(eslEINCOMPAT, errbuf, "<n>=%d from -eL <n> too small for emitted seq of length %" PRId64 ", increase <n> and rerun", embedL, sq2print->n);
396 
397 	gsq = esl_sq_CreateDigital(cfg->abc_out);
398 	if((status = esl_sq_GrowTo(gsq, embedL)) != eslOK) ESL_FAIL(status, errbuf, "out of memory");
399 	if(esl_opt_GetBoolean(go, "--iid")) {
400 	  esl_rsq_xIID(cfg->r, fq, cfg->abc_out->K, embedL, gsq->dsq);
401 	}
402 	else {
403 	  free(gsq->dsq); /* slightly wasteful */
404 	  if((status = SampleGenomicSequenceFromHMM(cfg->r, cfg->abc_out, errbuf, ghmm_sA, ghmm_tAA, ghmm_eAA, ghmm_nstates, embedL, &(gsq->dsq))) != eslOK) {
405 	    ESL_FAIL(status, errbuf, "failed to generate random background sequence to embed in");
406 	  }
407 	}
408 	gsq->n = embedL;
409 
410 	/* embed (contract enforced at least one of --u5p and --u3p is not on) */
411 	if     (esl_opt_IsOn(go, "--u5p")) { start = 1; }
412 	else if(esl_opt_IsOn(go, "--u3p")) { start = embedL - sq2print->n + 1; }
413 	else                               { start = esl_rnd_Roll(cfg->r, embedL - sq2print->n + 1) + 1; }
414 	for(x = start; x < start + sq2print->n; x++) gsq->dsq[x] = sq2print->dsq[x - start + 1];
415 	/* set name */
416 	esl_sq_FormatName(gsq, "%s/%" PRId64 "-%" PRId64, sq2print->name, (int64_t) start, (int64_t) (start + sq2print->n - 1));
417 	/* destroy sq2print (which points at esq or tsq), and point it at gsq */
418 	esl_sq_Destroy(sq2print);
419 	sq2print = gsq;
420       }
421 
422       /* output sq2print */
423       if((esl_sqio_Write(cfg->ofp, sq2print, eslSQFILE_FASTA, FALSE)) != eslOK) ESL_FAIL(eslFAIL, errbuf, "Error writing unaligned sequences.");
424 
425       /* output parsetree/trace if nec */
426       if(cfg->tfp != NULL) { /* can only be true if none of -e, --u5p and --u3p were used (thus sq2print == esq) */
427 	fprintf(cfg->tfp, "> %s\n", esq->name);
428 	if(cfg->use_cm) {
429 	  if((status = ParsetreeScore(cm, NULL, errbuf, tr, sq2print->dsq, FALSE, &sc, &struct_sc, NULL, NULL, NULL)) != eslOK) return status;
430 	  fprintf(cfg->tfp, "  %16s %.2f bits\n", "SCORE:", sc);
431 	  fprintf(cfg->tfp, "  %16s %.2f bits\n", "STRUCTURE SCORE:", struct_sc);
432 	  ParsetreeDump(cfg->tfp, tr, cm, sq2print->dsq);
433 	}
434 	else {
435 	  if((status = p7_trace_Score(p7tr, sq2print->dsq, gm, &sc)) != eslOK) ESL_FAIL(status, errbuf, "unable to score p7 trace");
436 	  struct_sc = 0.;
437 	  fprintf(cfg->tfp, "  %16s %.2f bits\n", "SCORE:", sc);
438 	  fprintf(cfg->tfp, "  %16s %.2f bits\n", "STRUCTURE SCORE:", struct_sc);
439 	  fprintf(cfg->tfp, "HMM trace dump\n");
440 	  fprintf(cfg->tfp, "------------------\n");
441 	  if((status = p7_trace_Dump(cfg->tfp, p7tr, gm, sq2print->dsq)) != eslOK) ESL_FAIL(status, errbuf, "unable to dump p7 trace");
442 	}
443 	fprintf(cfg->tfp, "//\n");
444       }
445       if(tr != NULL)   { FreeParsetree(tr);      tr   = NULL; }
446       if(p7tr != NULL) { p7_trace_Destroy(p7tr); p7tr = NULL; }
447       esl_sq_Destroy(sq2print);
448       /* we destroyed any other seqs we created due to --u5p, --u3p, -e, above */
449     }
450   free(name);
451   if(bg != NULL) p7_bg_Destroy(bg);
452   if(gm != NULL) p7_profile_Destroy(gm);
453   if(fq != NULL) free(fq);
454   if(ghmm_eAA != NULL) {
455     for(i = 0; i < ghmm_nstates; i++) free(ghmm_eAA[i]);
456     free(ghmm_eAA);
457   }
458   if(ghmm_tAA != NULL) {
459     for(i = 0; i < ghmm_nstates; i++) free(ghmm_tAA[i]);
460     free(ghmm_tAA);
461   }
462   if(ghmm_sA != NULL) free(ghmm_sA);
463 
464   return eslOK;
465 
466  ERROR:
467   return status;
468 }
469 
470 /* emit_alignment
471  * Given a configured CM, generate and output a MSA.
472  */
473 static int
emit_alignment(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm,char * errbuf)474 emit_alignment(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf)
475 {
476   int status;
477   Parsetree_t **trA   = NULL;  /* generated parsetrees */
478   ESL_SQ      **sqA   = NULL;  /* generated sequences */
479   P7_TRACE    **p7trA = NULL;  /* generated traces (if !cfg->use_cm) */
480   char *name;
481   int namelen;
482   int i, x, L;
483   ESL_MSA *msa = NULL;      /* the MSA we're building */
484   int nseq = esl_opt_GetInteger(go, "-N");
485   int do_truncate;
486   int offset = esl_opt_GetInteger(go, "--idx");
487   int outfmt;
488 
489   /* Contract check, output alphabet must be identical to CM alphabet
490    * with sole exception that CM alphabet can be eslRNA with output alphabet eslDNA. */
491   if(cm->abc->type != cfg->abc_out->type) {
492     if(! (cm->abc->type == eslRNA && cfg->abc_out->type == eslDNA)){
493       ESL_FAIL(eslFAIL, errbuf, "CM alphabet type must match output alphabet type (unless CM=RNA and output=DNA).");
494     }
495   }
496 
497   /* Determine output alignment file format */
498   outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
499   if (outfmt == eslMSAFILE_UNKNOWN) {
500     ESL_FAIL(eslEINCOMPAT, errbuf, "%s is not a recognized output MSA file format\n\n", esl_opt_GetString(go, "--outformat"));
501   }
502 
503   do_truncate = (esl_opt_IsOn(go, "--a5p") && esl_opt_IsOn(go, "--a3p")) ? TRUE : FALSE;
504 
505   namelen = IntMaxDigits() + strlen("sample") + 1;  /* IntMaxDigits() returns number of digits in INT_MAX */
506   if(cm->name != NULL) namelen += strlen(cm->name) + 1;
507   ESL_ALLOC(name, sizeof(char) * namelen);
508 
509   ESL_ALLOC(sqA, sizeof(ESL_SQ *) * nseq);
510   if(cfg->use_cm) ESL_ALLOC(trA,   sizeof(Parsetree_t *) * nseq);
511   else            ESL_ALLOC(p7trA, sizeof(P7_TRACE *)    * nseq);
512   if(cfg->use_cm) {
513     for(i = 0; i < nseq; i++)
514       {
515 	if(cm->name != NULL) sprintf(name, "%s-sample%d", cm->name, i+offset);
516 	else                 sprintf(name, "%d-sample%d", cfg->ncm, i+offset);
517 	if((status = EmitParsetree(cm, errbuf, cfg->r, name, TRUE, &(trA[i]), &(sqA[i]), &L)) != eslOK) return status;
518 	sqA[i]->abc = cfg->abc_out;
519 	if(cfg->tfp != NULL) {
520 	  fprintf(cfg->tfp, "> %s\n", sqA[i]->name);
521 	  ParsetreeDump(cfg->tfp, trA[i], cm, sqA[i]->dsq);
522 	  fprintf(cfg->tfp, "//\n");
523 	}
524       }
525     if((status = Parsetrees2Alignment(cm, errbuf, cfg->abc_out, sqA, NULL, trA, NULL, nseq,
526 				      NULL, NULL, /* we're not printing to insert, EL info files */
527 				      TRUE,  /* we want all match columns */
528 				      FALSE, /* we don't want ONLY match columns */
529 				      &msa) != eslOK))
530       ESL_XFAIL(eslFAIL, errbuf, "Error generating alignment from parsetrees.");
531   }
532   else { /* cfg->use_cm is FALSE */
533     for (i = 0; i < nseq; i++) {
534       if ((sqA[i]   = esl_sq_CreateDigital(cm->fp7->abc)) == NULL) cm_Fail("failed to allocate seq");
535       if ((p7trA[i] = p7_trace_Create())                  == NULL) cm_Fail("failed to allocate trace");
536     }
537     for (i = 0; i < nseq; i++) {
538       if (p7_CoreEmit(cfg->r, cm->fp7, sqA[i], p7trA[i]) != eslOK) cm_Fail("Failed to emit sequence");
539 	if(cm->name != NULL) sprintf(name, "%s-sample%d", cm->name, i+offset);
540 	else                 sprintf(name, "%d-sample%d", cfg->ncm, i+offset);
541       if (esl_sq_SetName(sqA[i], name) != eslOK) cm_Fail("Failed to set sequence name\n");
542     }
543     p7_tracealign_Seqs(sqA, p7trA, nseq, cm->fp7->M, p7_ALL_CONSENSUS_COLS, cm->fp7, &msa);
544     /* create an SS_cons string for the msa */
545     if(msa->ss_cons == NULL) {
546       if(msa->rf == NULL) cm_Fail("HMM emitted MSA has no reference annotation");
547       ESL_ALLOC(msa->ss_cons, sizeof(char) * (msa->alen+1));
548       for(x = 0; x < msa->alen; x++) msa->ss_cons[x] = (msa->rf[x] == '.') ? '.' : ':';
549       msa->ss_cons[msa->alen] = '\0';
550     }
551   }
552 
553   if(cm->name != NULL) if((status = esl_strdup(cm->name, -1, &(msa->name))) != eslOK) goto ERROR;
554   if((status = esl_msa_FormatDesc(msa, "%s%s", "Synthetic sequence alignment generated by cmemit",
555 				  (cfg->use_cm ? "" : " [hmm-mode]"))) != eslOK) goto ERROR;
556 
557   /* Truncate the alignment if nec */
558   if(do_truncate) {
559     if((status = truncate_msa(go, cfg, msa, cm->abc, errbuf)) != eslOK) cm_Fail(errbuf);
560   }
561 
562   /* Output the alignment */
563   status = esl_msafile_Write(cfg->ofp, msa, outfmt);
564   if      (status == eslEMEM) ESL_XFAIL(status, errbuf, "Memory error when outputting alignment\n");
565   else if (status != eslOK)   ESL_XFAIL(status, errbuf, "Writing alignment file failed with error %d\n", status);
566 
567   if(sqA != NULL) {
568     for(i = 0; i < nseq; i++) {
569       if(sqA[i] != NULL) esl_sq_Destroy(sqA[i]);
570     }
571     free(sqA);
572   }
573   if(trA != NULL) {
574     for(i = 0; i < nseq; i++) {
575       if(trA[i] != NULL) FreeParsetree(trA[i]);
576     }
577     free(trA);
578   }
579   if(p7trA != NULL) {
580     for (i = 0; i < nseq; i++) {
581       if(p7trA[i] != NULL) p7_trace_Destroy(p7trA[i]);
582     }
583     free(p7trA);
584   }
585 
586   free(name);
587   esl_msa_Destroy(msa);
588   return eslOK;
589 
590  ERROR:
591 
592   if(sqA != NULL) {
593     for(i = 0; i < nseq; i++)
594       if(sqA[i] != NULL) esl_sq_Destroy(sqA[i]);
595     free(sqA);
596   }
597   if(trA != NULL) {
598     for(i = 0; i < nseq; i++)
599       if(trA[i] != NULL) FreeParsetree(trA[i]);
600     free(trA);
601   }
602   if(name != NULL) free(name);
603   if(msa  != NULL) esl_msa_Destroy(msa);
604   return status;
605 }
606 
607 /* emit_consensus
608  * Given a configured CM, generate and output the consensus sequence.
609  */
610 static int
emit_consensus(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm)611 emit_consensus(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm)
612 {
613   int status;
614   ESL_SQ *csq = NULL;
615   char *cseqname = NULL;
616 
617   if(cfg->use_cm) {
618     if((status = esl_strdup(cm->name, -1, &cseqname))           != eslOK) cm_Fail("failed to create sequence name");
619     if((status = esl_strcat(&cseqname, -1, "-cmconsensus", -1)) != eslOK) cm_Fail("failed to create sequence name");
620     if((csq = esl_sq_CreateFrom(cseqname, cm->cmcons->cseq, NULL, NULL, NULL)) == NULL) cm_Fail("failed to create consensus sequence");
621   }
622   else {
623     if ((csq = esl_sq_Create()) == NULL) cm_Fail("Failed to create sequence");
624     if (p7_emit_FancyConsensus(cm->fp7, 0.0, 0.5, csq)       != eslOK) cm_Fail("failed to create HMM consensus seq");
625     if (esl_sq_FormatName(csq, "%s-hmmconsensus", cm->name)  != eslOK) cm_Fail("failed to set sequence name");
626   }
627   if((esl_sqio_Write(cfg->ofp, csq, eslSQFILE_FASTA, FALSE)) != eslOK) cm_Fail("error writing consensus sequence.");
628 
629   esl_sq_Destroy(csq);
630   free(cseqname);
631 
632   return eslOK;
633 }
634 
635 /* truncate_msa
636  * Truncate a MSA outside begin..end consensus columns
637  * (non-gap RF chars) and return the alignment. Careful
638  * to remove any consensus structure outside begin..end.
639  */
640 static int
truncate_msa(const ESL_GETOPTS * go,const struct cfg_s * cfg,ESL_MSA * msa,const ESL_ALPHABET * abc,char * errbuf)641 truncate_msa(const ESL_GETOPTS *go, const struct cfg_s *cfg, ESL_MSA *msa, const ESL_ALPHABET *abc, char *errbuf)
642 {
643   int status;
644   int *useme = NULL;    /* 1..alen: keep this column? */
645   int *ct    = NULL;    /* 1..alen base pair partners array */
646   int apos = 0;
647   int cc   = 0;
648   int clen = 0;
649   int swap;
650   int spos, epos; /* start and end positions */
651   int set_spos = (esl_opt_IsUsed(go, "--a5p")) ? TRUE : FALSE;
652   int set_epos = (esl_opt_IsUsed(go, "--a3p")) ? TRUE : FALSE;
653   int rnd_spos = (esl_opt_IsUsed(go, "--a5p") && (esl_opt_GetInteger(go, "--a5p") == 0)) ? TRUE : FALSE;
654   int rnd_epos = (esl_opt_IsUsed(go, "--a3p") && (esl_opt_GetInteger(go, "--a3p") == 0)) ? TRUE : FALSE;
655 
656   ESL_ALLOC(useme, sizeof(int) * (msa->alen+1));
657   ESL_ALLOC(ct,    sizeof(int) * (msa->alen+1));
658 
659   /* determine clen */
660   for (apos = 0, cc = 0; apos < msa->alen; apos++) {
661     if (!esl_abc_CIsGap(abc, msa->rf[apos])) clen++;
662   }
663   /* make sure w/ --a3p <n>, that <n> <= clen */
664   if(set_spos && esl_opt_GetInteger(go, "--a3p") > clen) {
665     ESL_XFAIL(eslEINCOMPAT, errbuf, "with --a3p <n> option, <n> must be <= consensus length of CM (%d).\n", clen);
666   }
667   /* enforce that if both --a5p and --a3p are used, they both were set
668    * to 0 (this makes code a little simpler because we can worry about
669    * one less case).
670    */
671   if(set_spos && set_epos) {
672     if((   rnd_spos  && (! rnd_epos)) ||
673        ((! rnd_spos) &&    rnd_epos)) {
674       ESL_XFAIL(eslEINCOMPAT, errbuf, "with --a5p <n1> and --a3p <n2>, either <n1> and <n2> must be 0, or neither must be 0");
675     }
676   }
677   /* determine spos (start posn) */
678   if(set_spos) {
679     if(rnd_spos) spos = esl_rnd_Roll(cfg->r, clen) + 1;
680     else         spos = esl_opt_GetInteger(go, "--a5p");
681   }
682   else spos = 1;
683 
684   /* determine epos (end posn) */
685   if(set_epos) {
686     if(rnd_epos) epos = esl_rnd_Roll(cfg->r, clen) + 1;
687     else         epos = esl_opt_GetInteger(go, "--a3p");
688   }
689   else epos = clen;
690 
691   /* make sure spos <= epos */
692   if(spos > epos) {
693     if(rnd_spos && rnd_epos) { /* random spos and epos, so spos > epos is not an error, swap them */
694       swap = spos; spos = epos; epos = swap;
695     }
696     else { /* user error (assert just to be safe) */
697       assert(esl_opt_IsUsed(go, "--a5p") && esl_opt_IsUsed(go, "--a3p"));
698       assert(esl_opt_GetInteger(go, "--a5p") > esl_opt_GetInteger(go, "--a3p"));
699       ESL_XFAIL(eslEINCOMPAT, errbuf, "with --a5p <n1> and --a3p <n2>, <n1> must be <= <n2>");
700     }
701   }
702 
703   /* remove pknots in place (actually unnec for CM ss_cons) */
704   if((status = esl_wuss_nopseudo(msa->ss_cons, msa->ss_cons)) != eslOK) goto ERROR;
705 
706   /* get a ct array from the structure */
707   if((status = esl_wuss2ct(msa->ss_cons, msa->alen, ct)) != eslOK) goto ERROR;
708 
709   /* Truncate the alignment prior to consensus column spos and after
710    * consensus column epos.  */
711   for (apos = 0, cc = 0; apos < msa->alen; apos++)
712     {
713       /* Careful here, placement of cc++ increment is impt, we want all
714        * inserts between cc=spos-1 and cc=spos, and b/t cc=epos and
715        * cc=epos+1. Also be careful: ct[] is index 1..alen, and
716        * msa->ss_cons is 0..alen-1.
717        */
718       if(cc < (spos-1) || cc > epos) {
719 	useme[apos] = 0;
720 	if(ct[(apos+1)] != 0) ct[ct[(apos+1)]] = 0;
721 	ct[(apos+1)] = 0;
722       }
723       else
724 	useme[apos] = 1;
725       if (!esl_abc_CIsGap(abc, msa->rf[apos])) {
726 	cc++;
727 	if(cc == (epos+1)){
728 	  useme[apos] = 0;
729 	  /* we misassigned this guy, overwrite */
730 	  if(ct[(apos+1)] != 0) ct[ct[(apos+1)]] = 0;
731 	  ct[(apos+1)] = 0;
732 	}
733       }
734     }
735   /* construct the new structure based on the cleaned ct array */
736   if((status = esl_ct2wuss(ct, msa->alen, msa->ss_cons)) != eslOK) goto ERROR;
737 
738   /*printf("\n\nDEBUG PRINTING ORIG ALIGNMENT:\n");
739     WriteStockholm(fp, msa);
740     printf("\n\nDONE DEBUG PRINTING ORIG ALIGNMENT:\n");
741     for(apos=0; apos < msa->alen; apos++)
742     printf("useme[%d]: %d\n", apos, useme[apos]);
743   */
744 
745   if((status = esl_msa_ColumnSubset(msa, errbuf, useme)) != eslOK) return status;
746   free(useme);
747   free(ct);
748   return eslOK;
749 
750  ERROR:
751   if(useme != NULL) free(useme);
752   if(ct    != NULL) free(ct);
753   return status;
754 }
755 
756 
757