1 /* cmemit: generate sequences from a CM.
2 *
3 * EPN, 09.01.06 Janelia Farm
4 * based on HMMER-2.3.2's hmmemit.c from SRE
5 * Easelfied: EPN, Tue Aug 14 07:01:44 2007
6 */
7
8 #include "esl_config.h"
9 #include "config.h"
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <time.h>
15
16 #include "easel.h"
17 #include <esl_getopts.h>
18 #include <esl_histogram.h>
19 #include <esl_random.h>
20 #include <esl_randomseq.h>
21 #include <esl_stats.h>
22 #include <esl_vectorops.h>
23 #include <esl_wuss.h>
24
25 #include "hmmer.h"
26
27 #include "infernal.h"
28
29 #define ALPHOPTS "--rna,--dna" /* Exclusive options for alphabet choice */
30 #define OUTOPTS "-u,-c,-a" /* Exclusive options for output */
31
32 static ESL_OPTIONS options[] = {
33 /* name type default env range toggles reqs incomp help docgroup*/
34 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 1 },
35 { "-o", eslARG_OUTFILE, FALSE, NULL, NULL, NULL, NULL, NULL, "send sequence output to file <f>, not stdout", 1 },
36 { "-N", eslARG_INT, "10", NULL, "n>0", NULL, NULL, NULL, "generate <n> sequences", 1 },
37 { "-u", eslARG_NONE, "default", NULL, NULL, OUTOPTS, NULL, NULL, "write generated sequences as unaligned FASTA [default]", 1 },
38 { "-a", eslARG_NONE, FALSE, NULL, NULL, OUTOPTS, NULL, NULL, "write generated sequences as an alignment", 1 },
39 { "-c", eslARG_NONE, FALSE, NULL, NULL, OUTOPTS, NULL, NULL, "generate a single \"consensus\" sequence only", 1, },
40 { "-e", eslARG_INT, NULL, NULL, "n>0", NULL, NULL, "-a,-c", "embed emitted sequences within larger random sequences of length <n>", 1 },
41 { "-l", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "local; emit from a locally configured model [default: global]", 1 },
42 /* options for truncating emitted sequences */
43 { "--u5p", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "-a,-c", "truncate unaligned sequences 5', choosing a random start posn", 2 },
44 { "--u3p", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "-a,-c", "truncate unaligned sequences 3', choosing a random end posn", 2 },
45 { "--a5p", eslARG_INT, NULL, NULL, "n>=0", NULL, "--a3p,-a", NULL, "truncate aln 5', start at match column <n> (use 0 for random posn)", 2 },
46 { "--a3p", eslARG_INT, NULL, NULL, "n>=0", NULL, "--a5p,-a", NULL, "truncate aln 3', end at match column <n> (use 0 for random posn)", 2 },
47 /* other options */
48 { "--seed", eslARG_INT, "0", NULL, "n>=0", NULL, NULL, NULL, "set RNG seed to <n> [default: one-time arbitrary seed]", 3 },
49 { "--iid", eslARG_NONE, FALSE, NULL, NULL, NULL, "-e", NULL, "with -e, generate larger sequences as 25% ACGU (iid) ", 3 },
50 { "--rna", eslARG_NONE, "default", NULL, NULL, ALPHOPTS, NULL, NULL, "output as RNA sequence data", 3 },
51 { "--dna", eslARG_NONE, FALSE, NULL, NULL, ALPHOPTS, NULL, NULL, "output as DNA sequence data", 3 },
52 { "--idx", eslARG_INT, "1", NULL, "n>0", NULL, NULL, NULL, "start sequence numbering at <n>", 3 },
53 { "--outformat", eslARG_STRING, "Stockholm", NULL, NULL, NULL, "-a", NULL, "w/-a output alignment in format <s>", 3 },
54 { "--tfile", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, "-c,-e,--u5p,--u3p", "dump parsetrees to file <f>", 3 },
55 { "--exp", eslARG_REAL, NULL, NULL, "x>0", NULL, NULL, NULL, "exponentiate CM probabilities by <x> before emitting", 3 },
56 { "--hmmonly", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, NULL, "emit from filter HMM, not from CM", 3 },
57 { "--nohmmonly", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "--hmmonly", "always emit from CM, even for models with 0 basepairs", 3 },
58 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
59 };
60
61 /* struct cfg_s : "Global" application configuration shared by all
62 * threads/processes
63 *
64 * This structure is passed to routines within main.c, as a means of
65 * semi-encapsulation of shared data amongst different parallel
66 * processes (threads or MPI processes). This strategy is used
67 * despite the fact that neither a MPI nor a threaded version of
68 * cmemit exists.
69 */
70 struct cfg_s {
71 char *cmfile; /* name of input CM file */
72 CM_FILE *cmfp; /* open input CM file stream */
73 ESL_ALPHABET *abc; /* digital alphabet for CM */
74 ESL_ALPHABET *abc_out; /* digital alphabet for writing */
75 FILE *ofp; /* output file (default is stdout) */
76 FILE *tfp; /* optional output file for parsetrees */
77 ESL_RANDOMNESS *r; /* source of randomness */
78 int ncm; /* number CM we're at in file */
79 int use_cm; /* TRUE to emit from CM for current model, FALSE to emit from filter HMM */
80 };
81
82 static char usage[] = "[-options] <cmfile>";
83 static char banner[] = "sample sequences from a covariance model";
84
85 static int init_cfg(const ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf);
86
87 static void master(const ESL_GETOPTS *go, struct cfg_s *cfg);
88
89 static int initialize_cm(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf);
90 static int emit_unaligned(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf);
91 static int emit_alignment(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf);
92 static int emit_consensus(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm);
93 static int truncate_msa(const ESL_GETOPTS *go, const struct cfg_s *cfg, ESL_MSA *msa, const ESL_ALPHABET *abc, char *errbuf);
94
95 int
main(int argc,char ** argv)96 main(int argc, char **argv)
97 {
98 ESL_GETOPTS *go = NULL; /* command line processing */
99 struct cfg_s cfg;
100
101 /* setup logsum lookups (could do this only if nec based on options, but this is safer) */
102 init_ilogsum();
103 FLogsumInit();
104
105 /***********************************************
106 * Parse command line
107 ***********************************************/
108
109 /* Process command line options.
110 */
111 go = esl_getopts_Create(options);
112 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
113 esl_opt_VerifyConfig(go) != eslOK) {
114 printf("Failed to parse command line: %s\n", go->errbuf);
115 esl_usage(stdout, argv[0], usage);
116 printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
117 exit(1);
118 }
119 if (esl_opt_GetBoolean(go, "-h")) {
120 cm_banner(stdout, argv[0], banner);
121 esl_usage(stdout, argv[0], usage);
122 puts("\nBasic options:");
123 esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup, 2 = indentation; 80=textwidth*/
124 puts("\nOptions for truncating sequences:");
125 esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
126 puts("\nOther options:");
127 esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
128 puts("\nAlignment output formats (-a) include: Stockholm, Pfam, AFA (aligned FASTA), A2M, Clustal, PHYLIP\n");
129 exit(0);
130 }
131 if (esl_opt_ArgNumber(go) != 1) {
132 puts("Incorrect number of command line arguments.");
133 esl_usage(stdout, argv[0], usage);
134 puts("\n where basic options are:");
135 esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
136 printf("\nTo see more help on other available options, do %s -h\n\n", argv[0]);
137 exit(1);
138 }
139 /* Initialize what we can in the config structure (without knowing the alphabet yet).
140 * We could assume RNA, but this HMMER3 based approach is more general.
141 */
142 cfg.cmfile = esl_opt_GetArg(go, 1);
143 cfg.cmfp = NULL; /* opened in init_cfg() */
144 cfg.ofp = NULL; /* opened in init_cfg() */
145 cfg.abc = NULL; /* created in init_cfg() */
146 cfg.abc_out = NULL; /* created in init_cfg() */
147 cfg.tfp = NULL; /* opened in init_cfg() */
148 cfg.r = NULL; /* created in init_cfg() */
149
150 /* do work */
151 master(go, &cfg);
152
153 /* Clean up the cfg.
154 */
155 if(esl_opt_IsOn(go, "-o")) {
156 fclose(cfg.ofp);
157 }
158 if (cfg.tfp != NULL) {
159 fclose(cfg.tfp);
160 }
161 if (cfg.abc != NULL) { esl_alphabet_Destroy(cfg.abc); cfg.abc = NULL; }
162 if (cfg.abc_out != NULL) esl_alphabet_Destroy(cfg.abc_out);
163 if (cfg.cmfp != NULL) cm_file_Close(cfg.cmfp);
164 if (cfg.r != NULL) esl_randomness_Destroy(cfg.r);
165
166 esl_getopts_Destroy(go);
167 return 0;
168 }
169
170 /* init_cfg()
171 * Already set:
172 * cfg->cmfile - command line arg 1
173 * Sets:
174 * cfg->cmfp - open CM file
175 * cfg->abc_out - digital alphabet for output
176 * cfg->tfp - optional output file for parsetrees
177 * cfg->r - source of randomness
178 */
179 static int
init_cfg(const ESL_GETOPTS * go,struct cfg_s * cfg,char * errbuf)180 init_cfg(const ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf)
181 {
182 int status;
183
184 /* open CM file for reading */
185 status = cm_file_Open(cfg->cmfile, NULL, FALSE, &(cfg->cmfp), errbuf);
186 if (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cfg->cmfile, errbuf);
187 else if (status == eslEFORMAT) cm_Fail("File format problem in trying to open CM file %s.\n%s\n", cfg->cmfile, errbuf);
188 else if (status != eslOK) cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n", status, cfg->cmfile, errbuf);
189
190 /* open sequence output file for writing */
191 if ( esl_opt_IsOn(go, "-o") ) {
192 if ((cfg->ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open output file %s", esl_opt_GetString(go, "-o"));
193 } else cfg->ofp = stdout;
194
195
196 /* create output alphabet */
197 if (esl_opt_GetBoolean(go, "--rna")) cfg->abc_out = esl_alphabet_Create(eslRNA);
198 else if (esl_opt_GetBoolean(go, "--dna")) cfg->abc_out = esl_alphabet_Create(eslDNA);
199
200 /* open parsetree output file if necessary */
201 if (esl_opt_GetString(go, "--tfile") != NULL) {
202 if ((cfg->tfp = fopen(esl_opt_GetString(go, "--tfile"), "w")) == NULL)
203 ESL_FAIL(eslFAIL, errbuf, "Failed to open --tfile output file %s\n", esl_opt_GetString(go, "--tfile"));
204 }
205
206 /* create RNG */
207 cfg->r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "--seed"));
208
209 if (cfg->abc_out == NULL) ESL_FAIL(eslEINVAL, errbuf, "Output alphabet creation failed.");
210 if (cfg->r == NULL) ESL_FAIL(eslEINVAL, errbuf, "Failed to create random number generator: probably out of memory");
211
212 return eslOK;
213 }
214
215 /* master()
216 * The serial version of cmemit. (There is no parallel version yet).
217 * For each CM, emit sequences/alignment and create output.
218 *
219 * We only return if successful. All errors are handled immediately and fatally with cm_Fail().
220 */
221 static void
master(const ESL_GETOPTS * go,struct cfg_s * cfg)222 master(const ESL_GETOPTS *go, struct cfg_s *cfg)
223 {
224 int status;
225 char errbuf[eslERRBUFSIZE];
226 CM_t *cm = NULL;
227
228 if ((status = init_cfg(go, cfg, errbuf)) != eslOK) cm_Fail(errbuf);
229
230 cfg->ncm = 0;
231
232 while ((status = cm_file_Read(cfg->cmfp, TRUE, &(cfg->abc), &cm)) == eslOK)
233 {
234 if (cm == NULL) cm_Fail("Failed to read CM from %s -- file corrupt?\n", cfg->cmfile);
235 cfg->ncm++;
236
237 /* determine if we'll emit from the CM or the filter HMM for this model */
238 if (esl_opt_GetBoolean(go, "--nohmmonly")) cfg->use_cm = TRUE;
239 else if (esl_opt_GetBoolean(go, "--hmmonly")) cfg->use_cm = FALSE;
240 else cfg->use_cm = (CMCountNodetype(cm, MATP_nd) > 0) ? TRUE : FALSE;
241
242 if((status = initialize_cm(go, cfg, cm, errbuf)) != eslOK) cm_Fail(errbuf);
243
244 /* Pick 1 of 4 exclusive output options. Output is handled within each function. */
245 if(esl_opt_GetBoolean(go, "-c")) {
246 emit_consensus(go, cfg, cm);
247 }
248 else if(esl_opt_GetBoolean(go, "-a")) {
249 if((status = emit_alignment(go, cfg, cm, errbuf)) != eslOK) cm_Fail(errbuf);
250 }
251 else {
252 if((status = emit_unaligned(go, cfg, cm, errbuf)) != eslOK) cm_Fail(errbuf);
253 }
254 FreeCM(cm);
255 }
256 if(status != eslEOF) cm_Fail(cfg->cmfp->errbuf);
257 return;
258 }
259
260 /* initialize_cm()
261 * Setup the CM based on the command-line options/defaults;
262 * only set flags and a few parameters. cm_Configure() configures
263 * the CM.
264 */
265 static int
initialize_cm(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm,char * errbuf)266 initialize_cm(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf)
267 {
268 int status;
269
270 /* Update cfg->cm->config_opts and cfg->cm->align_opts based on command line options */
271 if(esl_opt_GetBoolean(go, "-l")) {
272 cm->config_opts |= CM_CONFIG_LOCAL;
273 cm->config_opts |= CM_CONFIG_HMMLOCAL;
274 cm->config_opts |= CM_CONFIG_HMMEL;
275 }
276
277 /* exponentiate the CM of fHMM, if nec, do this before possibly configuring to local alignment in cm_Configure() */
278 if(esl_opt_IsOn(go, "--exp")) {
279 if(cfg->use_cm) cm_Exponentiate (cm, esl_opt_GetReal(go, "--exp"));
280 else cm_p7_Exponentiate(cm->fp7, esl_opt_GetReal(go, "--exp"));
281 }
282 if((status = cm_Configure(cm, errbuf, -1)) != eslOK) return status;
283
284 return eslOK;
285 }
286
287 /* emit_unaligned
288 * Given a configured CM, generate and output unaligned sequences.
289 */
290 static int
emit_unaligned(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm,char * errbuf)291 emit_unaligned(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf)
292 {
293 int status;
294 Parsetree_t *tr = NULL; /* emitted parsetree */
295 ESL_SQ *esq = NULL; /* sequence emitted from CM */
296 ESL_SQ *tsq = NULL; /* truncated esq, if nec, or ptr to esq */
297 ESL_SQ *gsq = NULL; /* generated sequence (iid) to embed tsq in, if nec, else ptr to tsq */
298 ESL_SQ *sq2print = NULL; /* ptr to sequence we'll output */
299 char *name; /* sequence name, for EmitParsetree() */
300 int namelen; /* length of name */
301 int i, L, embedL; /* sequence idx, length, length of sq to embed in */
302 int x; /* residue idx */
303 float sc, struct_sc; /* parsetree score, structure score */
304 int offset = esl_opt_GetInteger(go, "--idx"); /* seq index to start naming at */
305 int start, end, swap; /* for truncating sequences */
306
307 /* the HMM that generates background sequences */
308 int ghmm_nstates = 0; /* number of states in the HMM */
309 double *ghmm_sA = NULL; /* start probabilities [0..ghmm_nstates-1] */
310 double **ghmm_tAA = NULL; /* transition probabilities [0..nstates-1][0..nstates-1] */
311 double **ghmm_eAA = NULL; /* emission probabilities [0..nstates-1][0..abc->K-1] */
312 double *fq = NULL; /* double vec for esl_rsq_XIID, only used if --iid */
313
314 /* parameters necessary only if cfg->use_cm is FALSE, and we'll be
315 * emitting from the filter p7 HMM.
316 */
317 int hmm_mode = esl_opt_GetBoolean(go, "-l") ? p7_UNILOCAL : p7_UNIGLOCAL;
318 P7_TRACE *p7tr = NULL;
319 P7_PROFILE *gm = NULL;
320 P7_BG *bg = NULL;
321
322 /* Contract check, output alphabet must be identical to CM alphabet
323 * with sole exception that CM alphabet can be eslRNA with output
324 * alphabet eslDNA. And -e only works with 0 or 1 of --u5p,
325 * --u3p, not both.
326 */
327 if(cm->abc->type != cfg->abc_out->type) {
328 if(! (cm->abc->type == eslRNA && cfg->abc_out->type == eslDNA)) {
329 ESL_FAIL(eslFAIL, errbuf, "CM alphabet type must match output alphabet type (unless CM=RNA and output=DNA).");
330 }
331 }
332 if(esl_opt_IsOn(go, "-e") && esl_opt_IsOn(go, "--u5p") && esl_opt_IsOn(go, "--u3p")) {
333 ESL_FAIL(eslFAIL, errbuf, "-e works in combination with --u5p or --u3p but not both");
334 }
335
336 namelen = IntMaxDigits() + strlen("sample") + 1; /* IntMaxDigits() returns number of digits in INT_MAX */
337 if(cm->name != NULL) namelen += strlen(cm->name) + 1;
338 ESL_ALLOC(name, sizeof(char) * namelen);
339
340 if(! cfg->use_cm) {
341 if ((gm = p7_profile_Create(cm->fp7->M, cm->fp7->abc)) == NULL) cm_Fail("failed to create profile for filter HMM");
342 if ((bg = p7_bg_Create(cm->fp7->abc)) == NULL) cm_Fail("failed to create null model for filter HMM");
343 if (p7_ProfileConfig(cm->fp7, bg, gm, cm->fp7->max_length, hmm_mode) != eslOK) cm_Fail("failed to configure profile for filter HMM");
344 /* set N->N and C->C transitions to IMPOSSIBLE, we don't want them emitting */
345 gm->xsc[p7P_N][p7P_MOVE] = gm->xsc[p7P_C][p7P_MOVE] = 0.0f;
346 gm->xsc[p7P_N][p7P_LOOP] = gm->xsc[p7P_C][p7P_LOOP] = -eslINFINITY;
347 }
348
349 if(esl_opt_IsOn(go, "-e")) {
350 embedL = esl_opt_GetInteger(go, "-e");
351 /* if --iid we'll generate iid seqs, else we'll use our generative
352 * HMM for 'realistic' genome like background.
353 */
354 if(esl_opt_GetBoolean(go, "--iid")) {
355 ESL_ALLOC(fq, sizeof(double) * cfg->abc_out->K); /* iid, currently only option */
356 esl_vec_DSet(fq, cfg->abc_out->K, 1.0 / (double) cfg->abc_out->K);
357 }
358 else {
359 if((status = CreateGenomicHMM(cfg->abc_out, errbuf, &ghmm_sA, &ghmm_tAA, &ghmm_eAA, &ghmm_nstates)) != eslOK) ESL_FAIL(status, errbuf, "unable to create generative HMM\n%s", errbuf);
360 }
361 }
362
363 for(i = 0; i < esl_opt_GetInteger(go, "-N"); i++)
364 {
365 if(cm->name != NULL) sprintf(name, "%s-sample%d", cm->name, i+offset);
366 else sprintf(name, "%d-sample%d", cfg->ncm, i+offset);
367 if(cfg->use_cm) {
368 if((status = EmitParsetree(cm, errbuf, cfg->r, name, TRUE, &tr, &esq, &L)) != eslOK) return status; /* TRUE: make sq digital */
369 esq->abc = cfg->abc_out;
370 }
371 else {
372 esq = esl_sq_CreateDigital(cfg->abc_out);
373 if((p7tr = p7_trace_Create()) == NULL) cm_Fail("failed to allocate p7 trace");
374 /*if((status = p7_CoreEmit(cfg->r, cm->fp7, esq, p7tr)) != eslOK) cm_Fail("failed to emit sequence from cm->fp7");*/
375 if((status = p7_ProfileEmit(cfg->r, cm->fp7, gm, bg, esq, p7tr)) != eslOK) cm_Fail("failed to emit sequence from cm->fp7");
376 if((status = esl_sq_SetName(esq, name)) != eslOK) cm_Fail("failed to name emitted sequence from cm->fp7");
377 }
378 sq2print = esq; /* we may change what sq2print points to below */
379
380 /* truncate esq if nec */
381 if(esl_opt_IsOn(go, "--u5p") || esl_opt_IsOn(go, "--u3p")) {
382 start = esl_opt_IsOn(go, "--u5p") ? esl_rnd_Roll(cfg->r, sq2print->n) + 1 : 1;
383 end = esl_opt_IsOn(go, "--u3p") ? esl_rnd_Roll(cfg->r, sq2print->n) + 1 : sq2print->n;
384 if(start > end) { swap = start; start = end; end = swap; }
385 if((tsq = esl_sq_CreateDigitalFrom(sq2print->abc, sq2print->name, (sq2print->dsq+start-1), (end-start+1),
386 sq2print->desc, sq2print->acc, sq2print->ss)) == NULL) ESL_FAIL(status, errbuf, "out of memory");
387 tsq->dsq[0] = tsq->dsq[tsq->n+1] = eslDSQ_SENTINEL;
388 /* destroy sq2print (which points at esq), and point it at tsq */
389 esl_sq_Destroy(sq2print);
390 sq2print = tsq;
391 }
392
393 /* generate sequence to embed in, and embed in it if nec */
394 if(esl_opt_IsOn(go, "-e")) {
395 if(sq2print->n > embedL) ESL_FAIL(eslEINCOMPAT, errbuf, "<n>=%d from -eL <n> too small for emitted seq of length %" PRId64 ", increase <n> and rerun", embedL, sq2print->n);
396
397 gsq = esl_sq_CreateDigital(cfg->abc_out);
398 if((status = esl_sq_GrowTo(gsq, embedL)) != eslOK) ESL_FAIL(status, errbuf, "out of memory");
399 if(esl_opt_GetBoolean(go, "--iid")) {
400 esl_rsq_xIID(cfg->r, fq, cfg->abc_out->K, embedL, gsq->dsq);
401 }
402 else {
403 free(gsq->dsq); /* slightly wasteful */
404 if((status = SampleGenomicSequenceFromHMM(cfg->r, cfg->abc_out, errbuf, ghmm_sA, ghmm_tAA, ghmm_eAA, ghmm_nstates, embedL, &(gsq->dsq))) != eslOK) {
405 ESL_FAIL(status, errbuf, "failed to generate random background sequence to embed in");
406 }
407 }
408 gsq->n = embedL;
409
410 /* embed (contract enforced at least one of --u5p and --u3p is not on) */
411 if (esl_opt_IsOn(go, "--u5p")) { start = 1; }
412 else if(esl_opt_IsOn(go, "--u3p")) { start = embedL - sq2print->n + 1; }
413 else { start = esl_rnd_Roll(cfg->r, embedL - sq2print->n + 1) + 1; }
414 for(x = start; x < start + sq2print->n; x++) gsq->dsq[x] = sq2print->dsq[x - start + 1];
415 /* set name */
416 esl_sq_FormatName(gsq, "%s/%" PRId64 "-%" PRId64, sq2print->name, (int64_t) start, (int64_t) (start + sq2print->n - 1));
417 /* destroy sq2print (which points at esq or tsq), and point it at gsq */
418 esl_sq_Destroy(sq2print);
419 sq2print = gsq;
420 }
421
422 /* output sq2print */
423 if((esl_sqio_Write(cfg->ofp, sq2print, eslSQFILE_FASTA, FALSE)) != eslOK) ESL_FAIL(eslFAIL, errbuf, "Error writing unaligned sequences.");
424
425 /* output parsetree/trace if nec */
426 if(cfg->tfp != NULL) { /* can only be true if none of -e, --u5p and --u3p were used (thus sq2print == esq) */
427 fprintf(cfg->tfp, "> %s\n", esq->name);
428 if(cfg->use_cm) {
429 if((status = ParsetreeScore(cm, NULL, errbuf, tr, sq2print->dsq, FALSE, &sc, &struct_sc, NULL, NULL, NULL)) != eslOK) return status;
430 fprintf(cfg->tfp, " %16s %.2f bits\n", "SCORE:", sc);
431 fprintf(cfg->tfp, " %16s %.2f bits\n", "STRUCTURE SCORE:", struct_sc);
432 ParsetreeDump(cfg->tfp, tr, cm, sq2print->dsq);
433 }
434 else {
435 if((status = p7_trace_Score(p7tr, sq2print->dsq, gm, &sc)) != eslOK) ESL_FAIL(status, errbuf, "unable to score p7 trace");
436 struct_sc = 0.;
437 fprintf(cfg->tfp, " %16s %.2f bits\n", "SCORE:", sc);
438 fprintf(cfg->tfp, " %16s %.2f bits\n", "STRUCTURE SCORE:", struct_sc);
439 fprintf(cfg->tfp, "HMM trace dump\n");
440 fprintf(cfg->tfp, "------------------\n");
441 if((status = p7_trace_Dump(cfg->tfp, p7tr, gm, sq2print->dsq)) != eslOK) ESL_FAIL(status, errbuf, "unable to dump p7 trace");
442 }
443 fprintf(cfg->tfp, "//\n");
444 }
445 if(tr != NULL) { FreeParsetree(tr); tr = NULL; }
446 if(p7tr != NULL) { p7_trace_Destroy(p7tr); p7tr = NULL; }
447 esl_sq_Destroy(sq2print);
448 /* we destroyed any other seqs we created due to --u5p, --u3p, -e, above */
449 }
450 free(name);
451 if(bg != NULL) p7_bg_Destroy(bg);
452 if(gm != NULL) p7_profile_Destroy(gm);
453 if(fq != NULL) free(fq);
454 if(ghmm_eAA != NULL) {
455 for(i = 0; i < ghmm_nstates; i++) free(ghmm_eAA[i]);
456 free(ghmm_eAA);
457 }
458 if(ghmm_tAA != NULL) {
459 for(i = 0; i < ghmm_nstates; i++) free(ghmm_tAA[i]);
460 free(ghmm_tAA);
461 }
462 if(ghmm_sA != NULL) free(ghmm_sA);
463
464 return eslOK;
465
466 ERROR:
467 return status;
468 }
469
470 /* emit_alignment
471 * Given a configured CM, generate and output a MSA.
472 */
473 static int
emit_alignment(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm,char * errbuf)474 emit_alignment(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm, char *errbuf)
475 {
476 int status;
477 Parsetree_t **trA = NULL; /* generated parsetrees */
478 ESL_SQ **sqA = NULL; /* generated sequences */
479 P7_TRACE **p7trA = NULL; /* generated traces (if !cfg->use_cm) */
480 char *name;
481 int namelen;
482 int i, x, L;
483 ESL_MSA *msa = NULL; /* the MSA we're building */
484 int nseq = esl_opt_GetInteger(go, "-N");
485 int do_truncate;
486 int offset = esl_opt_GetInteger(go, "--idx");
487 int outfmt;
488
489 /* Contract check, output alphabet must be identical to CM alphabet
490 * with sole exception that CM alphabet can be eslRNA with output alphabet eslDNA. */
491 if(cm->abc->type != cfg->abc_out->type) {
492 if(! (cm->abc->type == eslRNA && cfg->abc_out->type == eslDNA)){
493 ESL_FAIL(eslFAIL, errbuf, "CM alphabet type must match output alphabet type (unless CM=RNA and output=DNA).");
494 }
495 }
496
497 /* Determine output alignment file format */
498 outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
499 if (outfmt == eslMSAFILE_UNKNOWN) {
500 ESL_FAIL(eslEINCOMPAT, errbuf, "%s is not a recognized output MSA file format\n\n", esl_opt_GetString(go, "--outformat"));
501 }
502
503 do_truncate = (esl_opt_IsOn(go, "--a5p") && esl_opt_IsOn(go, "--a3p")) ? TRUE : FALSE;
504
505 namelen = IntMaxDigits() + strlen("sample") + 1; /* IntMaxDigits() returns number of digits in INT_MAX */
506 if(cm->name != NULL) namelen += strlen(cm->name) + 1;
507 ESL_ALLOC(name, sizeof(char) * namelen);
508
509 ESL_ALLOC(sqA, sizeof(ESL_SQ *) * nseq);
510 if(cfg->use_cm) ESL_ALLOC(trA, sizeof(Parsetree_t *) * nseq);
511 else ESL_ALLOC(p7trA, sizeof(P7_TRACE *) * nseq);
512 if(cfg->use_cm) {
513 for(i = 0; i < nseq; i++)
514 {
515 if(cm->name != NULL) sprintf(name, "%s-sample%d", cm->name, i+offset);
516 else sprintf(name, "%d-sample%d", cfg->ncm, i+offset);
517 if((status = EmitParsetree(cm, errbuf, cfg->r, name, TRUE, &(trA[i]), &(sqA[i]), &L)) != eslOK) return status;
518 sqA[i]->abc = cfg->abc_out;
519 if(cfg->tfp != NULL) {
520 fprintf(cfg->tfp, "> %s\n", sqA[i]->name);
521 ParsetreeDump(cfg->tfp, trA[i], cm, sqA[i]->dsq);
522 fprintf(cfg->tfp, "//\n");
523 }
524 }
525 if((status = Parsetrees2Alignment(cm, errbuf, cfg->abc_out, sqA, NULL, trA, NULL, nseq,
526 NULL, NULL, /* we're not printing to insert, EL info files */
527 TRUE, /* we want all match columns */
528 FALSE, /* we don't want ONLY match columns */
529 &msa) != eslOK))
530 ESL_XFAIL(eslFAIL, errbuf, "Error generating alignment from parsetrees.");
531 }
532 else { /* cfg->use_cm is FALSE */
533 for (i = 0; i < nseq; i++) {
534 if ((sqA[i] = esl_sq_CreateDigital(cm->fp7->abc)) == NULL) cm_Fail("failed to allocate seq");
535 if ((p7trA[i] = p7_trace_Create()) == NULL) cm_Fail("failed to allocate trace");
536 }
537 for (i = 0; i < nseq; i++) {
538 if (p7_CoreEmit(cfg->r, cm->fp7, sqA[i], p7trA[i]) != eslOK) cm_Fail("Failed to emit sequence");
539 if(cm->name != NULL) sprintf(name, "%s-sample%d", cm->name, i+offset);
540 else sprintf(name, "%d-sample%d", cfg->ncm, i+offset);
541 if (esl_sq_SetName(sqA[i], name) != eslOK) cm_Fail("Failed to set sequence name\n");
542 }
543 p7_tracealign_Seqs(sqA, p7trA, nseq, cm->fp7->M, p7_ALL_CONSENSUS_COLS, cm->fp7, &msa);
544 /* create an SS_cons string for the msa */
545 if(msa->ss_cons == NULL) {
546 if(msa->rf == NULL) cm_Fail("HMM emitted MSA has no reference annotation");
547 ESL_ALLOC(msa->ss_cons, sizeof(char) * (msa->alen+1));
548 for(x = 0; x < msa->alen; x++) msa->ss_cons[x] = (msa->rf[x] == '.') ? '.' : ':';
549 msa->ss_cons[msa->alen] = '\0';
550 }
551 }
552
553 if(cm->name != NULL) if((status = esl_strdup(cm->name, -1, &(msa->name))) != eslOK) goto ERROR;
554 if((status = esl_msa_FormatDesc(msa, "%s%s", "Synthetic sequence alignment generated by cmemit",
555 (cfg->use_cm ? "" : " [hmm-mode]"))) != eslOK) goto ERROR;
556
557 /* Truncate the alignment if nec */
558 if(do_truncate) {
559 if((status = truncate_msa(go, cfg, msa, cm->abc, errbuf)) != eslOK) cm_Fail(errbuf);
560 }
561
562 /* Output the alignment */
563 status = esl_msafile_Write(cfg->ofp, msa, outfmt);
564 if (status == eslEMEM) ESL_XFAIL(status, errbuf, "Memory error when outputting alignment\n");
565 else if (status != eslOK) ESL_XFAIL(status, errbuf, "Writing alignment file failed with error %d\n", status);
566
567 if(sqA != NULL) {
568 for(i = 0; i < nseq; i++) {
569 if(sqA[i] != NULL) esl_sq_Destroy(sqA[i]);
570 }
571 free(sqA);
572 }
573 if(trA != NULL) {
574 for(i = 0; i < nseq; i++) {
575 if(trA[i] != NULL) FreeParsetree(trA[i]);
576 }
577 free(trA);
578 }
579 if(p7trA != NULL) {
580 for (i = 0; i < nseq; i++) {
581 if(p7trA[i] != NULL) p7_trace_Destroy(p7trA[i]);
582 }
583 free(p7trA);
584 }
585
586 free(name);
587 esl_msa_Destroy(msa);
588 return eslOK;
589
590 ERROR:
591
592 if(sqA != NULL) {
593 for(i = 0; i < nseq; i++)
594 if(sqA[i] != NULL) esl_sq_Destroy(sqA[i]);
595 free(sqA);
596 }
597 if(trA != NULL) {
598 for(i = 0; i < nseq; i++)
599 if(trA[i] != NULL) FreeParsetree(trA[i]);
600 free(trA);
601 }
602 if(name != NULL) free(name);
603 if(msa != NULL) esl_msa_Destroy(msa);
604 return status;
605 }
606
607 /* emit_consensus
608 * Given a configured CM, generate and output the consensus sequence.
609 */
610 static int
emit_consensus(const ESL_GETOPTS * go,const struct cfg_s * cfg,CM_t * cm)611 emit_consensus(const ESL_GETOPTS *go, const struct cfg_s *cfg, CM_t *cm)
612 {
613 int status;
614 ESL_SQ *csq = NULL;
615 char *cseqname = NULL;
616
617 if(cfg->use_cm) {
618 if((status = esl_strdup(cm->name, -1, &cseqname)) != eslOK) cm_Fail("failed to create sequence name");
619 if((status = esl_strcat(&cseqname, -1, "-cmconsensus", -1)) != eslOK) cm_Fail("failed to create sequence name");
620 if((csq = esl_sq_CreateFrom(cseqname, cm->cmcons->cseq, NULL, NULL, NULL)) == NULL) cm_Fail("failed to create consensus sequence");
621 }
622 else {
623 if ((csq = esl_sq_Create()) == NULL) cm_Fail("Failed to create sequence");
624 if (p7_emit_FancyConsensus(cm->fp7, 0.0, 0.5, csq) != eslOK) cm_Fail("failed to create HMM consensus seq");
625 if (esl_sq_FormatName(csq, "%s-hmmconsensus", cm->name) != eslOK) cm_Fail("failed to set sequence name");
626 }
627 if((esl_sqio_Write(cfg->ofp, csq, eslSQFILE_FASTA, FALSE)) != eslOK) cm_Fail("error writing consensus sequence.");
628
629 esl_sq_Destroy(csq);
630 free(cseqname);
631
632 return eslOK;
633 }
634
635 /* truncate_msa
636 * Truncate a MSA outside begin..end consensus columns
637 * (non-gap RF chars) and return the alignment. Careful
638 * to remove any consensus structure outside begin..end.
639 */
640 static int
truncate_msa(const ESL_GETOPTS * go,const struct cfg_s * cfg,ESL_MSA * msa,const ESL_ALPHABET * abc,char * errbuf)641 truncate_msa(const ESL_GETOPTS *go, const struct cfg_s *cfg, ESL_MSA *msa, const ESL_ALPHABET *abc, char *errbuf)
642 {
643 int status;
644 int *useme = NULL; /* 1..alen: keep this column? */
645 int *ct = NULL; /* 1..alen base pair partners array */
646 int apos = 0;
647 int cc = 0;
648 int clen = 0;
649 int swap;
650 int spos, epos; /* start and end positions */
651 int set_spos = (esl_opt_IsUsed(go, "--a5p")) ? TRUE : FALSE;
652 int set_epos = (esl_opt_IsUsed(go, "--a3p")) ? TRUE : FALSE;
653 int rnd_spos = (esl_opt_IsUsed(go, "--a5p") && (esl_opt_GetInteger(go, "--a5p") == 0)) ? TRUE : FALSE;
654 int rnd_epos = (esl_opt_IsUsed(go, "--a3p") && (esl_opt_GetInteger(go, "--a3p") == 0)) ? TRUE : FALSE;
655
656 ESL_ALLOC(useme, sizeof(int) * (msa->alen+1));
657 ESL_ALLOC(ct, sizeof(int) * (msa->alen+1));
658
659 /* determine clen */
660 for (apos = 0, cc = 0; apos < msa->alen; apos++) {
661 if (!esl_abc_CIsGap(abc, msa->rf[apos])) clen++;
662 }
663 /* make sure w/ --a3p <n>, that <n> <= clen */
664 if(set_spos && esl_opt_GetInteger(go, "--a3p") > clen) {
665 ESL_XFAIL(eslEINCOMPAT, errbuf, "with --a3p <n> option, <n> must be <= consensus length of CM (%d).\n", clen);
666 }
667 /* enforce that if both --a5p and --a3p are used, they both were set
668 * to 0 (this makes code a little simpler because we can worry about
669 * one less case).
670 */
671 if(set_spos && set_epos) {
672 if(( rnd_spos && (! rnd_epos)) ||
673 ((! rnd_spos) && rnd_epos)) {
674 ESL_XFAIL(eslEINCOMPAT, errbuf, "with --a5p <n1> and --a3p <n2>, either <n1> and <n2> must be 0, or neither must be 0");
675 }
676 }
677 /* determine spos (start posn) */
678 if(set_spos) {
679 if(rnd_spos) spos = esl_rnd_Roll(cfg->r, clen) + 1;
680 else spos = esl_opt_GetInteger(go, "--a5p");
681 }
682 else spos = 1;
683
684 /* determine epos (end posn) */
685 if(set_epos) {
686 if(rnd_epos) epos = esl_rnd_Roll(cfg->r, clen) + 1;
687 else epos = esl_opt_GetInteger(go, "--a3p");
688 }
689 else epos = clen;
690
691 /* make sure spos <= epos */
692 if(spos > epos) {
693 if(rnd_spos && rnd_epos) { /* random spos and epos, so spos > epos is not an error, swap them */
694 swap = spos; spos = epos; epos = swap;
695 }
696 else { /* user error (assert just to be safe) */
697 assert(esl_opt_IsUsed(go, "--a5p") && esl_opt_IsUsed(go, "--a3p"));
698 assert(esl_opt_GetInteger(go, "--a5p") > esl_opt_GetInteger(go, "--a3p"));
699 ESL_XFAIL(eslEINCOMPAT, errbuf, "with --a5p <n1> and --a3p <n2>, <n1> must be <= <n2>");
700 }
701 }
702
703 /* remove pknots in place (actually unnec for CM ss_cons) */
704 if((status = esl_wuss_nopseudo(msa->ss_cons, msa->ss_cons)) != eslOK) goto ERROR;
705
706 /* get a ct array from the structure */
707 if((status = esl_wuss2ct(msa->ss_cons, msa->alen, ct)) != eslOK) goto ERROR;
708
709 /* Truncate the alignment prior to consensus column spos and after
710 * consensus column epos. */
711 for (apos = 0, cc = 0; apos < msa->alen; apos++)
712 {
713 /* Careful here, placement of cc++ increment is impt, we want all
714 * inserts between cc=spos-1 and cc=spos, and b/t cc=epos and
715 * cc=epos+1. Also be careful: ct[] is index 1..alen, and
716 * msa->ss_cons is 0..alen-1.
717 */
718 if(cc < (spos-1) || cc > epos) {
719 useme[apos] = 0;
720 if(ct[(apos+1)] != 0) ct[ct[(apos+1)]] = 0;
721 ct[(apos+1)] = 0;
722 }
723 else
724 useme[apos] = 1;
725 if (!esl_abc_CIsGap(abc, msa->rf[apos])) {
726 cc++;
727 if(cc == (epos+1)){
728 useme[apos] = 0;
729 /* we misassigned this guy, overwrite */
730 if(ct[(apos+1)] != 0) ct[ct[(apos+1)]] = 0;
731 ct[(apos+1)] = 0;
732 }
733 }
734 }
735 /* construct the new structure based on the cleaned ct array */
736 if((status = esl_ct2wuss(ct, msa->alen, msa->ss_cons)) != eslOK) goto ERROR;
737
738 /*printf("\n\nDEBUG PRINTING ORIG ALIGNMENT:\n");
739 WriteStockholm(fp, msa);
740 printf("\n\nDONE DEBUG PRINTING ORIG ALIGNMENT:\n");
741 for(apos=0; apos < msa->alen; apos++)
742 printf("useme[%d]: %d\n", apos, useme[apos]);
743 */
744
745 if((status = esl_msa_ColumnSubset(msa, errbuf, useme)) != eslOK) return status;
746 free(useme);
747 free(ct);
748 return eslOK;
749
750 ERROR:
751 if(useme != NULL) free(useme);
752 if(ct != NULL) free(ct);
753 return status;
754 }
755
756
757