1 /* hmmemit: sample sequence(s) from a profile HMM.
2 */
3 #include "p7_config.h"
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <math.h>
8
9 #include "easel.h"
10 #include "esl_alphabet.h"
11 #include "esl_dmatrix.h"
12 #include "esl_getopts.h"
13 #include "esl_msa.h"
14 #include "esl_msafile.h"
15 #include "esl_random.h"
16 #include "esl_sq.h"
17 #include "esl_sqio.h"
18 #include "esl_vectorops.h"
19
20 #include "hmmer.h"
21
22 #define EMITOPTS "-a,-c,-C,-p"
23 #define MODEOPTS "--local,--unilocal,--glocal,--uniglocal"
24
25 static ESL_OPTIONS options[] = {
26 /* name type default env range toggles reqs incomp help docgroup*/
27 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 1 },
28 { "-o", eslARG_OUTFILE,FALSE, NULL, NULL, NULL, NULL, NULL, "send sequence output to file <f>, not stdout", 1 },
29 { "-N", eslARG_INT, "1", NULL, "n>0", NULL, NULL, "-c,-C", "number of seqs to sample", 1 },
30 /* options controlling what to emit */
31 { "-a", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, EMITOPTS, "emit alignment", 2 },
32 { "-c", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, EMITOPTS, "emit simple majority-rule consensus sequence", 2 },
33 { "-C", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, EMITOPTS, "emit fancier consensus sequence (req's --minl, --minu)", 2 },
34 { "-p", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, EMITOPTS, "sample sequences from profile, not core model", 2 },
35 /* options controlling emission from profiles with -p */
36 { "-L", eslARG_INT, "400", NULL, NULL, NULL, "-p", NULL, "set expected length from profile to <n>", 3 },
37 { "--local", eslARG_NONE,"default",NULL, NULL, MODEOPTS, "-p", NULL, "configure profile in multihit local mode", 3 },
38 { "--unilocal", eslARG_NONE, FALSE, NULL, NULL, MODEOPTS, "-p", NULL, "configure profile in unilocal mode", 3 },
39 { "--glocal", eslARG_NONE, FALSE, NULL, NULL, MODEOPTS, "-p", NULL, "configure profile in multihit glocal mode", 3 },
40 { "--uniglocal", eslARG_NONE, FALSE, NULL, NULL, MODEOPTS, "-p", NULL, "configure profile in unihit glocal mode", 3 },
41 /* options controlling fancy consensus emission with -C */
42 { "--minl", eslARG_REAL, "0.0", NULL, "0<=x<=1", NULL, "-C", NULL, "show consensus as 'any' (X/N) unless >= this fraction", 4 },
43 { "--minu", eslARG_REAL, "0.0", NULL, "0<=x<=1", NULL, "-C", NULL, "show consensus as upper case if >= this fraction", 4 },
44 /* other options */
45 { "--seed", eslARG_INT, "0", NULL, "n>=0", NULL, NULL, NULL, "set RNG seed to <n>", 5 },
46 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
47 };
48
49 static char usage[] = "[-options] <hmmfile (single)>";
50 static char banner[] = "sample sequence(s) from a profile HMM";
51
52 static void cmdline_failure(char *argv0, char *format, ...);
53 static void cmdline_help(char *argv0, ESL_GETOPTS *go);
54
55 static void emit_consensus(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm);
56 static void emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm);
57 static void emit_alignment(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm);
58 static void emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm);
59
60
61 int
main(int argc,char ** argv)62 main(int argc, char **argv)
63 {
64 ESL_GETOPTS *go = NULL; /* command line processing */
65 ESL_ALPHABET *abc = NULL; /* sequence alphabet */
66 ESL_RANDOMNESS *r = NULL; /* source of randomness */
67 char *hmmfile = NULL; /* file to read HMM(s) from */
68 P7_HMMFILE *hfp = NULL; /* open hmmfile */
69 P7_HMM *hmm = NULL; /* HMM to emit from */
70 FILE *ofp = NULL; /* output stream */
71 int outfmt = 0;
72 int nhmms = 0;
73 int status;
74 char errbuf[eslERRBUFSIZE];
75
76 go = esl_getopts_Create(options);
77 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
78 if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf);
79 if (esl_opt_GetBoolean(go, "-h")) cmdline_help (argv[0], go);
80 if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
81
82 if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) cmdline_failure(argv[0], "Failed to get <hmmfile> on cmdline: %s\n", go->errbuf);
83
84 if ( esl_opt_IsOn(go, "-o") ) {
85 if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s", esl_opt_GetString(go, "-o"));
86 } else ofp = stdout;
87
88 if (esl_opt_GetBoolean(go, "-a")) outfmt = eslMSAFILE_STOCKHOLM;
89 else outfmt = eslSQFILE_FASTA;
90
91 r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "--seed"));
92
93 status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
94 if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
95 else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
96 else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf);
97
98 while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
99 {
100 if (status == eslEFORMAT) esl_fatal("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf);
101 else if (status == eslEINCOMPAT) esl_fatal("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type));
102 else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s\n", hfp->fname);
103 nhmms++;
104
105 if (esl_opt_GetBoolean(go, "-c")) emit_consensus(go, ofp, outfmt, hmm);
106 else if (esl_opt_GetBoolean(go, "-C")) emit_fancycons(go, ofp, outfmt, hmm);
107 else if (esl_opt_GetBoolean(go, "-a")) emit_alignment(go, ofp, outfmt, r, hmm);
108 else emit_sequences(go, ofp, outfmt, r, hmm);
109
110
111
112 p7_hmm_Destroy(hmm);
113 }
114 if (nhmms == 0) esl_fatal("Empty HMM file %s? No HMM data found.\n");
115
116 if (esl_opt_IsOn(go, "-o")) { fclose(ofp); }
117 esl_randomness_Destroy(r);
118 esl_alphabet_Destroy(abc);
119 esl_getopts_Destroy(go);
120 p7_hmmfile_Close(hfp);
121 return eslOK;
122 }
123
124
125 static void
cmdline_failure(char * argv0,char * format,...)126 cmdline_failure(char *argv0, char *format, ...)
127 {
128 va_list argp;
129 printf("\nERROR: ");
130 va_start(argp, format);
131 vfprintf(stdout, format, argp);
132 va_end(argp);
133 esl_usage(stdout, argv0, usage);
134 printf("\nTo see more help on available options, do %s -h\n\n", argv0);
135 exit(1);
136 }
137
138 static void
cmdline_help(char * argv0,ESL_GETOPTS * go)139 cmdline_help(char *argv0, ESL_GETOPTS *go)
140 {
141 p7_banner (stdout, argv0, banner);
142 esl_usage (stdout, argv0, usage);
143 puts("\nCommon options are:");
144 esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
145 puts("\nOptions controlling what to emit:");
146 esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
147 puts("\nOptions controlling emission from profiles with -p:");
148 esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
149 puts("\nOptions controlling fancy consensus emission with -C:");
150 esl_opt_DisplayHelp(stdout, go, 4, 2, 80);
151 puts("\nOther options::");
152 esl_opt_DisplayHelp(stdout, go, 5, 2, 80);
153 exit(0);
154 }
155
156 static void
emit_consensus(ESL_GETOPTS * go,FILE * ofp,int outfmt,P7_HMM * hmm)157 emit_consensus(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm)
158 {
159 ESL_SQ *sq = NULL;
160
161 if ((sq = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate sequence");
162
163 if (p7_emit_SimpleConsensus(hmm, sq) != eslOK) esl_fatal("failed to create simple consensus seq");
164 if (esl_sq_FormatName(sq, "%s-consensus", hmm->name) != eslOK) esl_fatal("failed to set sequence name");
165 if (esl_sqio_Write(ofp, sq, outfmt, FALSE) != eslOK) esl_fatal("failed to write sequence");
166
167 esl_sq_Destroy(sq);
168 return;
169 }
170
171 static void
emit_fancycons(ESL_GETOPTS * go,FILE * ofp,int outfmt,P7_HMM * hmm)172 emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm)
173 {
174 ESL_SQ *sq = NULL;
175 float minl = esl_opt_GetReal(go, "--minl");
176 float minu = esl_opt_GetReal(go, "--minu");
177
178 if ((sq = esl_sq_Create()) == NULL) esl_fatal("failed to allocate sequence");
179
180 if (p7_emit_FancyConsensus(hmm, minl, minu, sq) != eslOK) esl_fatal("failed to create consensus seq");
181 if (esl_sq_FormatName(sq, "%s-consensus", hmm->name) != eslOK) esl_fatal("failed to set sequence name");
182 if (esl_sqio_Write(ofp, sq, outfmt, FALSE) != eslOK) esl_fatal("failed to write sequence");
183
184 esl_sq_Destroy(sq);
185 return;
186 }
187
188 static void
emit_alignment(ESL_GETOPTS * go,FILE * ofp,int outfmt,ESL_RANDOMNESS * r,P7_HMM * hmm)189 emit_alignment(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
190 {
191 ESL_MSA *msa = NULL;
192 ESL_SQ **sq = NULL;
193 P7_TRACE **tr = NULL;
194 int N = esl_opt_GetInteger(go, "-N");
195 int optflags = p7_ALL_CONSENSUS_COLS;
196 int i;
197
198 if ((tr = malloc(sizeof(P7_TRACE *) * N)) == NULL) esl_fatal("failed to allocate trace array");
199 if ((sq = malloc(sizeof(ESL_SQ *) * N)) == NULL) esl_fatal("failed to allocate seq array");
200 for (i = 0; i < N; i++)
201 {
202 if ((sq[i] = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate seq");
203 if ((tr[i] = p7_trace_Create()) == NULL) esl_fatal("failed to allocate trace");
204 }
205
206 for (i = 0; i < N; i++)
207 {
208 if (p7_CoreEmit(r, hmm, sq[i], tr[i]) != eslOK) esl_fatal("Failed to emit sequence");
209 if (esl_sq_FormatName(sq[i], "%s-sample%d", hmm->name, i+1) != eslOK) esl_fatal("Failed to set sequence name\n");
210 }
211
212 p7_tracealign_Seqs(sq, tr, N, hmm->M, optflags, hmm, &msa);
213 esl_msafile_Write(ofp, msa, outfmt);
214
215 for (i = 0; i < N; i++) { p7_trace_Destroy(tr[i]); } free(tr);
216 for (i = 0; i < N; i++) { esl_sq_Destroy(sq[i]); } free(sq);
217 esl_msa_Destroy(msa);
218 return;
219 }
220
221 static void
emit_sequences(ESL_GETOPTS * go,FILE * ofp,int outfmt,ESL_RANDOMNESS * r,P7_HMM * hmm)222 emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
223 {
224 ESL_SQ *sq = NULL;
225 P7_TRACE *tr = NULL;
226 P7_BG *bg = NULL;
227 P7_PROFILE *gm = NULL;
228 int do_profile = esl_opt_GetBoolean(go, "-p");
229 int N = esl_opt_GetInteger(go, "-N");
230 int L = esl_opt_GetInteger(go, "-L");
231 int mode = p7_LOCAL;
232 int nseq;
233 int status;
234
235 if (esl_opt_GetBoolean(go, "--local")) mode = p7_LOCAL;
236 else if (esl_opt_GetBoolean(go, "--unilocal")) mode = p7_UNILOCAL;
237 else if (esl_opt_GetBoolean(go, "--glocal")) mode = p7_GLOCAL;
238 else if (esl_opt_GetBoolean(go, "--uniglocal")) mode = p7_UNIGLOCAL;
239
240 if ((sq = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate sequence");
241 if ((tr = p7_trace_Create()) == NULL) esl_fatal("failed to allocate trace");
242 if ((bg = p7_bg_Create(hmm->abc)) == NULL) esl_fatal("failed to create null model");
243 if ((gm = p7_profile_Create(hmm->M, hmm->abc)) == NULL) esl_fatal("failed to create profile");
244
245 if (p7_ProfileConfig(hmm, bg, gm, L, mode) != eslOK) esl_fatal("failed to configure profile");
246 if (p7_bg_SetLength(bg, L) != eslOK) esl_fatal("failed to reconfig null model length");
247 if (p7_hmm_Validate (hmm, NULL, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!");
248 if (p7_profile_Validate(gm, NULL, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!");
249
250 for (nseq = 1; nseq <= N; nseq++)
251 {
252 if (do_profile) status = p7_ProfileEmit(r, hmm, gm, bg, sq, tr);
253 else status = p7_CoreEmit (r, hmm, sq, tr);
254 if (status) esl_fatal("Failed to emit sequence\n");
255
256 status = esl_sq_FormatName(sq, "%s-sample%d", hmm->name, nseq);
257 if (status) esl_fatal("Failed to set sequence name\n");
258
259 status = esl_sqio_Write(ofp, sq, outfmt, FALSE);
260 if (status != eslOK) esl_fatal("Failed to write sequence\n");
261
262 p7_trace_Reuse(tr);
263 esl_sq_Reuse(sq);
264 }
265
266 esl_sq_Destroy(sq);
267 p7_trace_Destroy(tr);
268 p7_bg_Destroy(bg);
269 p7_profile_Destroy(gm);
270 return;
271 }
272
273
274