1 /* hmmemit: sample sequence(s) from a profile HMM.
2  */
3 #include "p7_config.h"
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <math.h>
8 
9 #include "easel.h"
10 #include "esl_alphabet.h"
11 #include "esl_dmatrix.h"
12 #include "esl_getopts.h"
13 #include "esl_msa.h"
14 #include "esl_msafile.h"
15 #include "esl_random.h"
16 #include "esl_sq.h"
17 #include "esl_sqio.h"
18 #include "esl_vectorops.h"
19 
20 #include "hmmer.h"
21 
22 #define EMITOPTS "-a,-c,-C,-p"
23 #define MODEOPTS "--local,--unilocal,--glocal,--uniglocal"
24 
25 static ESL_OPTIONS options[] = {
26   /* name           type      default  env  range     toggles      reqs   incomp  help   docgroup*/
27   { "-h",          eslARG_NONE,   FALSE, NULL, NULL,      NULL,      NULL,    NULL,  "show brief help on version and usage",                   1 },
28   { "-o",          eslARG_OUTFILE,FALSE, NULL, NULL,      NULL,      NULL,    NULL,  "send sequence output to file <f>, not stdout",           1 },
29   { "-N",          eslARG_INT,      "1", NULL, "n>0",     NULL,      NULL,  "-c,-C", "number of seqs to sample",                               1 },
30 /* options controlling what to emit */
31   { "-a",          eslARG_NONE,   FALSE, NULL, NULL,      NULL,      NULL, EMITOPTS, "emit alignment",                                         2 },
32   { "-c",          eslARG_NONE,   FALSE, NULL, NULL,      NULL,      NULL, EMITOPTS, "emit simple majority-rule consensus sequence",           2 },
33   { "-C",          eslARG_NONE,   FALSE, NULL, NULL,      NULL,      NULL, EMITOPTS, "emit fancier consensus sequence (req's --minl, --minu)", 2 },
34   { "-p",          eslARG_NONE,   FALSE, NULL, NULL,      NULL,      NULL, EMITOPTS, "sample sequences from profile, not core model",          2 },
35 /* options controlling emission from profiles with -p  */
36   { "-L",          eslARG_INT,    "400", NULL, NULL,      NULL,      "-p",    NULL, "set expected length from profile to <n>",               3 },
37   { "--local",     eslARG_NONE,"default",NULL, NULL,    MODEOPTS,    "-p",    NULL, "configure profile in multihit local mode",              3 },
38   { "--unilocal",  eslARG_NONE,  FALSE,  NULL, NULL,    MODEOPTS,    "-p",    NULL, "configure profile in unilocal mode",                    3 },
39   { "--glocal",    eslARG_NONE,  FALSE,  NULL, NULL,    MODEOPTS,    "-p",    NULL, "configure profile in multihit glocal mode",             3 },
40   { "--uniglocal", eslARG_NONE,  FALSE,  NULL, NULL,    MODEOPTS,    "-p",    NULL, "configure profile in unihit glocal mode",               3 },
41 /* options controlling fancy consensus emission with -C */
42   { "--minl",      eslARG_REAL,  "0.0",  NULL, "0<=x<=1", NULL,      "-C",    NULL, "show consensus as 'any' (X/N) unless >= this fraction", 4 },
43   { "--minu",      eslARG_REAL,  "0.0",  NULL, "0<=x<=1", NULL,      "-C",    NULL, "show consensus as upper case if >= this fraction",      4 },
44 /* other options */
45   { "--seed",      eslARG_INT,      "0", NULL, "n>=0",    NULL,      NULL,    NULL, "set RNG seed to <n>",                                    5 },
46   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
47 };
48 
49 static char usage[]  = "[-options] <hmmfile (single)>";
50 static char banner[] = "sample sequence(s) from a profile HMM";
51 
52 static void cmdline_failure(char *argv0, char *format, ...);
53 static void cmdline_help(char *argv0, ESL_GETOPTS *go);
54 
55 static void emit_consensus(ESL_GETOPTS *go, FILE *ofp, int outfmt,                    P7_HMM *hmm);
56 static void emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt,                    P7_HMM *hmm);
57 static void emit_alignment(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm);
58 static void emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm);
59 
60 
61 int
main(int argc,char ** argv)62 main(int argc, char **argv)
63 {
64   ESL_GETOPTS     *go         = NULL;             /* command line processing                 */
65   ESL_ALPHABET    *abc        = NULL;             /* sequence alphabet                       */
66   ESL_RANDOMNESS  *r          = NULL;             /* source of randomness                    */
67   char            *hmmfile    = NULL;             /* file to read HMM(s) from                */
68   P7_HMMFILE      *hfp        = NULL;             /* open hmmfile                            */
69   P7_HMM          *hmm        = NULL;             /* HMM to emit from                        */
70   FILE            *ofp        = NULL;	          /* output stream                           */
71   int              outfmt     = 0;
72   int              nhmms      = 0;
73   int              status;
74   char             errbuf[eslERRBUFSIZE];
75 
76   go = esl_getopts_Create(options);
77   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
78   if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n",       go->errbuf);
79   if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help   (argv[0], go);
80   if (esl_opt_ArgNumber(go) != 1)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
81 
82   if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL)       cmdline_failure(argv[0], "Failed to get <hmmfile> on cmdline: %s\n", go->errbuf);
83 
84   if ( esl_opt_IsOn(go, "-o") ) {
85     if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s", esl_opt_GetString(go, "-o"));
86   } else ofp = stdout;
87 
88   if (esl_opt_GetBoolean(go, "-a"))  outfmt = eslMSAFILE_STOCKHOLM;
89   else                               outfmt = eslSQFILE_FASTA;
90 
91   r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "--seed"));
92 
93   status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
94   if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
95   else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
96   else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",                       status, hmmfile, errbuf);
97 
98   while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
99     {
100       if      (status == eslEFORMAT)    esl_fatal("Bad file format in HMM file %s:\n%s\n",          hfp->fname, hfp->errbuf);
101       else if (status == eslEINCOMPAT)  esl_fatal("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type));
102       else if (status != eslOK)         esl_fatal("Unexpected error in reading HMMs from %s\n",     hfp->fname);
103       nhmms++;
104 
105       if      (esl_opt_GetBoolean(go, "-c"))  emit_consensus(go, ofp, outfmt,    hmm);
106       else if (esl_opt_GetBoolean(go, "-C"))  emit_fancycons(go, ofp, outfmt,    hmm);
107       else if (esl_opt_GetBoolean(go, "-a"))  emit_alignment(go, ofp, outfmt, r, hmm);
108       else                                    emit_sequences(go, ofp, outfmt, r, hmm);
109 
110 
111 
112       p7_hmm_Destroy(hmm);
113     }
114   if (nhmms == 0) esl_fatal("Empty HMM file %s? No HMM data found.\n");
115 
116   if (esl_opt_IsOn(go, "-o")) { fclose(ofp); }
117   esl_randomness_Destroy(r);
118   esl_alphabet_Destroy(abc);
119   esl_getopts_Destroy(go);
120   p7_hmmfile_Close(hfp);
121   return eslOK;
122 }
123 
124 
125 static void
cmdline_failure(char * argv0,char * format,...)126 cmdline_failure(char *argv0, char *format, ...)
127 {
128   va_list argp;
129   printf("\nERROR: ");
130   va_start(argp, format);
131   vfprintf(stdout, format, argp);
132   va_end(argp);
133   esl_usage(stdout, argv0, usage);
134   printf("\nTo see more help on available options, do %s -h\n\n", argv0);
135   exit(1);
136 }
137 
138 static void
cmdline_help(char * argv0,ESL_GETOPTS * go)139 cmdline_help(char *argv0, ESL_GETOPTS *go)
140 {
141   p7_banner (stdout, argv0, banner);
142   esl_usage (stdout, argv0, usage);
143   puts("\nCommon options are:");
144   esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
145   puts("\nOptions controlling what to emit:");
146   esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
147   puts("\nOptions controlling emission from profiles with -p:");
148   esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
149   puts("\nOptions controlling fancy consensus emission with -C:");
150   esl_opt_DisplayHelp(stdout, go, 4, 2, 80);
151   puts("\nOther options::");
152   esl_opt_DisplayHelp(stdout, go, 5, 2, 80);
153   exit(0);
154 }
155 
156 static void
emit_consensus(ESL_GETOPTS * go,FILE * ofp,int outfmt,P7_HMM * hmm)157 emit_consensus(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm)
158 {
159   ESL_SQ     *sq           = NULL;
160 
161   if ((sq = esl_sq_CreateDigital(hmm->abc))             == NULL) esl_fatal("failed to allocate sequence");
162 
163   if (p7_emit_SimpleConsensus(hmm, sq)                 != eslOK) esl_fatal("failed to create simple consensus seq");
164   if (esl_sq_FormatName(sq, "%s-consensus", hmm->name) != eslOK) esl_fatal("failed to set sequence name");
165   if (esl_sqio_Write(ofp, sq, outfmt, FALSE)           != eslOK) esl_fatal("failed to write sequence");
166 
167   esl_sq_Destroy(sq);
168   return;
169 }
170 
171 static void
emit_fancycons(ESL_GETOPTS * go,FILE * ofp,int outfmt,P7_HMM * hmm)172 emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm)
173 {
174   ESL_SQ  *sq   = NULL;
175   float    minl = esl_opt_GetReal(go, "--minl");
176   float    minu = esl_opt_GetReal(go, "--minu");
177 
178   if ((sq = esl_sq_Create()) == NULL) esl_fatal("failed to allocate sequence");
179 
180   if (p7_emit_FancyConsensus(hmm, minl, minu, sq)      != eslOK) esl_fatal("failed to create consensus seq");
181   if (esl_sq_FormatName(sq, "%s-consensus", hmm->name) != eslOK) esl_fatal("failed to set sequence name");
182   if (esl_sqio_Write(ofp, sq, outfmt, FALSE)           != eslOK) esl_fatal("failed to write sequence");
183 
184   esl_sq_Destroy(sq);
185   return;
186 }
187 
188 static void
emit_alignment(ESL_GETOPTS * go,FILE * ofp,int outfmt,ESL_RANDOMNESS * r,P7_HMM * hmm)189 emit_alignment(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
190 {
191   ESL_MSA   *msa       = NULL;
192   ESL_SQ   **sq        = NULL;
193   P7_TRACE **tr        = NULL;
194   int         N        = esl_opt_GetInteger(go, "-N");
195   int         optflags = p7_ALL_CONSENSUS_COLS;
196   int         i;
197 
198   if ((tr = malloc(sizeof(P7_TRACE *) * N)) == NULL) esl_fatal("failed to allocate trace array");
199   if ((sq = malloc(sizeof(ESL_SQ   *) * N)) == NULL) esl_fatal("failed to allocate seq array");
200   for (i = 0; i < N; i++)
201     {
202       if ((sq[i] = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate seq");
203       if ((tr[i] = p7_trace_Create())              == NULL) esl_fatal("failed to allocate trace");
204     }
205 
206   for (i = 0; i < N; i++)
207     {
208       if (p7_CoreEmit(r, hmm, sq[i], tr[i])                       != eslOK) esl_fatal("Failed to emit sequence");
209       if (esl_sq_FormatName(sq[i], "%s-sample%d", hmm->name, i+1) != eslOK) esl_fatal("Failed to set sequence name\n");
210     }
211 
212   p7_tracealign_Seqs(sq, tr, N, hmm->M, optflags, hmm, &msa);
213   esl_msafile_Write(ofp, msa, outfmt);
214 
215   for (i = 0; i < N; i++) { p7_trace_Destroy(tr[i]); } free(tr);
216   for (i = 0; i < N; i++) { esl_sq_Destroy(sq[i]);   } free(sq);
217   esl_msa_Destroy(msa);
218   return;
219 }
220 
221 static void
emit_sequences(ESL_GETOPTS * go,FILE * ofp,int outfmt,ESL_RANDOMNESS * r,P7_HMM * hmm)222 emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
223 {
224   ESL_SQ     *sq           = NULL;
225   P7_TRACE   *tr           = NULL;
226   P7_BG      *bg           = NULL;
227   P7_PROFILE *gm           = NULL;
228   int         do_profile   = esl_opt_GetBoolean(go, "-p");
229   int         N            = esl_opt_GetInteger(go, "-N");
230   int         L            = esl_opt_GetInteger(go, "-L");
231   int         mode         = p7_LOCAL;
232   int         nseq;
233   int         status;
234 
235   if      (esl_opt_GetBoolean(go, "--local"))     mode = p7_LOCAL;
236   else if (esl_opt_GetBoolean(go, "--unilocal"))  mode = p7_UNILOCAL;
237   else if (esl_opt_GetBoolean(go, "--glocal"))    mode = p7_GLOCAL;
238   else if (esl_opt_GetBoolean(go, "--uniglocal")) mode = p7_UNIGLOCAL;
239 
240   if ((sq = esl_sq_CreateDigital(hmm->abc))      == NULL)  esl_fatal("failed to allocate sequence");
241   if ((tr = p7_trace_Create())                   == NULL)  esl_fatal("failed to allocate trace");
242   if ((bg = p7_bg_Create(hmm->abc))              == NULL)  esl_fatal("failed to create null model");
243   if ((gm = p7_profile_Create(hmm->M, hmm->abc)) == NULL)  esl_fatal("failed to create profile");
244 
245   if (p7_ProfileConfig(hmm, bg, gm, L, mode)     != eslOK) esl_fatal("failed to configure profile");
246   if (p7_bg_SetLength(bg, L)                     != eslOK) esl_fatal("failed to reconfig null model length");
247   if (p7_hmm_Validate    (hmm, NULL, 0.0001)     != eslOK) esl_fatal("whoops, HMM is bad!");
248   if (p7_profile_Validate(gm,  NULL, 0.0001)     != eslOK) esl_fatal("whoops, profile is bad!");
249 
250   for (nseq = 1; nseq <= N; nseq++)
251     {
252       if (do_profile) status = p7_ProfileEmit(r, hmm, gm, bg, sq, tr);
253       else            status = p7_CoreEmit   (r, hmm, sq, tr);
254       if (status)  esl_fatal("Failed to emit sequence\n");
255 
256       status = esl_sq_FormatName(sq, "%s-sample%d", hmm->name, nseq);
257       if (status) esl_fatal("Failed to set sequence name\n");
258 
259       status = esl_sqio_Write(ofp, sq, outfmt, FALSE);
260       if (status != eslOK) esl_fatal("Failed to write sequence\n");
261 
262       p7_trace_Reuse(tr);
263       esl_sq_Reuse(sq);
264     }
265 
266   esl_sq_Destroy(sq);
267   p7_trace_Destroy(tr);
268   p7_bg_Destroy(bg);
269   p7_profile_Destroy(gm);
270   return;
271 }
272 
273 
274