1 /************************************************************
2 * HMMER - Biological sequence analysis with profile-HMMs
3 * Copyright (C) 1992-1998, Washington University School of Medicine
4 *
5 * This source code is distributed under the terms of the
6 * GNU General Public License. See the files COPYING and
7 * GNULICENSE for details.
8 *
9 ************************************************************/
10
11 /* hmmemit.c
12 * SRE, Sun Mar 8 14:11:24 1998 [St. Louis]
13 *
14 * main() for generating sequences from an HMM
15 * RCS $Id: hmmemit.c,v 1.1.1.1 2001/06/18 13:59:49 birney Exp $
16 */
17
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <time.h>
21
22 #include "structs.h" /* data structures, macros, #define's */
23 #include "config.h" /* compile-time configuration constants */
24 #include "funcs.h" /* function declarations */
25 #include "globals.h" /* alphabet global variables */
26 #include "squid.h" /* general sequence analysis library */
27
28 #ifdef MEMDEBUG
29 #include "dbmalloc.h"
30 #endif
31
32 static char banner[] = "hmmemit - generate sequences from a profile HMM";
33
34 static char usage[] = "\
35 Usage: hmmemit [-options] <hmm file>\n\
36 Available options are:\n\
37 -a : write generated sequences as an alignment, not FASTA\n\
38 -h : help; print brief help on version and usage\n\
39 -n <n> : emit <n> sequences (default 10)\n\
40 -o <f> : save sequences in file <f>\n\
41 -q : quiet - suppress verbose banner\n\
42 ";
43
44 static char experts[] = "\
45 --seed <n> : set random number seed to <n>\n\
46 ";
47
48 static struct opt_s OPTIONS[] = {
49 { "-a", TRUE, sqdARG_NONE },
50 { "-h", TRUE, sqdARG_NONE },
51 { "-n", TRUE, sqdARG_INT},
52 { "-o", TRUE, sqdARG_STRING},
53 { "-q", TRUE, sqdARG_NONE},
54 { "--seed", FALSE, sqdARG_INT},
55 };
56 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
57
58 int
main(int argc,char ** argv)59 main(int argc, char **argv)
60 {
61 char *hmmfile; /* file to read HMMs from */
62 FILE *fp; /* output file handle */
63 HMMFILE *hmmfp; /* opened hmmfile for reading */
64 struct plan7_s *hmm; /* HMM to generate from */
65 int L; /* length of a sequence */
66 int i; /* counter over sequences */
67
68 char *ofile; /* output sequence file */
69 int nseq; /* number of seqs to sample */
70 int seed; /* random number generator seed */
71 int be_quiet; /* TRUE to silence header/footer */
72 int do_alignment;/* TRUE to output in aligned format */
73
74 char *optname; /* name of option found by Getopt() */
75 char *optarg; /* argument found by Getopt() */
76 int optind; /* index in argv[] */
77
78 #ifdef MEMDEBUG
79 unsigned long histid1, histid2, orig_size, current_size;
80 orig_size = malloc_inuse(&histid1);
81 fprintf(stderr, "[... memory debugging is ON ...]\n");
82 #endif
83
84 /***********************************************
85 * Parse command line
86 ***********************************************/
87
88 nseq = 10;
89 seed = time ((time_t *) NULL);
90 be_quiet = FALSE;
91 do_alignment = FALSE;
92 ofile = NULL;
93
94 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
95 &optind, &optname, &optarg)) {
96 if (strcmp(optname, "-a") == 0) do_alignment = TRUE;
97 else if (strcmp(optname, "-n") == 0) nseq = atoi(optarg);
98 else if (strcmp(optname, "-o") == 0) ofile = optarg;
99 else if (strcmp(optname, "-q") == 0) be_quiet = TRUE;
100 else if (strcmp(optname, "--seed") == 0) seed = atoi(optarg);
101 else if (strcmp(optname, "-h") == 0)
102 {
103 Banner(stdout, banner);
104 puts(usage);
105 puts(experts);
106 exit(0);
107 }
108 }
109 if (argc - optind != 1)
110 Die("Incorrect number of arguments.\n%s\n", usage);
111
112 hmmfile = argv[optind++];
113
114 sre_srandom(seed);
115
116 /***********************************************
117 * Open HMM file (might be in HMMERDB or current directory).
118 * Read a single HMM from it.
119 ***********************************************/
120
121 if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
122 Die("Failed to open HMM file %s\n%s", hmmfile, usage);
123 if (!HMMFileRead(hmmfp, &hmm))
124 Die("Failed to read any HMMs from %s\n", hmmfile);
125 HMMFileClose(hmmfp);
126 if (hmm == NULL)
127 Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
128
129 /* Configure the HMM to shut off N,J,C emission: so we
130 * do a simple single pass through the model.
131 */
132 Plan7NakedConfig(hmm);
133 Plan7Renormalize(hmm);
134
135 /***********************************************
136 * Open the output file, or stdout
137 ***********************************************/
138
139 if (ofile == NULL) fp = stdout;
140 else {
141 if ((fp = fopen(ofile, "w")) == NULL)
142 Die("Failed to open output file %s for writing", ofile);
143 }
144
145 /***********************************************
146 * Show the options banner
147 ***********************************************/
148
149 if (! be_quiet)
150 {
151 Banner(stdout, banner);
152 printf("HMM file: %s\n", hmmfile);
153 printf("Random seed: %d\n", seed);
154 printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
155 }
156
157 /***********************************************
158 * Do the work.
159 * If we're generating an alignment, we have to collect
160 * all our traces, then output. If we're generating unaligned
161 * sequences, we can emit one at a time.
162 ***********************************************/
163
164 if (do_alignment)
165 {
166 struct p7trace_s **tr; /* traces for aligned sequences */
167 char **dsq; /* digitized sequences */
168 SQINFO *sqinfo; /* info about sequences (name/desc) */
169 char **aseq; /* sequence alignment */
170 AINFO ainfo; /* optional alignment info */
171 float *wgt;
172
173 dsq = MallocOrDie(sizeof(char *) * nseq);
174 tr = MallocOrDie(sizeof(struct p7trace_s *) * nseq);
175 sqinfo = MallocOrDie(sizeof(SQINFO) * nseq);
176 wgt = MallocOrDie(sizeof(float) * nseq);
177 FSet(wgt, nseq, 1.0);
178
179 for (i = 0; i < nseq; i++)
180 {
181 EmitSequence(hmm, &(dsq[i]), &L, &(tr[i]));
182 sprintf(sqinfo[i].name, "seq%d", i+1);
183 sqinfo[i].len = L;
184 sqinfo[i].flags = SQINFO_NAME | SQINFO_LEN;
185 }
186
187 P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, FALSE,
188 &aseq, &ainfo);
189
190 /* Output the alignment */
191 WriteSELEX(fp, aseq, &ainfo, 50);
192 if (ofile != NULL && !be_quiet) printf("Alignment saved in file %s\n", ofile);
193
194 /* Free memory
195 */
196 for (i = 0; i < nseq; i++)
197 {
198 P7FreeTrace(tr[i]);
199 free(dsq[i]);
200 }
201 FreeAlignment(aseq, &ainfo);
202 free(sqinfo);
203 free(dsq);
204 free(wgt);
205 free(tr);
206 }
207 else /* unaligned sequence output */
208 {
209 struct p7trace_s *tr; /* generated trace */
210 char *dsq; /* digitized sequence */
211 char *seq; /* alphabetic sequence */
212 SQINFO sqinfo; /* info about sequence (name/len) */
213
214 for (i = 0; i < nseq; i++)
215 {
216 EmitSequence(hmm, &dsq, &L, &tr);
217 sprintf(sqinfo.name, "seq%d", i+1);
218 sqinfo.len = L;
219 sqinfo.flags = SQINFO_NAME | SQINFO_LEN;
220
221 seq = DedigitizeSequence(dsq, L);
222
223 WriteSeq(fp, kPearson, seq, &sqinfo);
224
225 P7FreeTrace(tr);
226 free(dsq);
227 free(seq);
228 }
229 }
230
231 if (ofile != NULL) fclose(fp);
232 FreePlan7(hmm);
233 SqdClean();
234
235 #ifdef MEMDEBUG
236 current_size = malloc_inuse(&histid2);
237 if (current_size != orig_size) malloc_list(2, histid1, histid2);
238 else fprintf(stderr, "[No memory leaks.]\n");
239 #endif
240 return 0;
241 }
242
243