1 /* Input/output of CMs.
2 *
3 * EPN, Fri Jun 17 09:37:56 2011
4 * Based on p7_hmmfile.c copied from HMMER svn revision 3570.
5 *
6 * Contents:
7 * 1. The CM_FILE object for reading CMs
8 * 2. Writing CM files.
9 * 3. API for reading CM files in various formats.
10 * 4. API for reading/writing p7 HMMs/profile filters.
11 * 5. Private, specific CM file format parsers.
12 * 6. Other private functions involved in i/o.
13 * 7. Legacy v1.0 ascii file format output.
14 * 8. Benchmark driver.
15 * 9. Unit tests.
16 * 10. Test driver.
17 * 11. Example.
18 */
19 #include "esl_config.h"
20 #include "p7_config.h"
21 #include "config.h"
22
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <string.h>
26
27 #ifdef HMMER_THREADS
28 #include <pthread.h>
29 #endif
30
31 #include "easel.h"
32 #include "esl_alphabet.h"
33 #include "esl_ssi.h" /* this gives us esl_byteswap */
34 #include "esl_vectorops.h" /* gives us esl_vec_FCopy() */
35
36 #include "hmmer.h"
37
38 #include "infernal.h"
39
40 /* Magic numbers identifying binary formats.
41 * Do not change the old magics! Necessary for backwards compatibility.
42 */
43 #if 0 /* temporarily remove all the magic; write backwards compat stuff later */
44 static unsigned int v01magic = 0xe3edb0b1; /* v0.1 binary: "cm01" + 0x80808080 */
45 #endif
46
47 static uint32_t v1a_magic = 0xe3edb0b2; /* v1.1 binary: "cm02" + 0x80808080 */
48 static uint32_t v1a_fmagic = 0xb1e1e6f3; /* 1/a binary MSV/SSV file: "1afs" = 0x 31 61 66 73 + 0x80808080 */
49 /* Note: 's' at end of 1afs is arbitrary. It is consistent with H3's
50 * trailing 's' iforSSE binary files, but in Infernal this is used
51 * whether we're building for SSE or VMX (since this code is neither
52 * SSE nor VMX-specific). I suppose we could say it stands for SSV,
53 * with 'f' for filter... Also, http://www.ascii-code.com/ and
54 * hmmer/src/impl_sse/io.c might be useful to determine how each code
55 * derives from its text string.
56 */
57
58 static int read_asc_1p1_cm(CM_FILE *hfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm);
59 static int read_bin_1p1_cm(CM_FILE *hfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm);
60 static int read_asc_1p0_cm(CM_FILE *hfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm);
61
62 static int write_bin_string(FILE *fp, char *s);
63 static int read_bin_string (FILE *fp, char **ret_s);
64
65 static char *prob2ascii(float p, float null);
66 static float ascii2prob(char *s, float null);
67 static int is_integer(char *s);
68 static int is_real(char *s);
69
70 /* from p7_hmmfile.c, for reading p7 filters */
71 static uint32_t v3a_magic = 0xe8ededb6; /* 3/a binary: "hmm6" + 0x80808080 */
72 static uint32_t v3b_magic = 0xe8ededb7; /* 3/b binary: "hmm7" + 0x80808080 */
73 static uint32_t v3c_magic = 0xe8ededb8; /* 3/c binary: "hmm8" + 0x80808080 */
74 static uint32_t v3d_magic = 0xe8ededb9; /* 3/d binary: "hmm9" + 0x80808080 */
75 static uint32_t v3e_magic = 0xe8ededb0; /* 3/e binary: "hmm0" + 0x80808080 */
76 static uint32_t v3f_magic = 0xe8ededba; /* 3/f binary: "hmma" + 0x80808080 */
77
78 static int read_asc30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm);
79 static int read_bin30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm);
80
81 /*****************************************************************
82 * 1. The CM_FILE object for reading CMs.
83 *****************************************************************/
84 static int open_engine(char *filename, char *env, CM_FILE **ret_cmfp, int do_ascii_only, int allow_1p0, char *errbuf);
85
86 /* Function: cm_file_Open()
87 * Synopsis: Open an CM file <filename>.
88 * Incept: EPN, Fri Jun 17 09:42:13 2011
89 * SRE, Tue Dec 21 10:44:38 2010 [Zaragoza] (p7_hmmfile_OpenE())
90 *
91 * Purpose: Open an CM file <filename>, and prepare to read the first
92 * CM from it.
93 *
94 * The format should be INFERNAL1/a or more recent. If
95 * <allow_1p0>, we also allow the file to be in Infernal
96 * v1.0 to v1.0.2 ascii format.
97 *
98 * We look for <filename> relative to the current working
99 * directory. Additionally, if we don't find it in the cwd
100 * and <env> is non-NULL, we will look for <filename>
101 * relative to one or more directories in a colon-delimited
102 * list obtained from the environment variable <env>. For
103 * example, if we had <setenv INFERNALDB
104 * /misc/db/Rfam:/misc/db/Pfam> in the environment, a
105 * CM application might pass "INFERNALDB" as <env>.
106 *
107 * As a special case, if <filename> is "-", then CMs will
108 * be read from <stdin>. In this case, <env> has no effect.
109 *
110 * As another special case, if <filename> ends in a <.gz>
111 * suffix, the file is assumed to be compressed by GNU
112 * <gzip>, and it is opened for reading from a pipe with
113 * <gunzip -dc>. This feature is only available on
114 * POSIX-compliant systems that have a <popen()> call, and
115 * <HAVE_POPEN> is defined by the configure script at
116 * compile time.
117 *
118 * Args: filename - CM file to open; or "-" for <stdin>
119 * env - list of paths to look for <cmfile> in, in
120 * addition to current working dir; or <NULL>
121 * allow_1p0 - TRUE to allow 1.0 formatted files
122 * ret_cmfp - RETURN: opened <P7_CMFILE>.
123 * errbuf - error message buffer: <NULL>, or a ptr
124 * to <eslERRBUFSIZE> chars of allocated space.
125 *
126 * Returns: <eslOK> on success, and the open <CM_FILE> is returned
127 * in <*ret_cmfp>.
128 *
129 * <eslENOTFOUND> if <filename> can't be opened for
130 * reading, even after the list of directories in <env> (if
131 * any) is checked.
132 *
133 * <eslEFORMAT> if <filename> is not in a recognized Infernal
134 * CM file format.
135 *
136 * On either type of error, if a non-NULL <errbuf> was provided,
137 * a useful user error message is left in it.
138 *
139 * Throws: <eslEMEM> on allocation failure.
140 */
141 int
cm_file_Open(char * filename,char * env,int allow_1p0,CM_FILE ** ret_cmfp,char * errbuf)142 cm_file_Open(char *filename, char *env, int allow_1p0, CM_FILE **ret_cmfp, char *errbuf)
143 {
144 return open_engine(filename, env, ret_cmfp, FALSE, allow_1p0, errbuf);
145 }
146
147 /* Function: cm_file_OpenNoDB()
148 * Synopsis: Open only a CM flatfile, even if pressed db exists.
149 * Incept: SRE, Tue Dec 21 10:52:35 2010 [Zaragoza]
150 *
151 * Purpose: Same as <cm_file_OpenE()> except that if a pressed
152 * database exists for <filename>, it is ignored. Only
153 * <filename> itself is opened.
154 *
155 * hmmpress needs this call. Otherwise, it opens a press'ed
156 * database that it may be about to overwrite.
157 */
158 int
cm_file_OpenNoDB(char * filename,char * env,int allow_1p0,CM_FILE ** ret_cmfp,char * errbuf)159 cm_file_OpenNoDB(char *filename, char *env, int allow_1p0, CM_FILE **ret_cmfp, char *errbuf)
160 {
161 return open_engine(filename, env, ret_cmfp, TRUE, allow_1p0, errbuf);
162 }
163
164
165 /* Function: cm_file_OpenBuffer()
166 * Incept: EPN, Fri Jun 17 09:47:34 2011
167 * MSF, Thu Aug 19 2010 [Janelia] (p7_hmmfile_OpenBuffer())
168 *
169 * Purpose: Perparse a buffer containing an ascii CM for parsing.
170 *
171 * As another special case, if <filename> ends in a <.gz>
172 * suffix, the file is assumed to be compressed by GNU
173 * <gzip>, and it is opened for reading from a pipe with
174 * <gunzip -dc>. This feature is only available on
175 * POSIX-compliant systems that have a <popen()> call, and
176 * <HAVE_POPEN> is defined by the configure script at
177 * compile time.
178 *
179 * Args: filename - CM file to open; or "-" for <stdin>
180 * env - list of paths to look for <cmfile> in, in
181 * addition to current working dir; or <NULL>
182 * ret_cmfp - RETURN: opened <CM_FILE>.
183 *
184 * Returns: <eslOK> on success, and the open <CM_FILE> is returned
185 * in <*ret_cmfp>.
186 *
187 * <eslENOTFOUND> if <filename> can't be opened for
188 * reading, even after the list of directories in <env> (if
189 * any) is checked.
190 *
191 * <eslEFORMAT> if <filename> is not in a recognized HMMER
192 * CM file format.
193 *
194 * Throws: <eslEMEM> on allocation failure.
195 */
196 int
cm_file_OpenBuffer(char * buffer,int size,int allow_1p0,CM_FILE ** ret_cmfp)197 cm_file_OpenBuffer(char *buffer, int size, int allow_1p0, CM_FILE **ret_cmfp)
198 {
199 CM_FILE *cmfp = NULL;
200 int status;
201 char *tok;
202 int toklen;
203
204 ESL_ALLOC(cmfp, sizeof(CM_FILE));
205 cmfp->f = NULL;
206 cmfp->fname = NULL;
207 cmfp->do_gzip = FALSE;
208 cmfp->do_stdin = FALSE;
209 cmfp->newly_opened = TRUE; /* well, it will be, real soon now */
210 cmfp->is_pressed = FALSE;
211 #ifdef HMMER_THREADS
212 cmfp->syncRead = FALSE;
213 #endif
214 cmfp->parser = NULL;
215 cmfp->efp = NULL;
216 cmfp->ffp = NULL;
217 cmfp->pfp = NULL;
218 cmfp->ssi = NULL;
219 cmfp->errbuf[0] = '\0';
220
221 if ((cmfp->efp = esl_fileparser_CreateMapped(buffer, size)) == NULL) { status = eslEMEM; goto ERROR; }
222 if ((status = esl_fileparser_SetCommentChar(cmfp->efp, '#')) != eslOK) goto ERROR;
223 if ((status = esl_fileparser_GetToken(cmfp->efp, &tok, &toklen)) != eslOK) goto ERROR;
224
225 if ( strcmp("INFERNAL1/a", tok) == 0) { cmfp->format = CM_FILE_1a; cmfp->parser = read_asc_1p1_cm; }
226 else if (allow_1p0 && strcmp("INFERNAL-1", tok) == 0) { cmfp->format = CM_FILE_1; cmfp->parser = read_asc_1p0_cm; }
227
228 if (cmfp->parser == NULL) { status = eslEFORMAT; goto ERROR; }
229
230 *ret_cmfp = cmfp;
231 return eslOK;
232
233 ERROR:
234 if (cmfp != NULL) cm_file_Close(cmfp);
235 *ret_cmfp = NULL;
236 if (status == eslEMEM) return status;
237 else if (status == eslENOTFOUND) return status;
238 else return eslEFORMAT;
239 }
240
241
242 /* open_engine()
243 *
244 * Implements the file opening functions:
245 * <cm_file_Open()>, <cm_file_OpenNoDB()>,
246 * See their comments above.
247 *
248 * Only returns three types of errors:
249 * eslENOTFOUND - file (the CM file) or program (gzip, for .gz files) not found
250 * eslEFORMAT - bad CM file format (or format of associated file)
251 * eslEMEM - allocation failure somewhere
252 * <errbuf>, if non-NULL, will contain a useful error message.
253 *
254 */
255 static int
open_engine(char * filename,char * env,CM_FILE ** ret_cmfp,int do_ascii_only,int allow_1p0,char * errbuf)256 open_engine(char *filename, char *env, CM_FILE **ret_cmfp, int do_ascii_only, int allow_1p0, char *errbuf)
257 {
258 CM_FILE *cmfp = NULL;
259 char *envfile = NULL; /* full path to filename after using environment */
260 char *dbfile = NULL; /* constructed name of an index or binary db file */
261 char *cmd = NULL; /* constructed gzip -dc pipe command */
262 int status;
263 int n = strlen(filename);
264 union { char c[4]; uint32_t n; } magic;
265 char *tok;
266 int toklen;
267
268 ESL_ALLOC(cmfp, sizeof(CM_FILE));
269 cmfp->f = NULL;
270 cmfp->fname = NULL;
271 cmfp->do_gzip = FALSE;
272 cmfp->do_stdin = FALSE;
273 cmfp->newly_opened = TRUE; /* well, it will be, real soon now */
274 cmfp->is_pressed = FALSE;
275 cmfp->is_binary = FALSE;
276 #ifdef HMMER_THREADS
277 cmfp->syncRead = FALSE;
278 #endif
279 cmfp->parser = NULL;
280 cmfp->efp = NULL;
281 cmfp->hfp = NULL;
282 cmfp->ffp = NULL;
283 cmfp->pfp = NULL;
284 cmfp->ssi = NULL;
285 cmfp->errbuf[0] = '\0';
286
287 /* 1. There's two special reading modes that have limited indexing
288 * and optimization capability: reading from standard input, and
289 * reading a gzip'ped file. Once we've set one of these up and set
290 * either the <do_stdin> or <do_gzip> flag, we won't try to open
291 * any associated indexes or binary database files.
292 */
293 if (strcmp(filename, "-") == 0) /* "-" means read from stdin */
294 {
295 cmfp->f = stdin;
296 cmfp->do_stdin = TRUE;
297 if ((status = esl_strdup("[STDIN]", -1, &(cmfp->fname))) != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup failed; shouldn't happen");
298 }
299 #ifdef HAVE_POPEN
300 else if (n > 3 && strcmp(filename+n-3, ".gz") == 0) /* a <*.gz> filename means read via gunzip pipe */
301 {
302 if (! esl_FileExists(filename)) ESL_XFAIL(eslENOTFOUND, errbuf, ".gz file %s not found or not readable", filename);
303 if ((status = esl_sprintf(&cmd, "gzip -dc %s", filename)) != eslOK) ESL_XFAIL(status, errbuf, "when setting up .gz pipe: esl_sprintf() failed");
304 if ((cmfp->f = popen(cmd, "r")) == NULL) ESL_XFAIL(eslENOTFOUND, errbuf, "gzip -dc %s failed; gzip not installed or not in PATH?", filename);
305 if ((status = esl_strdup(filename, n, &(cmfp->fname))) != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
306 cmfp->do_gzip = TRUE;
307 free(cmd); cmd = NULL;
308 }
309 #endif /*HAVE_POPEN: gzip mode */
310
311
312 /* 2. If <cmfp->f> is still NULL, then we're in the usual situation
313 * of looking for a file on disk. It may either be in the cwd, or
314 * in one of the directories listed in the <env> string. Find it,
315 * open it to <cmfp->f>, and set <cmfp->filename>. The
316 * <cmfp->filename> string will be used later to construct the
317 * names of expected index and binary database files.
318 */
319 if (cmfp->f == NULL)
320 {
321 if ((cmfp->f = fopen(filename, "r")) != NULL)
322 {
323 if ((status = esl_strdup(filename, n, &(cmfp->fname))) != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
324 }
325 else if (esl_FileEnvOpen(filename, env, &(cmfp->f), &envfile) == eslOK)
326 {
327 n = strlen(envfile);
328 if ((status = esl_strdup(envfile, n, &(cmfp->fname))) != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
329 free(envfile); envfile = NULL;
330 }
331 else
332 { /* temporarily copy filename over to cmfp->fname, even though we haven't opened anything: we'll next try to open <filename>.h3m */
333 if ((status = esl_strdup(filename, n, &(cmfp->fname))) != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
334 }
335 }
336 /* <cmfp->f> may *still* be NULL, if <filename> is a press'ed database and ASCII file is deleted */
337
338
339 /* 3. Look for the binary model file component of a press'ed CM database.
340 *
341 * If <cmfp->f> is still NULL, this is our last chance to find it.
342 * (The ASCII base file may have been deleted to save space, leaving
343 * binary press'ed files.)
344 *
345 * If we've been asked to open only an ASCII file -- because we're being
346 * called by cmpress, for example! -- then don't do this.
347 */
348 if (! do_ascii_only && ! cmfp->do_stdin && ! cmfp->do_gzip)
349 {
350 FILE *tmpfp;
351 /* if we opened an ASCII file in the INFERNALDB directory, cmfp->fname contains fully qualified name of file including the path */
352 if ((status = esl_sprintf(&dbfile, "%s.i1m", cmfp->fname) != eslOK)) ESL_XFAIL(status, errbuf, "esl_sprintf() failed; shouldn't happen");
353
354 if ((tmpfp = fopen(dbfile, "rb")) != NULL)
355 {
356 if (cmfp->f != NULL) fclose(cmfp->f); /* preferentially read the .i1m file, not the original */
357 cmfp->f = tmpfp;
358 cmfp->is_pressed = TRUE;
359 free(cmfp->fname);
360 if ((status = esl_strdup(dbfile, -1, &(cmfp->fname))) != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
361 }
362 else if (cmfp->f == NULL && esl_FileEnvOpen(dbfile, env, &(cmfp->f), &envfile) == eslOK)
363 { /* found a binary-only press'ed db in one of the env directories. */
364 free(cmfp->fname);
365 if ((status = esl_strdup(envfile, -1, &(cmfp->fname))) != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed; shouldn't happen");
366 cmfp->is_pressed = TRUE;
367 }
368 free(dbfile); dbfile = NULL;
369 }
370
371 /* 4. <cmfp->f> must now point to a valid model input stream: if not, we fail.
372 */
373 if (cmfp->f == NULL)
374 {
375 if (env) ESL_XFAIL(eslENOTFOUND, errbuf, "CM file %s not found (nor an .i1m binary of it); also looked in %s", filename, env);
376 else ESL_XFAIL(eslENOTFOUND, errbuf, "CM file %s not found (nor an .i1m binary of it)", filename);
377 }
378
379 /* 5. Set up the HMM file <cmfp->hfp> that we'll use to read p7
380 * filter HMMs from within the CM file. We do this before we
381 * handle press'd model files, so we can set <cmfp->hfp->ffp>
382 * and <cmfp->hfp->pfp> if necessary.
383 *
384 */
385 ESL_ALLOC(cmfp->hfp, sizeof(P7_HMMFILE));
386
387 if (!cmfp->do_stdin && !cmfp->do_gzip) {
388 if ((cmfp->hfp->f = fopen(cmfp->fname, "r")) == NULL) goto ERROR;
389 }
390 else if (cmfp->do_stdin) {
391 cmfp->hfp->f = stdin;
392 }
393 #ifdef HAVE_POPEN /* gzip functionality */
394 else if (cmfp->do_gzip) { /* will only possibly be TRUE if HAVE_POPEN */
395 /* we don't open the file separately for cmfp->hfp in gzip case,
396 * which works fine (since we're never a press'd db) but we have
397 * to be careful when closing cmfp in cm_file_Close().
398 */
399 cmfp->hfp->f = cmfp->f;
400 }
401 #endif
402 cmfp->hfp->do_gzip = cmfp->do_gzip;
403 cmfp->hfp->do_stdin = cmfp->do_stdin;
404 cmfp->hfp->newly_opened = TRUE; /* well, it will be, real soon now */
405 cmfp->hfp->is_pressed = cmfp->is_pressed;
406 #ifdef HMMER_THREADS
407 cmfp->hfp->syncRead = FALSE;
408 #endif
409 cmfp->hfp->parser = NULL;
410 cmfp->hfp->efp = NULL;
411 cmfp->hfp->ffp = NULL;
412 cmfp->hfp->pfp = NULL;
413 cmfp->hfp->ssi = NULL; /* not sure if this should point to cmfp->ssi */
414 cmfp->hfp->errbuf[0] = '\0';
415 if ((status = esl_strdup(cmfp->fname, -1, &(cmfp->hfp->fname))) != eslOK) goto ERROR;
416
417 /* 6. If we found and opened a binary model file .i1m, open the rest of
418 * the press'd model files. (this can't be true if do_ascii_only is set).
419 * Note that we set cmfp->hfp->ffp and cmfp->hfp->pfp at the end of the
420 * function after we've
421 */
422 if (cmfp->is_pressed)
423 {
424 /* here we rely on the fact that the suffixes are .i1{mfpi}, to construct other names from .i1m file name !! */
425 n = strlen(cmfp->fname); /* so, n = '\0', n-1 = 'm' */
426 esl_strdup(cmfp->fname, n, &dbfile);
427
428 dbfile[n-1] = 'f'; /* the MSV filter part of the optimized sequence profiles (HMMs) */
429 if ((cmfp->ffp = fopen(dbfile, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1f file found", cmfp->fname);
430 if ((cmfp->hfp->ffp = fopen(dbfile, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1f file found", cmfp->fname);
431
432 dbfile[n-1] = 'p'; /* the remainder of the optimized sequence profiles (HMMs) */
433 if ((cmfp->hfp->pfp = fopen(dbfile, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1p file found", cmfp->fname);
434
435 dbfile[n-1] = 'i'; /* the SSI index for the .i1m file */
436 status = esl_ssi_Open(dbfile, &(cmfp->ssi));
437 if (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1i file found", cmfp->fname);
438 else if (status == eslEFORMAT) ESL_XFAIL(eslEFORMAT, errbuf, "Opened %s, a pressed CM file; but format of its .i1i file unrecognized", cmfp->fname);
439 else if (status == eslERANGE) ESL_XFAIL(eslEFORMAT, errbuf, "Opened %s, a pressed CM file; but its .i1i file is 64-bit and your system is 32-bit", cmfp->fname);
440 else if (status != eslOK) ESL_XFAIL(eslEFORMAT, errbuf, "Opened %s, a pressed CM file; but failed to open its .i1i file", cmfp->fname);
441
442 free(dbfile); dbfile = NULL;
443 }
444 else
445 {
446 if ((status = esl_sprintf(&dbfile, "%s.ssi", cmfp->fname)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
447
448 status = esl_ssi_Open(dbfile, &(cmfp->ssi)); /* not finding an SSI file is ok. we open it if we find it. */
449 if (status == eslEFORMAT) ESL_XFAIL(status, errbuf, "a %s.ssi file exists (an SSI index), but its SSI format is not recognized", cmfp->fname);
450 else if (status == eslERANGE) ESL_XFAIL(status, errbuf, "a %s.ssi file exists (an SSI index), but is 64-bit, and your system is 32-bit", cmfp->fname);
451 else if (status != eslOK && status != eslENOTFOUND) ESL_XFAIL(status, errbuf, "esl_ssi_Open() failed");
452 free(dbfile); dbfile = NULL;
453 }
454
455
456 /* 7. Check for binary file format. A pressed db is automatically binary: verify. */
457 if (! fread((char *) &(magic.n), sizeof(uint32_t), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, errbuf, "File exists, but appears to be empty?");
458 if (magic.n == v1a_magic) { cmfp->format = CM_FILE_1a; cmfp->parser = read_bin_1p1_cm; cmfp->is_binary = TRUE; }
459 else if (cmfp->is_pressed) ESL_XFAIL(eslEFORMAT, errbuf, "Binary format tag in %s unrecognized\nCurrent Infernal format is INFERNAL1/a. Previous binary formats are not supported.", cmfp->fname);
460
461 /* 8. Checks for ASCII file format */
462 if (cmfp->parser == NULL)
463 {
464 /* Does the magic appear to be binary, yet we didn't recognize it? */
465 if (magic.n & 0x80000000) ESL_XFAIL(eslEFORMAT, errbuf, "Format tag appears binary, but unrecognized\nCurrent Infernal format is INFERNAL1/a. Previous binary formats are not supported.");
466
467 if ((cmfp->efp = esl_fileparser_Create(cmfp->f)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "internal error in esl_fileparser_Create()");
468 if ((status = esl_fileparser_SetCommentChar(cmfp->efp, '#')) != eslOK) ESL_XFAIL(status, errbuf, "internal error in esl_fileparser_SetCommentChar()");
469 if ((status = esl_fileparser_NextLinePeeked(cmfp->efp, magic.c, 4)) != eslOK) ESL_XFAIL(status, errbuf, "internal error in esl_fileparser_NextLinePeeked()");
470 if ((status = esl_fileparser_GetToken(cmfp->efp, &tok, &toklen)) != eslOK) ESL_XFAIL(status, errbuf, "internal error in esl_fileparser_GetToken()");
471
472 if ( strcmp("INFERNAL1/a", tok) == 0) { cmfp->format = CM_FILE_1a; cmfp->parser = read_asc_1p1_cm; }
473 else if (( allow_1p0) && strcmp("INFERNAL-1", tok) == 0) { cmfp->format = CM_FILE_1; cmfp->parser = read_asc_1p0_cm; }
474 else if ((! allow_1p0) && strcmp("INFERNAL-1", tok) == 0) { ESL_XFAIL(eslEFORMAT, errbuf, "Format tag is '%s': use cmconvert to reformat Infernal v1.0 to v1.0.2 CM files to current format", tok); }
475 else { ESL_XFAIL(eslEFORMAT, errbuf, "Format tag is '%s': unrecognized or not supported.", tok); }
476 }
477
478 *ret_cmfp = cmfp;
479 return eslOK;
480
481 ERROR:
482 if (cmd != NULL) free(cmd);
483 if (dbfile != NULL) free(dbfile);
484 if (envfile != NULL) free(envfile);
485 if (cmfp != NULL) cm_file_Close(cmfp);
486 *ret_cmfp = NULL;
487 if (status == eslEMEM) return status;
488 else if (status == eslENOTFOUND) return status;
489 else return eslEFORMAT;
490 }
491
492 /* Function: cm_file_Close()
493 * Incept: EPN, Fri Jun 17 10:06:42 2011
494 * SRE, Wed Jan 3 18:48:44 2007 [Casa de Gatos] (p7_hmmfile_Close())
495 *
496 * Purpose: Closes an open CM file <cmfp>.
497 *
498 * Returns: (void)
499 */
500 void
cm_file_Close(CM_FILE * cmfp)501 cm_file_Close(CM_FILE *cmfp)
502 {
503 if (cmfp == NULL) return;
504
505 #ifdef HAVE_POPEN /* gzip functionality */
506 if (cmfp->do_gzip && cmfp->f != NULL) {
507 pclose(cmfp->f);
508 /* careful here: in gzip mode we defined cmfp->hfp->f == cmfp->f,
509 * instead of reopening for cmfp->hfp->f, so we need to set
510 * cmfp->hfp->f to NULL so p7_hmmfile_Close() doesn't try to close
511 * it.
512 */
513 if(cmfp->f == cmfp->hfp->f) { /* this should be TRUE */
514 cmfp->hfp->f = NULL;
515 }
516 cmfp->f = NULL;
517 }
518 #endif
519 if (!cmfp->do_gzip && !cmfp->do_stdin && cmfp->f != NULL) fclose(cmfp->f);
520 if (cmfp->ffp != NULL) fclose(cmfp->ffp);
521 if (cmfp->pfp != NULL) fclose(cmfp->pfp);
522 if (cmfp->fname != NULL) free(cmfp->fname);
523 if (cmfp->efp != NULL) esl_fileparser_Destroy(cmfp->efp);
524 if (cmfp->ssi != NULL) esl_ssi_Close(cmfp->ssi);
525 #ifdef HMMER_THREADS
526 if (cmfp->syncRead) pthread_mutex_destroy (&cmfp->readMutex);
527 #endif
528
529 if(cmfp->hfp != NULL) p7_hmmfile_Close(cmfp->hfp);
530
531 free(cmfp);
532 }
533
534 #ifdef HMMER_THREADS
535 /* Function: cm_file_CreateLock()
536 * Incept: EPN, Fri Jun 17 10:07:06 2011
537 * MSF, Wed July 15 2009 (p7_hmmfile_CreateLock())
538 *
539 * Purpose: Create a lock to syncronize readers.
540 *
541 * Returns: <eslOK> on success.
542 */
543 int
cm_file_CreateLock(CM_FILE * cmfp)544 cm_file_CreateLock(CM_FILE *cmfp)
545 {
546 int status;
547
548 if (cmfp == NULL) return eslEINVAL;
549
550 /* make sure the lock is not created twice */
551 if (!cmfp->syncRead)
552 {
553 cmfp->syncRead = TRUE;
554 status = pthread_mutex_init(&cmfp->readMutex, NULL);
555 if (status != 0) goto ERROR;
556 }
557
558 /* create lock on hmm files as well */
559 if (cmfp->hfp != NULL)
560 if((status = p7_hmmfile_CreateLock(cmfp->hfp)) != eslOK) goto ERROR;
561
562 return eslOK;
563
564 ERROR:
565 cmfp->syncRead = FALSE;
566 return eslFAIL;
567 }
568 #endif
569 /*----------------- end, CM_FILE object ----------------------*/
570
571
572 /*****************************************************************
573 * 2. Writing CM files.
574 *****************************************************************/
575 static int multiline(FILE *fp, const char *pfx, char *s);
576
577 /* Function: cm_file_WriteASCII()
578 * Synopsis: Write an Infernal 1.1 ASCII save file.
579 * Incept: EPN, Fri Jun 17 10:07:46 2011
580 * SRE, Tue May 19 09:39:31 2009 [Janelia] (p7_hmmfile_WriteASCII())
581 *
582 * Purpose: Write a covariance model <cm> in an ASCII save file format to
583 * an open stream <fp>.
584 *
585 * Currently only outputs in the default standard format,
586 * so format must be <CM_FILE_1a> or <-1> (which specifies
587 * the current default format be used).
588 * In the future other formats will be accepted.
589 *
590 * Args: fp - open stream for writing
591 * format - -1 for default format, or a 1.x format code like <CM_FILE_1a>
592 * cm - CM to save
593 *
594 * Returns: <eslOK> on success.
595 *
596 * Throws: <eslEINVAL> if <format> isn't a valid 3.0 format code.
597 */
598 int
cm_file_WriteASCII(FILE * fp,int format,CM_t * cm)599 cm_file_WriteASCII(FILE *fp, int format, CM_t *cm)
600 {
601 int x, v, nd, y;
602
603 if((cm->flags & CMH_LOCAL_BEGIN) || (cm->flags & CMH_LOCAL_END)) cm_Fail("cm_file_WriteASCII(): CM is in local mode");
604
605 if (format == -1) format = CM_FILE_1a;
606
607 if (format == CM_FILE_1a) fprintf(fp, "INFERNAL1/a [%s | %s]\n", INFERNAL_VERSION, INFERNAL_DATE);
608 else ESL_EXCEPTION(eslEINVAL, "invalid CM file format code");
609
610 fprintf(fp, "NAME %s\n", cm->name);
611 if (cm->acc) fprintf(fp, "ACC %s\n", cm->acc);
612 if (cm->desc) fprintf(fp, "DESC %s\n", cm->desc);
613 fprintf(fp, "STATES %d\n", cm->M);
614 fprintf(fp, "NODES %d\n", cm->nodes);
615 fprintf(fp, "CLEN %d\n", cm->clen);
616 fprintf(fp, "W %d\n", cm->W);
617 fprintf(fp, "ALPH %s\n", esl_abc_DecodeType(cm->abc->type));
618 fprintf(fp, "RF %s\n", (cm->flags & CMH_RF) ? "yes" : "no");
619 fprintf(fp, "CONS %s\n", (cm->flags & CMH_CONS) ? "yes" : "no");
620 fprintf(fp, "MAP %s\n", (cm->flags & CMH_MAP) ? "yes" : "no");
621 if (cm->ctime != NULL) fprintf (fp, "DATE %s\n", cm->ctime);
622 if (cm->comlog != NULL) multiline(fp, "COM ", cm->comlog);
623 fprintf(fp, "PBEGIN %g\n", cm->pbegin);
624 fprintf(fp, "PEND %g\n", cm->pend);
625 fprintf(fp, "WBETA %g\n", cm->beta_W);
626 fprintf(fp, "QDBBETA1 %g\n", cm->qdbinfo->beta1);
627 fprintf(fp, "QDBBETA2 %g\n", cm->qdbinfo->beta2);
628 fprintf(fp, "N2OMEGA %6g\n",cm->null2_omega);
629 fprintf(fp, "N3OMEGA %6g\n",cm->null3_omega);
630 fprintf(fp, "ELSELF %.8f\n",cm->el_selfsc);
631 fprintf(fp, "NSEQ %d\n", cm->nseq);
632 fprintf(fp, "EFFN %f\n",cm->eff_nseq);
633 if (cm->flags & CMH_CHKSUM) fprintf(fp, "CKSUM %u\n", cm->checksum); /* unsigned 32-bit */
634 fputs("NULL ", fp);
635 for (x = 0; x < cm->abc->K; x++) { fprintf(fp, "%6s ", prob2ascii(cm->null[x], 1/(float)(cm->abc->K))); }
636 fputc('\n', fp);
637 if (cm->flags & CMH_GA) fprintf(fp, "GA %.2f\n", cm->ga);
638 if (cm->flags & CMH_TC) fprintf(fp, "TC %.2f\n", cm->tc);
639 if (cm->flags & CMH_NC) fprintf(fp, "NC %.2f\n", cm->nc);
640
641 if (cm->flags & CMH_FP7) {
642 fprintf(fp, "EFP7GF %.4f %.5f\n", cm->fp7_evparam[CM_p7_GFMU], cm->fp7_evparam[CM_p7_GFLAMBDA]);
643 }
644 if (cm->flags & CMH_EXPTAIL_STATS)
645 {
646 /* make sure our dbsize values can be cast to a long reliably,
647 * even on 32 bit systems (<= 2 Gb), they certainly should be,
648 * max value in cmcalibrate is 160Mb. (This is related to bug
649 * i31.)
650 */
651 if(cm->expA[EXP_CM_LC]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_LC");
652 if(cm->expA[EXP_CM_GC]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_GC");
653 if(cm->expA[EXP_CM_LI]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_LI");
654 if(cm->expA[EXP_CM_GI]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_GI");
655
656 fprintf(fp, "ECMLC %.5f %10.5f %10.5f %10ld %10d %.6f\n",
657 cm->expA[EXP_CM_LC]->lambda, cm->expA[EXP_CM_LC]->mu_extrap, cm->expA[EXP_CM_LC]->mu_orig,
658 (long) (cm->expA[EXP_CM_LC]->dbsize + 0.5), cm->expA[EXP_CM_LC]->nrandhits, cm->expA[EXP_CM_LC]->tailp);
659 fprintf(fp, "ECMGC %.5f %10.5f %10.5f %10ld %10d %.6f\n",
660 cm->expA[EXP_CM_GC]->lambda, cm->expA[EXP_CM_GC]->mu_extrap, cm->expA[EXP_CM_GC]->mu_orig,
661 (long) (cm->expA[EXP_CM_GC]->dbsize + 0.5), cm->expA[EXP_CM_GC]->nrandhits, cm->expA[EXP_CM_GC]->tailp);
662 fprintf(fp, "ECMLI %.5f %10.5f %10.5f %10ld %10d %.6f\n",
663 cm->expA[EXP_CM_LI]->lambda, cm->expA[EXP_CM_LI]->mu_extrap, cm->expA[EXP_CM_LI]->mu_orig,
664 (long) (cm->expA[EXP_CM_LI]->dbsize + 0.5), cm->expA[EXP_CM_LI]->nrandhits, cm->expA[EXP_CM_LI]->tailp);
665 fprintf(fp, "ECMGI %.5f %10.5f %10.5f %10ld %10d %.6f\n",
666 cm->expA[EXP_CM_GI]->lambda, cm->expA[EXP_CM_GI]->mu_extrap, cm->expA[EXP_CM_GI]->mu_orig,
667 (long) (cm->expA[EXP_CM_GI]->dbsize + 0.5), cm->expA[EXP_CM_GI]->nrandhits, cm->expA[EXP_CM_GI]->tailp);
668 }
669
670 /* main model section */
671 fputs("CM\n", fp);
672
673 /* Create emit map if nec, so we can output map, consensus and rf info appropriately */
674 if(cm->emap == NULL) {
675 cm->emap = CreateEmitMap(cm);
676 if(cm->emap == NULL) ESL_EXCEPTION(eslEINVAL, "unable to create an emit map");
677 }
678
679 for (v = 0; v < cm->M; v++) {
680 nd = cm->ndidx[v];
681
682 /* Node line. node type and additional per-consensus position annotation */
683 if (cm->nodemap[nd] == v) {
684 fprintf(fp, "%45s[ %-4s %4d ]", "", Nodetype(cm->ndtype[nd]), nd);
685
686 /* additional annotation */
687 /* map (optional) */
688 if(cm->flags & CMH_MAP) {
689 if (cm->ndtype[nd] == MATP_nd) fprintf(fp, " %6d %6d", cm->map[cm->emap->lpos[nd]], cm->map[cm->emap->rpos[nd]]);
690 else if(cm->ndtype[nd] == MATL_nd) fprintf(fp, " %6d %6s", cm->map[cm->emap->lpos[nd]], "-");
691 else if(cm->ndtype[nd] == MATR_nd) fprintf(fp, " %6s %6d", "-", cm->map[cm->emap->rpos[nd]]);
692 else fprintf(fp, " %6s %6s", "-", "-");
693 }
694 else { /* no map annotation */
695 fprintf(fp, " %6s %6s", "-", "-");
696 }
697 /* consensus sequence (mandatory) */
698 if (cm->ndtype[nd] == MATP_nd) fprintf(fp, " %c %c", cm->consensus[cm->emap->lpos[nd]], cm->consensus[cm->emap->rpos[nd]]);
699 else if(cm->ndtype[nd] == MATL_nd) fprintf(fp, " %c %c", cm->consensus[cm->emap->lpos[nd]], '-');
700 else if(cm->ndtype[nd] == MATR_nd) fprintf(fp, " %c %c", '-', cm->consensus[cm->emap->rpos[nd]]);
701 else fprintf(fp, " %c %c", '-', '-');
702 /* RF (optional) */
703 if(cm->flags & CMH_RF) {
704 if (cm->ndtype[nd] == MATP_nd) fprintf(fp, " %c %c", cm->rf[cm->emap->lpos[nd]], cm->rf[cm->emap->rpos[nd]]);
705 else if(cm->ndtype[nd] == MATL_nd) fprintf(fp, " %c %c", cm->rf[cm->emap->lpos[nd]], '-');
706 else if(cm->ndtype[nd] == MATR_nd) fprintf(fp, " %c %c", '-', cm->rf[cm->emap->rpos[nd]]);
707 else fprintf(fp, " %c %c", '-', '-');
708 }
709 else { /* no RF annotation */
710 fprintf(fp, " %c %c", '-', '-');
711 }
712 fputs("\n", fp);
713 }
714
715 /* State line, w/ parents, children, dmin2, dmin1, dmax1, dmax2, transitions and emissions */
716 fprintf(fp, " %2s %5d %5d %1d %5d %5d %5d %5d %5d %5d ",
717 Statetype(cm->sttype[v]), v,
718 cm->plast[v], cm->pnum[v],
719 cm->cfirst[v], cm->cnum[v],
720 cm->qdbinfo->dmin2[v], cm->qdbinfo->dmin1[v],
721 cm->qdbinfo->dmax1[v], cm->qdbinfo->dmax2[v]);
722
723 /* Transitions */
724 if (cm->sttype[v] != B_st) {
725 for (x = 0; x < cm->cnum[v]; x++) {
726 fprintf(fp, "%7s ", prob2ascii(cm->t[v][x], 1.));
727 }
728 }
729 else {
730 x = 0;
731 }
732 for (; x < 6; x++) {
733 fprintf(fp, "%7s ", "");
734 }
735
736 /* Emissions */
737 if (cm->sttype[v] == MP_st) {
738 for (x = 0; x < cm->abc->K; x++) {
739 for (y = 0; y < cm->abc->K; y++) {
740 fprintf(fp, "%6s ", prob2ascii(cm->e[v][x*cm->abc->K+y], cm->null[x]*cm->null[y]));
741 }
742 }
743 }
744 else if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st || cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
745 for (x = 0; x < cm->abc->K; x++) {
746 fprintf(fp, "%6s ", prob2ascii(cm->e[v][x], cm->null[x]));
747 }
748 }
749 fputs("\n", fp);
750 }
751 fputs("//\n", fp);
752
753 /* print additional p7 hmms if any */
754 if(cm->flags & CMH_FP7 && cm->fp7 != NULL) {
755 p7_hmmfile_WriteASCII(fp, -1, cm->fp7);
756 }
757 return eslOK;
758 }
759
760
761 /* Function: cm_file_WriteBinary()
762 * Incept: EPN, Fri Jun 17 11:01:17 2011
763 * SRE, Wed Jan 3 13:50:26 2007 [Janelia] (p7_hmmfile_WriteBinary())
764 * Purpose: Writes an CM to a file in INFERNAL binary format.
765 *
766 * Legacy binary file formats will eventually be supported
767 * by specifying the <format> code, but currently only one
768 * valid format exists. Passing <-1> as format specifies
769 * the default current standard format; pass a valid
770 * code such as <CM_FILE_1a> to select a specific
771 * binary format.
772 *
773 * Returns: <eslOK> on success. File position of start of fp7 is
774 * sent back in <*opt_fp7_offset> if it is non-NULL. If no
775 * fp7 is written, (<*opt_fp7_offset> is 0) and caller will
776 * know no fp7 is written because (cm->fp7 == NULL) || (!
777 * (cm->flags & CMH_FP7)). <eslFAIL> if any writes fail
778 * (for instance, if disk fills up, which did happen during
779 * testing!).
780 *
781 * Throws: <eslEINVAL> if <format> isn't a valid 3.0 format code.
782 */
783 int
cm_file_WriteBinary(FILE * fp,int format,CM_t * cm,off_t * opt_fp7_offset)784 cm_file_WriteBinary(FILE *fp, int format, CM_t *cm, off_t *opt_fp7_offset)
785 {
786 int v, z;
787 off_t fp7_offset;
788
789 if((cm->flags & CMH_LOCAL_BEGIN) || (cm->flags & CMH_LOCAL_END)) cm_Fail("cm_file_WriteASCII(): CM is in local mode");
790
791 if (format == -1) format = CM_FILE_1a;
792
793 /* ye olde magic number */
794 if (format == CM_FILE_1a) { if (fwrite((char *) &(v1a_magic), sizeof(uint32_t), 1, fp) != 1) return eslFAIL; }
795 else ESL_EXCEPTION(eslEINVAL, "invalid CM file format code");
796
797 /* info necessary for sizes of things
798 */
799 if (fwrite((char *) &(cm->flags), sizeof(int), 1, fp) != 1) return eslFAIL;
800 if (fwrite((char *) &(cm->M), sizeof(int), 1, fp) != 1) return eslFAIL;
801 if (fwrite((char *) &(cm->nodes), sizeof(int), 1, fp) != 1) return eslFAIL;
802 if (fwrite((char *) &(cm->clen), sizeof(int), 1, fp) != 1) return eslFAIL;
803 if (fwrite((char *) &(cm->abc->type), sizeof(int), 1, fp) != 1) return eslFAIL;
804
805 /* main model section
806 */
807
808 if (fwrite((char *) cm->sttype, sizeof(char), cm->M, fp) != cm->M) return eslFAIL;
809 if (fwrite((char *) cm->ndidx, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
810 if (fwrite((char *) cm->stid, sizeof(char), cm->M, fp) != cm->M) return eslFAIL;
811 if (fwrite((char *) cm->cfirst, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
812 if (fwrite((char *) cm->cnum, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
813 if (fwrite((char *) cm->plast, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
814 if (fwrite((char *) cm->pnum, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
815 if (fwrite((char *) cm->nodemap, sizeof(int), cm->nodes, fp) != cm->nodes) return eslFAIL;
816 if (fwrite((char *) cm->ndtype, sizeof(char), cm->nodes, fp) != cm->nodes) return eslFAIL;
817 if (fwrite((char *) cm->qdbinfo->dmin1, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
818 if (fwrite((char *) cm->qdbinfo->dmax1, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
819 if (fwrite((char *) cm->qdbinfo->dmin2, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
820 if (fwrite((char *) cm->qdbinfo->dmax2, sizeof(int), cm->M, fp) != cm->M) return eslFAIL;
821
822 for (v = 0; v < cm->M; v++) {
823 if (fwrite((char *) cm->t[v], sizeof(float), MAXCONNECT, fp) != MAXCONNECT) return eslFAIL;
824 if (fwrite((char *) cm->e[v], sizeof(float), cm->abc->K*cm->abc->K, fp) != (cm->abc->K*cm->abc->K)) return eslFAIL;
825 }
826
827 /* annotation section
828 */
829 if ( write_bin_string(fp, cm->name) != eslOK) return eslFAIL;
830 if ((cm->flags & CMH_ACC) && (write_bin_string(fp, cm->acc) != eslOK)) return eslFAIL;
831 if ((cm->flags & CMH_DESC) && (write_bin_string(fp, cm->desc) != eslOK)) return eslFAIL;
832 if ((cm->flags & CMH_RF) && (fwrite((char *) cm->rf, sizeof(char), cm->clen+2, fp) != cm->clen+2)) return eslFAIL; /* +2: 1..clen and trailing \0 */
833 if ((cm->flags & CMH_CONS) && (fwrite((char *) cm->consensus, sizeof(char), cm->clen+2, fp) != cm->clen+2)) return eslFAIL; /* consensus is mandatory */
834 if ((cm->flags & CMH_MAP) && (fwrite((char *) cm->map, sizeof(int), cm->clen+1, fp) != cm->clen+1)) return eslFAIL; /* +2: 1..clen and trailing \0 */
835 if (fwrite((char *) &(cm->W), sizeof(int), 1, fp) != 1) return eslFAIL;
836
837 if ((write_bin_string(fp, cm->ctime)) != eslOK) return eslFAIL;
838 if ((write_bin_string(fp, cm->comlog)) != eslOK) return eslFAIL;
839
840 if (fwrite((char *) &(cm->pbegin), sizeof(float), 1, fp) != 1) return eslFAIL;
841 if (fwrite((char *) &(cm->pend), sizeof(float), 1, fp) != 1) return eslFAIL;
842 if (fwrite((char *) &(cm->beta_W), sizeof(double), 1, fp) != 1) return eslFAIL;
843 if (fwrite((char *) &(cm->qdbinfo->beta1), sizeof(double), 1, fp) != 1) return eslFAIL;
844 if (fwrite((char *) &(cm->qdbinfo->beta2), sizeof(double), 1, fp) != 1) return eslFAIL;
845 if (fwrite((char *) &(cm->null2_omega), sizeof(float), 1, fp) != 1) return eslFAIL;
846 if (fwrite((char *) &(cm->null3_omega), sizeof(float), 1, fp) != 1) return eslFAIL;
847 if (fwrite((char *) &(cm->el_selfsc), sizeof(float), 1, fp) != 1) return eslFAIL;
848 if (fwrite((char *) &(cm->nseq), sizeof(int), 1, fp) != 1) return eslFAIL;
849 if (fwrite((char *) &(cm->eff_nseq), sizeof(float), 1, fp) != 1) return eslFAIL;
850 if (fwrite((char *) &(cm->checksum), sizeof(uint32_t), 1, fp) != 1) return eslFAIL;
851 if (fwrite((char *) cm->null, sizeof(float), cm->abc->K, fp) != cm->abc->K) return eslFAIL;
852
853 /* Rfam cutoffs
854 */
855 if ((cm->flags & CMH_GA) && (fwrite((char *) &(cm->ga), sizeof(float), 1, fp) != 1)) return eslFAIL;
856 if ((cm->flags & CMH_TC) && (fwrite((char *) &(cm->tc), sizeof(float), 1, fp) != 1)) return eslFAIL;
857 if ((cm->flags & CMH_NC) && (fwrite((char *) &(cm->nc), sizeof(float), 1, fp) != 1)) return eslFAIL;
858
859 /* E-value parameters
860 */
861 if (cm->flags & CMH_FP7) { /* should always be true */
862 if (fwrite((char *) &(cm->fp7_evparam[CM_p7_GFMU]), sizeof(float), 1, fp) != 1) return eslFAIL;
863 if (fwrite((char *) &(cm->fp7_evparam[CM_p7_GFLAMBDA]), sizeof(float), 1, fp) != 1) return eslFAIL;
864 }
865 if (cm->flags & CMH_EXPTAIL_STATS) {
866 long dbsize_long;
867 for(z = 0; z < EXP_NMODES; z++) {
868 /* make sure our dbsize values can be cast to a long reliably,
869 * even on 32 bit systems (<= 2 Gb), they certainly should be,
870 * max value in cmcalibrate is 160Mb. (This is related to bug
871 * i31.)
872 */
873 if(cm->expA[z]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big)");
874 dbsize_long = (long) cm->expA[z]->dbsize + 0.5;
875 if (fwrite((char *) &(cm->expA[z]->lambda), sizeof(double), 1, fp) != 1) return eslFAIL;
876 if (fwrite((char *) &(cm->expA[z]->mu_extrap), sizeof(double), 1, fp) != 1) return eslFAIL;
877 if (fwrite((char *) &(cm->expA[z]->mu_orig), sizeof(double), 1, fp) != 1) return eslFAIL;
878 if (fwrite((char *) &(dbsize_long), sizeof(long), 1, fp) != 1) return eslFAIL;
879 if (fwrite((char *) &(cm->expA[z]->nrandhits), sizeof(int), 1, fp) != 1) return eslFAIL;
880 if (fwrite((char *) &(cm->expA[z]->tailp), sizeof(double), 1, fp) != 1) return eslFAIL;
881 }
882 }
883
884 /* finally, write the filter p7 HMM */
885 fp7_offset = 0;
886 /* fp7_offset remains 0 if we don't write a fp7, caller will know that no
887 * fp7 was written if (! (cm->flags & CMH_FP7)) || cm->fp7 == NULL
888 */
889 if(cm->flags & CMH_FP7 && cm->fp7 != NULL) {
890 if(opt_fp7_offset != NULL) {
891 /* figure out offset only if we are returning it, otherwise it's irrelevant and can cause a failure
892 * in some cases (e.g. cmconvert -b test.cm | cat > /dev/null), this was bug i41
893 */
894 if((fp7_offset = ftello(fp)) == -1) ESL_EXCEPTION(eslEINVAL, "failed to determine file position for p7 filter");
895 }
896 p7_hmmfile_WriteBinary(fp, -1, cm->fp7);
897 }
898
899 if(opt_fp7_offset != NULL) *opt_fp7_offset = fp7_offset;
900 return eslOK;
901 }
902 /*----------------- end, save file output ----------------------*/
903
904
905 /* Function: write_bin_string()
906 * Date: SRE, Wed Oct 29 13:49:27 1997 [TWA 721 over Canada]
907 *
908 * Purpose: Write a string in binary save format: an integer
909 * for the string length (including \0), followed by
910 * the string.
911 *
912 * Return: <eslOK> on success;
913 * <eslFAIL> if a write fails due to system error, such
914 * as a filled disk (as happened in testing).
915 */
916 static int
write_bin_string(FILE * fp,char * s)917 write_bin_string(FILE *fp, char *s)
918 {
919 int len;
920 if (s != NULL)
921 {
922 len = strlen(s) + 1;
923 if (fwrite((char *) &len, sizeof(int), 1, fp) != 1) return eslFAIL;
924 if (fwrite((char *) s, sizeof(char), len, fp) != len) return eslFAIL;
925 }
926 else
927 {
928 len = 0;
929 if (fwrite((char *) &len, sizeof(int), 1, fp) != 1) return eslFAIL;
930 }
931 return eslOK;
932 }
933
934 /*****************************************************************
935 * 3. API for reading CMs in various formats.
936 *****************************************************************/
937
938 /* Function: cm_file_Read()
939 * Incept: EPN, Tue Jun 21 10:41:54 2011
940 * SRE, Sat Jan 6 18:04:58 2007 [Casa de Gatos] (p7_hmmfile_Read())
941 *
942 * Purpose: Read the next CM from open save file <cmfp>, and
943 * optionally return this newly allocated CM in <opt_cm>.
944 * (The optional return is so that an application is
945 * only interested in whether the file contains a valid
946 * CM or not -- for example, to verify that a file contains
947 * only a single CM instead of a database of them.)
948 *
949 * Read a p7 filter HMM (if one exists) for the CM if
950 * <read_fp7> is TRUE, else, don't read one. This allows us
951 * to read only the CM if we're using a pressed file in
952 * cmscan and already have read the p7 filter HMM
953 * previously. However be careful with this, because caller
954 * will have problems if it tries to read the file
955 * sequentially with <read_fp7> as FALSE because file
956 * offset will not be at the beginning of the next CM upon
957 * return, it will be at the beginning of the p7 filter for
958 * the last CM read, if one exists.
959 *
960 * Caller may or may not already know what alphabet the CM
961 * is expected to be in. A reference to the pointer to the
962 * current alphabet is passed in <*ret_abc>. If the alphabet
963 * is unknown, pass <*ret_abc = NULL>, and when the
964 * new HMM is read, an appropriate new alphabet object is
965 * allocated and passed back to the caller in <*ret_abc>.
966 * If the alphabet is already known, <ret_abc> points to
967 * that object ptr, and the new CM's alphabet type is
968 * verified to agree with it. This mechanism allows an
969 * application to let the first CM determine the alphabet
970 * type for the application, while still keeping the
971 * alphabet under the application's scope of control.
972 *
973 * Returns: <eslOK> on success, and the newly allocated CM is
974 * optionally returned via <opt_cm>. Additionally, if
975 * <ret_abc> pointed to <NULL>, it now points to a newly
976 * allocated alphabet.
977 *
978 * Returns <eslEOF> if no CMs remain in the file; this may
979 * indicate success or failure, depending on what the
980 * caller is expecting.
981 *
982 * Returns <eslEFORMAT> on any format problems, including
983 * premature end of data or bad magic at the start of a
984 * binary file. An informative error message is left in
985 * <cmfp->errbuf>; the filename (fully qualified, if opened
986 * in a directory specified by an <env> list) is in
987 * <cmfp->fname>; and if <cmfp->efp> is non-<NULL>, the CM
988 * file is in an ASCII text format, and the caller may also
989 * obtain the line number at which the format error was
990 * detected, in <cmfp->efp->linenumber>, and use it to
991 * format informative output for a user.
992 *
993 * Returns <eslEINCOMPAT> if the caller passed a known
994 * alphabet (a non-<NULL> <*ret_abc>), but the alphabet
995 * of the CM doesn't match this expectation.
996 *
997 * Upon any return that is not <eslOK>, <*opt_cm> is
998 * <NULL> and <*ret_abc> is left unchanged from what caller
999 * passed it as.
1000 *
1001 * Throws: <eslEMEM> upon an allocation error.
1002 * <eslESYS> on failure of other system calls, such
1003 * as file positioning functions (<fseeko()> or <ftello()>.
1004 */
1005 int
cm_file_Read(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)1006 cm_file_Read(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm)
1007 {
1008 /* A call to SSI to remember file position may eventually go here. */
1009 return (*cmfp->parser)(cmfp, read_fp7, ret_abc, opt_cm);
1010 }
1011
1012 /* Function: cm_file_PositionByKey()
1013 * Synopsis: Use SSI to reposition file to start of named CM.
1014 * Incept: EPN, Tue Jun 21 10:46:33 2011
1015 * SRE, Mon Jun 18 10:57:15 2007 [Janelia] (p7_hmmfile_PositionByKey())
1016 *
1017 * Purpose: Reposition <cmfp> so the next CM we read will be the
1018 * one named (or accessioned) <key>.
1019 *
1020 * Returns: <eslOK> on success.
1021 *
1022 * Returns <eslENOTFOUND> if <key> isn't found in the index for
1023 * <cmfp>.
1024 *
1025 * Returns <eslEFORMAT> is something goes wrong trying to
1026 * read the index, indicating a file format problem in the
1027 * SSI file.
1028 *
1029 * In the event of either error, the state of <cmfp> is left
1030 * unchanged.
1031 *
1032 * Throws: <eslEMEM> on allocation failure, or <eslESYS> on system i/o
1033 * call failure, or <eslEINVAL> if <cmfp> doesn't have an SSI
1034 * index or is not a seekable stream.
1035 */
1036 int
cm_file_PositionByKey(CM_FILE * cmfp,const char * key)1037 cm_file_PositionByKey(CM_FILE *cmfp, const char *key)
1038 {
1039 uint16_t fh;
1040 off_t offset;
1041 int status;
1042
1043 if (cmfp->ssi == NULL) ESL_EXCEPTION(eslEINVAL, "Need an open SSI index to call cm_file_PositionByKey()");
1044 if ((status = esl_ssi_FindName(cmfp->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status;
1045 if (fseeko(cmfp->f, offset, SEEK_SET) != 0) ESL_EXCEPTION(eslESYS, "fseek failed");
1046
1047 cmfp->newly_opened = FALSE; /* because we're poised on the magic number, and must read it */
1048 return eslOK;
1049 }
1050
1051
1052 /* Function: cm_file_Position()
1053 * Synopsis: Reposition file to a given offset.
1054 * Incept: EPN, Tue Jun 21 10:47:48 2011
1055 * MSF Wed Nov 4, 2009 [Janelia] (p7_hmmfile_Position())
1056 *
1057 * Purpose: Reposition <cmfp> to position <offset>.
1058 *
1059 * Returns: <eslOK> on success.
1060 *
1061 * In the event an error, the state of <cmfp> is left
1062 * unchanged.
1063 *
1064 * Throws: <eslESYS> on system i/o call failure, or <eslEINVAL> if
1065 * <cmfp> is not a seekable stream.
1066 */
1067 int
cm_file_Position(CM_FILE * cmfp,const off_t offset)1068 cm_file_Position(CM_FILE *cmfp, const off_t offset)
1069 {
1070 if (fseeko(cmfp->f, offset, SEEK_SET) != 0) ESL_EXCEPTION(eslESYS, "fseek failed");
1071
1072 cmfp->newly_opened = FALSE; /* because we're poised on the magic number, and must read it */
1073 return eslOK;
1074 }
1075
1076 /*------------------- end, input API ----------------------------*/
1077
1078
1079 /*****************************************************************
1080 * 4. API for reading/writing p7 HMMs/profiles to filter for CMs.
1081 *****************************************************************/
1082
1083 /* Function: cm_p7_hmmfile_Read()
1084 * Incept: EPN, Fri Jul 22 09:21:53 2011
1085 *
1086 * Purpose: Position an HMM save file <cmfp->hfp> to <offset> and
1087 * read an HMM from it at that position. Return the HMM
1088 * in <ret_hmm>.
1089 *
1090 * Caller must already know the alphabet the HMM is
1091 * expected to be in, passed in as <abc>. If the alphabet
1092 * for the HMM is of a different type than <abc>, we return
1093 * eslEINCOMPAT.
1094 *
1095 * Returns: <eslOK> on success, and the newly allocated HMM is
1096 * in <ret_hmm>.
1097 *
1098 * Returns <eslEOF> if no HMMs remain in the file; this
1099 * is a failure, as we expect there to be one at <offset>.
1100 *
1101 * Returns <eslEFORMAT> on any format problems, including
1102 * premature end of data or bad magic at the start of a
1103 * binary file. An informative error message is left in
1104 * <cmfp->errbuf>; the filename (fully qualified, if opened
1105 * in a directory specified by an <env> list) is in
1106 * <cmfp->fname>; and if <cmfp->hfp->efp> is non-<NULL>,
1107 * the HMM file is in an ASCII text format, and the caller
1108 * may also obtain the line number at which the format
1109 * error was detected, in <cmfp->hfp->efp->linenumber>, and
1110 * use it to format informative output for a user.
1111 *
1112 * Returns <eslEINCOMPAT> if the alphabet of the HMM doesn't
1113 * match the expectation in <abc>.
1114 *
1115 * Upon any return that is not <eslOK>, <*ret_hmm> is
1116 * <NULL>.
1117 *
1118 * Throws: <eslEMEM> upon an allocation error.
1119 * <eslESYS> on failure of other system calls, such
1120 * as file positioning functions (<fseeko()> or <ftello()>.
1121 */
1122 int
cm_p7_hmmfile_Read(CM_FILE * cmfp,ESL_ALPHABET * abc,off_t offset,P7_HMM ** ret_hmm)1123 cm_p7_hmmfile_Read(CM_FILE *cmfp, ESL_ALPHABET *abc, off_t offset, P7_HMM **ret_hmm)
1124 {
1125 int status;
1126 P7_HMM *hmm = NULL;
1127 char *tok1 = NULL;
1128 uint32_t magic;
1129
1130 #ifdef HMMER_THREADS
1131 if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
1132 /* lock the mutex to prevent other threads from reading the file at the same time */
1133 if (cmfp->hfp->syncRead) {
1134 if (pthread_mutex_lock (&cmfp->hfp->readMutex) != 0) ESL_EXCEPTION(eslESYS, "mutex lock failed");
1135 }
1136 }
1137 #endif
1138 if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
1139 if (fseeko(cmfp->hfp->f, offset, SEEK_SET) != 0) ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "Failed to set position for HMM file parser");
1140 }
1141
1142 /* set the parser if it's unset (which it is only for first HMM read from cmfp->hfp) */
1143 if(cmfp->hfp->parser == NULL) {
1144 if(cmfp->is_binary) { /* CM file is binary, HMM file must be too */
1145 if (! fread((char *) &(magic), sizeof(uint32_t), 1, cmfp->hfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Failed to read magic number at start of p7 additional filters");
1146 if (magic == v3a_magic) { cmfp->hfp->format = p7_HMMFILE_3a; cmfp->hfp->parser = read_bin30hmm; }
1147 else if (magic == v3b_magic) { cmfp->hfp->format = p7_HMMFILE_3b; cmfp->hfp->parser = read_bin30hmm; }
1148 else if (magic == v3c_magic) { cmfp->hfp->format = p7_HMMFILE_3c; cmfp->hfp->parser = read_bin30hmm; }
1149 else if (magic == v3d_magic) { cmfp->hfp->format = p7_HMMFILE_3d; cmfp->hfp->parser = read_bin30hmm; }
1150 else if (magic == v3e_magic) { cmfp->hfp->format = p7_HMMFILE_3e; cmfp->hfp->parser = read_bin30hmm; }
1151 else if (magic == v3f_magic) { cmfp->hfp->format = p7_HMMFILE_3f; cmfp->hfp->parser = read_bin30hmm; }
1152 else ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Unknown magic number for p7 HMM filter");
1153 }
1154 else { /* CM file is ascii, HMM file must be too */
1155 if(cmfp->hfp->efp != NULL) ESL_XFAIL(eslEINVAL, cmfp->errbuf, "HMM ascii file parsers are out of sync");
1156 if ((cmfp->hfp->efp = esl_fileparser_Create(cmfp->hfp->f)) == NULL) { status = eslEMEM; goto ERROR; }
1157 if ((status = esl_fileparser_SetCommentChar(cmfp->hfp->efp, '#')) != eslOK) goto ERROR;
1158 if ((status = esl_fileparser_GetToken(cmfp->hfp->efp, &tok1, NULL)) != eslOK) goto ERROR;
1159
1160 if (strcmp("HMMER3/f", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3f; cmfp->hfp->parser = read_asc30hmm; }
1161 else if (strcmp("HMMER3/e", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3e; cmfp->hfp->parser = read_asc30hmm; }
1162 else if (strcmp("HMMER3/d", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3d; cmfp->hfp->parser = read_asc30hmm; }
1163 else if (strcmp("HMMER3/c", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3c; cmfp->hfp->parser = read_asc30hmm; }
1164 else if (strcmp("HMMER3/b", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3b; cmfp->hfp->parser = read_asc30hmm; }
1165 else if (strcmp("HMMER3/a", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3a; cmfp->hfp->parser = read_asc30hmm; }
1166 else ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Unknown format for p7 HMM filter");
1167 }
1168 }
1169
1170 /* read the HMM */
1171 status = p7_hmmfile_Read(cmfp->hfp, &abc, &hmm);
1172 if (status == eslEOF) ESL_XFAIL(status, cmfp->errbuf, "failed to read profile HMM piece, CM file may be truncated?");
1173 else if (status == eslEFORMAT) ESL_XFAIL(status, cmfp->errbuf, "bad file format for HMM filter");
1174 else if (status == eslEINCOMPAT) ESL_XFAIL(status, cmfp->errbuf, "HMM filters are of different alphabets");
1175 else if (status != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Unexpected error in reading HMM filters");
1176
1177 #ifdef HMMER_THREADS
1178 if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
1179 if (cmfp->hfp->syncRead) {
1180 if (pthread_mutex_unlock (&cmfp->hfp->readMutex) != 0) ESL_EXCEPTION(eslESYS, "mutex unlock failed");
1181 }
1182 }
1183 #endif
1184
1185 *ret_hmm = hmm;
1186 return status;
1187
1188 ERROR:
1189 #ifdef HMMER_THREADS
1190 if(cmfp->hfp->f != NULL) {
1191 if (cmfp->hfp->syncRead) {
1192 if (pthread_mutex_unlock (&cmfp->hfp->readMutex) != 0) ESL_EXCEPTION(eslESYS, "mutex unlock failed");
1193 }
1194 }
1195 #endif
1196
1197 if(hmm != NULL) p7_hmm_Destroy(hmm);
1198 *ret_hmm = NULL;
1199 return status;
1200 }
1201
1202 /* Function: cm_p7_oprofile_Write()
1203 * Synopsis: Write an optimized p7 profile in two files.
1204 * Incept: EPN, Fri Jul 8 06:57:50 2011
1205 *
1206 * Purpose: Write the MSV filter part of <om> to open binary stream
1207 * <ffp>, and the rest of the model to <pfp>. These two
1208 * streams will typically be <.i1f> and <.i1p> files
1209 * being created by cmpress.
1210 *
1211 * Most of the work is done by p7_oprofile_Write(). This
1212 * function is only necessary to write five pieces of data
1213 * that are specific to CMs to the MSV file and one
1214 * to the profile file:
1215 *
1216 * 1. <cm_offset>: the position in the corresponding <.i1m>
1217 * file at which the CM corresponding to the optimized
1218 * profile can be found.
1219 *
1220 * 2. <cm_clen>: consensus length of the CM this p7 is a
1221 * filter for.
1222 *
1223 * 3. <cm_W>: window length of the CM this p7 is a
1224 * filter for.
1225 *
1226 * 4. <cm_nbp>: number of base pairs in CM (if 0, pipeline
1227 * will be run in HMM only mode.
1228 *
1229 * 5. <gfmu>: glocal forward mu for this <om>
1230 *
1231 * 6. <gflambda>: glocal forward lambda for this <om>
1232 *
1233 * Args: ffp - open binary stream for saving MSV filter part
1234 * pfp - open binary stream for saving rest of profile
1235 * cm_offset - disk offset for CM in <.i1m> file that corresponds
1236 * to this profile
1237 * cm_clen - consensus length of CM corresponding to this om (usually om->M)
1238 * cm_W - window length for the CM
1239 * cm_nbp - number of basepairs in the CM
1240 * gfmu - E value mu param for glocal forward for this om
1241 * gflambda - E value lambda param for glocal forward for this om
1242 * om - optimized profile to save
1243 *
1244 * Returns: <eslOK> on success.
1245 *
1246 * Returns <eslFAIL> on any write failure; for example,
1247 * if disk is full.
1248 *
1249 * Throws: (no abnormal error conditions)
1250 */
1251 int
cm_p7_oprofile_Write(FILE * ffp,FILE * pfp,off_t cm_offset,int cm_clen,int cm_W,int cm_nbp,float gfmu,float gflambda,P7_OPROFILE * om)1252 cm_p7_oprofile_Write(FILE *ffp, FILE *pfp, off_t cm_offset, int cm_clen, int cm_W, int cm_nbp, float gfmu, float gflambda, P7_OPROFILE *om)
1253 {
1254 /* <ffp> is the part of the oprofile that MSVFilter() needs */
1255 if (fwrite((char *) &(v1a_fmagic), sizeof(uint32_t), 1, ffp) != 1) return eslFAIL;
1256 if (fwrite((char *) &(cm_offset), sizeof(off_t), 1, ffp) != 1) return eslFAIL;
1257 if (fwrite((char *) &(cm_clen), sizeof(int), 1, ffp) != 1) return eslFAIL;
1258 if (fwrite((char *) &(cm_W), sizeof(int), 1, ffp) != 1) return eslFAIL;
1259 if (fwrite((char *) &(cm_nbp), sizeof(int), 1, ffp) != 1) return eslFAIL;
1260 if (fwrite((char *) &(gfmu), sizeof(float), 1, ffp) != 1) return eslFAIL;
1261 if (fwrite((char *) &(gflambda), sizeof(float), 1, ffp) != 1) return eslFAIL;
1262
1263 /* <pfp> gets the rest of the oprofile, we don't need to write anything extra, so
1264 * p7_oprofile_Write handles it all */
1265
1266 /* pass to p7_oprofile_Write to do the rest */
1267 return p7_oprofile_Write(ffp, pfp, om);
1268 }
1269
1270
1271 /* Function: cm_p7_oprofile_Position()
1272 * Synopsis: Reposition the hmm filter file part of a CM file to an offset.
1273 * Incept: EPN, Fri Jul 22 10:49:41 2011
1274 * MSF, Thu Oct 15, 2009 [Janelia] (p7_oprofile_Position())
1275 *
1276 * Purpose: Reposition an open <cmfp->ffp> to offset <offset>.
1277 * <offset> would usually be the first byte of a
1278 * desired hmm record.
1279 *
1280 * Returns: <eslOK> on success;
1281 * <eslEOF> if no data can be read from this position.
1282 *
1283 * Throws: <eslEINVAL> if the <sqfp> is not positionable.
1284 * <eslEFORMAT> if no msv profile opened.
1285 * <eslESYS> if the fseeko() call fails.
1286 */
1287 int
cm_p7_oprofile_Position(CM_FILE * cmfp,off_t offset)1288 cm_p7_oprofile_Position(CM_FILE *cmfp, off_t offset)
1289 {
1290 if (cmfp->ffp == NULL) ESL_EXCEPTION(eslEFORMAT, cmfp->errbuf, "no MSV profile file; cmpress probably wasn't run");
1291 if (cmfp->do_stdin) ESL_EXCEPTION(eslEINVAL, "can't Position() in standard input");
1292 if (cmfp->do_gzip) ESL_EXCEPTION(eslEINVAL, "can't Position() in a gzipped file");
1293 if (offset < 0) ESL_EXCEPTION(eslEINVAL, "bad offset");
1294
1295 if (fseeko(cmfp->ffp, offset, SEEK_SET) != 0) ESL_EXCEPTION(eslESYS, "fseeko() failed");
1296
1297 return eslOK;
1298 }
1299
1300
1301 /* Function: cm_p7_oprofile_ReadMSV()
1302 * Synopsis: Read MSV filter part of an optimized profile.
1303 * Incept: EPN, Fri Jul 8 08:19:58 2011
1304 *
1305 * Purpose: Read the MSV filter part of a p7 filter profile from the
1306 * <.i1f> file associated with an open CM file <cmfp>.
1307 * Allocate a new model, populate it with this minimal
1308 * MSV filter information, and return a pointer to them
1309 * in <*ret_om>. Return pointers to additional information
1310 * on the model in <*ret_cm_offset>, <*ret_cm_clen> and
1311 * <*ret_cm_W>.
1312 *
1313 * Our alphabet may get set by the first HMM we read. If
1314 * <*byp_abc> is <NULL> at start, create a new alphabet and
1315 * return a pointer to it in <*byp_abc>. If <*byp_abc> is
1316 * non-<NULL>, it is assumed to be a pointer to an existing
1317 * alphabet; we verify that the HMM's alphabet matches it
1318 * and <*ret_abc> isn't changed. This is the same
1319 * convention used by <cm_file_Read()>.
1320 *
1321 * The <.i1f> file was opened automatically, if it existed,
1322 * when the CM file was opened with <cm_file_Open()>.
1323 *
1324 * When no more HMMs remain in the file, return <eslEOF>.
1325 *
1326 * Most of the work is done by p7_oprofile_ReadMSV(). This
1327 * function is only necessary to read five pieces of data
1328 * that are specific to CMs. See cm_p7_oprofile_Write()
1329 * for more explanation.
1330 *
1331 * If <read_scores> is TRUE: read all the MSV data
1332 * (including scores) using p7_oprofile_ReadMSV().
1333 * If <read_scores> is FALSE: read only the MSV info
1334 * (no scores) using p7_oprofile_ReadInfoMSV().
1335 *
1336 *
1337 * Args: cmfp - open CM file, with associated .i1p file
1338 * byp_abc - BYPASS: <*byp_abc == ESL_ALPHABET *> if known;
1339 * <*byp_abc == NULL> if desired;
1340 * <NULL> if unwanted.
1341 * ret_cm_offset - RETURN: offset of CM each om corresponds to
1342 * ret_cm_clen - RETURN: clen of CM each om corresponds to
1343 * ret_cm_W - RETURN: W of CM each om corresponds to
1344 * ret_cm_nbp - RETURN: number of bps in CM each om corresponds to
1345 * ret_gfmu - RETURN: glocal fwd mu the om
1346 * ret_gflambda - RETURN: glocal forward lambda the om
1347 * ret_om - RETURN: the read <om> with MSV filter
1348 * data filled in.
1349 *
1350 * Returns: <eslOK> on success. <*ret_om> is allocated here;
1351 * caller free's with <p7_oprofile_Destroy()>.
1352 * <*byp_abc> is allocated here if it was requested;
1353 * caller free's with <esl_alphabet_Destroy()>.
1354 *
1355 * Returns <eslEFORMAT> if <cmfp> has no <.i1f> file open,
1356 * or on any parsing error.
1357 *
1358 * Returns <eslEINCOMPAT> if the HMM we read is incompatible
1359 * with the existing alphabet <*byp_abc> led us to expect.
1360 *
1361 * On any returned error, <cmfp->errbuf> contains an
1362 * informative error message.
1363 *
1364 * Throws: <eslEMEM> on allocation error.
1365 */
1366 int
cm_p7_oprofile_ReadMSV(CM_FILE * cmfp,int read_scores,ESL_ALPHABET ** byp_abc,off_t * ret_cm_offset,int * ret_cm_clen,int * ret_cm_W,int * ret_cm_nbp,float * ret_gfmu,float * ret_gflambda,P7_OPROFILE ** ret_om)1367 cm_p7_oprofile_ReadMSV(CM_FILE *cmfp, int read_scores, ESL_ALPHABET **byp_abc, off_t *ret_cm_offset, int *ret_cm_clen, int *ret_cm_W, int *ret_cm_nbp, float *ret_gfmu, float *ret_gflambda, P7_OPROFILE **ret_om)
1368 {
1369 int status = eslOK; /* return status */
1370 uint32_t magic; /* magic number used to verify format */
1371 off_t cm_offset; /* offset of the corresponding CM in cmfp->mfp */
1372 int cm_clen; /* CM consensus length (will likely be om->M) */
1373 int cm_W; /* CM window length (will likely differ from om->max_length) */
1374 int cm_nbp; /* number of basepairs in CM (if 0 pipeline will be run in HMM only mode) */
1375 float gfmu; /* glocal fwd mu parameter for current hmm */
1376 float gflambda; /* glocal fwd lambda parameter for current hmm */
1377 P7_OPROFILE *om = NULL; /* the om we've read */
1378
1379 cmfp->errbuf[0] = '\0';
1380 if (cmfp->ffp == NULL) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "no MSV profile file; cmpress probably wasn't run");
1381
1382 if (feof(cmfp->ffp)) { status = eslEOF; goto ERROR; } /* normal EOF: no more profiles */
1383 if (! fread( (char *) &magic, sizeof(uint32_t), 1, cmfp->ffp)) { status = eslEOF; goto ERROR; }
1384 if (magic != v1a_fmagic) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "bad magic; not a CM database?");
1385
1386 if (! fread( (char *) &cm_offset, sizeof(off_t), 1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM offset");
1387 if (! fread( (char *) &cm_clen, sizeof(int), 1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM consensus length");
1388 if (! fread( (char *) &cm_W, sizeof(int), 1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM window length (W)");
1389 if (! fread( (char *) &cm_nbp, sizeof(int), 1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM number of bps");
1390 if (! fread( (char *) &gfmu, sizeof(int), 1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read glocal fwd mu parameter");
1391 if (! fread( (char *) &gflambda, sizeof(int), 1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read glocal fwd lambda parameter");
1392
1393 /* move HMM file position (cmfp->hfp->ffp) to where we are in CM file (cmfp->ffp) */
1394 if (fseeko(cmfp->hfp->ffp, ftello(cmfp->ffp), SEEK_SET) != 0) ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "failed to set position for HMM file parser (MSV)");
1395 if (read_scores) {
1396 if ((status = p7_oprofile_ReadMSV(cmfp->hfp, byp_abc, &om)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "failed to read MSV filter model");
1397 }
1398 else {
1399 if ((status = p7_oprofile_ReadInfoMSV(cmfp->hfp, byp_abc, &om)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "failed to read MSV info");
1400 }
1401 /* move CM file position (cmfp->ffp) to where we are in HMM file (cmfp->hfp->ffp) */
1402 if (fseeko(cmfp->ffp, ftello(cmfp->hfp->ffp), SEEK_SET) != 0) ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "Failed to set position for CM file parser (MSV)");
1403
1404 if(ret_cm_offset != NULL) *ret_cm_offset = cm_offset;
1405 if(ret_cm_clen != NULL) *ret_cm_clen = cm_clen;
1406 if(ret_cm_W != NULL) *ret_cm_W = cm_W;
1407 if(ret_cm_nbp != NULL) *ret_cm_nbp = cm_nbp;
1408 if(ret_gfmu != NULL) *ret_gfmu = gfmu;
1409 if(ret_gflambda != NULL) *ret_gflambda = gflambda;
1410 if(ret_om != NULL) *ret_om = om;
1411 return eslOK;
1412
1413 ERROR:
1414 if (om != NULL) p7_oprofile_Destroy(om);
1415 if (ret_om != NULL) *ret_om = NULL;
1416 return status;
1417 }
1418
1419 /* Function: cm_p7_oprofile_ReadBlockMSV()
1420 * Synopsis: Read the next block of MSV filter parts from a HMM file.
1421 * Incept: EPN, Thu Jul 21 04:44:19 2011
1422 *
1423 * Purpose: Reads a block of the MSV filter parts of optimized
1424 * p7 profiles from open cm file <cmfp> into
1425 * <hmmBlock>.
1426 *
1427 * Most of the work is done by p7_oprofile_ReadMSV(). This
1428 * function is only necessary to read five pieces of data
1429 * that are specific to CMs. See cm_p7_oprofile_Write()
1430 * for more explanation.
1431 *
1432 * Args: cmfp - open CM file, with associated .i1f file
1433 * idx0 - the index of the next profile in the file (0 if first profile)
1434 * byp_abc - BYPASS: <*byp_abc == ESL_ALPHABET *> if known;
1435 * <*byp_abc == NULL> if desired;
1436 * <NULL> if unwanted.
1437 * hmmBlock- RETURN: the block of profiles.
1438 *
1439 * Returns: <eslOK> on success; block in <hmmBlock>.
1440 *
1441 * Returns <eslEFORMAT> if <cmfp> has no <.i1f> file open,
1442 * or on any parsing error.
1443 *
1444 * Returns <eslEINCOMPAT> if any HMM we read is incompatible
1445 * with the existing alphabet <*byp_abc> led us to expect.
1446 *
1447 * Returns <eslEOF> when there is no profiles left in the
1448 * file (including first attempt to read an empty file).
1449 */
1450 int
cm_p7_oprofile_ReadBlockMSV(CM_FILE * cmfp,int64_t idx0,ESL_ALPHABET ** byp_abc,CM_P7_OM_BLOCK * hmmBlock)1451 cm_p7_oprofile_ReadBlockMSV(CM_FILE *cmfp, int64_t idx0, ESL_ALPHABET **byp_abc, CM_P7_OM_BLOCK *hmmBlock)
1452 {
1453 int status = eslOK; /* return status */
1454 int i;
1455
1456 hmmBlock->count = 0;
1457 hmmBlock->idx0 = idx0;
1458 for (i = 0; i < hmmBlock->listSize; ++i)
1459 {
1460 status = cm_p7_oprofile_ReadMSV(cmfp, TRUE, byp_abc,
1461 &(hmmBlock->cm_offsetA[i]),
1462 &(hmmBlock->cm_clenA[i]),
1463 &(hmmBlock->cm_WA[i]),
1464 &(hmmBlock->cm_nbpA[i]),
1465 &(hmmBlock->gfmuA[i]),
1466 &(hmmBlock->gflambdaA[i]),
1467 &(hmmBlock->list[i]));
1468 if (status != eslOK) break;
1469 ++hmmBlock->count;
1470 }
1471 /* create the MSV data */
1472 for (i = 0; i < hmmBlock->count; ++i) {
1473 hmmBlock->msvdataA[i] = p7_hmm_ScoreDataCreate(hmmBlock->list[i], FALSE);
1474 }
1475 /* initiate the clan idx data, caller will need to fill this if nec */
1476 for (i = 0; i < hmmBlock->count; ++i) {
1477 hmmBlock->clan_idxA[i] = -1;
1478 }
1479
1480 /* EOF will be returned only in the case were no profiles were read */
1481 if (status == eslEOF && i > 0) status = eslOK;
1482
1483 return status;
1484 }
1485
1486 /*----------------- end of API for p7 filters ----------------------*/
1487
1488
1489 /*****************************************************************
1490 * 5. Private, specific profile CM file format parsers.
1491 *****************************************************************/
1492
1493 /* Parsing save files from INFERNAL 1.x
1494 * All parsers follow the same API.
1495 *
1496 * Returns <eslOK> on success, and if <opt_cm> is non-NULL,
1497 * <*opt_cm> points at a newly allocated CM.
1498 *
1499 * Additionally, if <*ret_abc> was NULL, then a new alphabet is
1500 * allocated according to the alphabet type of this CM, and returned
1501 * thru <ret_abc>. This allocation mechanism allows a main()
1502 * application that doesn't yet know its alphabet to determine the
1503 * alphabet when the first CM is read, while also allowing an
1504 * application to allocate its own alphabet and assure that the
1505 * input CMs are appropriate for that alphabet.
1506 *
1507 * Returns <eslEOF> when no CM remains in the file, indicating a
1508 * normal end-of-file.
1509 *
1510 * Two types of "normal error" may happen, which the caller must check
1511 * for. Returns <eslEFORMAT> on any save file format error, including
1512 * bad magic (i.e. this is not an INFERNAL file at all). Returns
1513 * <eslEINCOMPAT> if the expected alphabet (a non-<NULL> alphabet
1514 * specified by <*ret_abc>) does not match the alphabet type of the
1515 * HMM.
1516 *
1517 * When these normal errors occur, the caller can construct its error
1518 * message from:
1519 * <cmfp->errbuf>: contains an informative error message
1520 * <cmfp->fname>: name of the CM file (or '-' if STDIN)
1521 * and if <cmfp->efp> is non-<NULL>, the CM file is in ASCII text,
1522 * and the caller may also use:
1523 * <cmfp->efp->linenumber>: line on which the parse error occurred.
1524 *
1525 * Throws: <eslEMEM> on allocation error.
1526 * <eslESYS> if a system i/o call fails.
1527 * In cases of error (including both thrown error and normal error), <*ret_abc>
1528 * is left in its original state as passed by the caller, and <*ret_hmm> is
1529 * returned <NULL>.
1530 */
1531 static int
read_asc_1p1_cm(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)1532 read_asc_1p1_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm)
1533 {
1534 int status;
1535 ESL_ALPHABET *abc = NULL;
1536 CM_t *cm = NULL;
1537 P7_HMM *hmm = NULL;
1538 char *tag = NULL;
1539 char *tok1 = NULL;
1540 char *tok2 = NULL;
1541 char *tok3 = NULL;
1542 char *tok4 = NULL;
1543 char *tok5 = NULL;
1544 char *tok6 = NULL;
1545 int alphatype;
1546 off_t offset = 0;
1547 off_t fp7_offset = 0;
1548 int v, x, y, nd; /* counters */
1549 int read_fp7_stats = FALSE;
1550 uint32_t cm_statstracker = 0; /* for making sure we have all CM E-value stats, if we have any */
1551 int exp_mode;
1552 int read_el_selfsc = FALSE; /* set to true when we read ELSELF line */
1553
1554 /* temporary parameters, for storing values prior to their allocation in the CM */
1555 float *tmp_null = NULL;
1556 float tmp_fp7_gfmu;
1557 float tmp_fp7_gflambda;
1558 double tmp_qdbbeta1;
1559 double tmp_qdbbeta2;
1560
1561 /* temporary per-node annotation, will be converted to per-consensus position once the model
1562 * architecture is known, after the full model is read */
1563 char *tmp_rf_left = NULL;
1564 char *tmp_rf_right = NULL;
1565 char *tmp_cons_left = NULL;
1566 char *tmp_cons_right = NULL;
1567 int *tmp_map_left = NULL;
1568 int *tmp_map_right = NULL;
1569
1570 cmfp->errbuf[0] = '\0';
1571
1572 if (cmfp->newly_opened)
1573 {
1574 offset = 0;
1575 cmfp->newly_opened = FALSE;
1576 }
1577 else
1578 {
1579 /* Record where this CM starts on disk */
1580 if ((! cmfp->do_stdin) && (! cmfp->do_gzip) && (offset = ftello(cmfp->f)) < 0) ESL_XEXCEPTION(eslESYS, "ftello() failed");
1581
1582 /* First line of file: "INFERNAL1/a". Allocate shell for CM annotation information (we don't know M,nodes yet) */
1583 if ((status = esl_fileparser_NextLine(cmfp->efp)) != eslOK) goto ERROR; /* EOF here is normal; could also be a thrown EMEM */
1584 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tag, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "unexpected absence of tokens on data line");
1585
1586 if (cmfp->format == CM_FILE_1a) { if (strcmp(tag, "INFERNAL1/a") != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Didn't find INFERNAL1/a tag: bad format or not an INFERNAL save file?"); }
1587 else ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No such CM file format code: this shouldn't happen");
1588 }
1589
1590 if ((cm = CreateCMShell()) == NULL) ESL_XFAIL(eslEMEM, cmfp->errbuf, "allocation failure, CM shell");
1591 cm->offset = offset;
1592
1593 /* Header section */
1594 while ((status = esl_fileparser_NextLine(cmfp->efp)) == eslOK)
1595 {
1596 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tag, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of line");
1597
1598 if (strcmp(tag, "NAME") == 0) {
1599 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No name found on NAME line");
1600 cm_SetName(cm, tok1);
1601 }
1602
1603 else if (strcmp(tag, "ACC") == 0) {
1604 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No accession found on ACC line");
1605 cm_SetAccession(cm, tok1);
1606 }
1607
1608 else if (strcmp(tag, "DESC") == 0) {
1609 if ((status = esl_fileparser_GetRemainingLine(cmfp->efp, &tok1)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No description found on DESC line");
1610 cm_SetDescription(cm, tok1);
1611 }
1612
1613 else if (strcmp(tag, "STATES") == 0) {
1614 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Number of model states not found on STATES line");
1615 if ((cm->M = atoi(tok1)) == 0) ESL_XFAIL(status, cmfp->errbuf, "Invalid number of states %s on STATES line", tok1);
1616 }
1617
1618 else if (strcmp(tag, "NODES") == 0) {
1619 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Number of model nodes not found on NODES line");
1620 if ((cm->nodes = atoi(tok1)) == 0) ESL_XFAIL(status, cmfp->errbuf, "Invalid number of nodes %s on NODES line", tok1);
1621 }
1622
1623 else if (strcmp(tag, "CLEN") == 0) {
1624 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No consensus length found on CLEN line");
1625 if ((cm->clen = atoi(tok1)) == 0) ESL_XFAIL(status, cmfp->errbuf, "Invalid consensus length %s on CLEN line", tok1);
1626 }
1627
1628 else if (strcmp(tag, "W") == 0) {
1629 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No consensus length found on W line");
1630 if ((cm->W = atoi(tok1)) == 0) ESL_XFAIL(status, cmfp->errbuf, "Invalid consensus length %s on W line", tok1);
1631 }
1632
1633 else if (strcmp(tag, "ALPH") == 0) {
1634 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No alphabet type found on ALPH");
1635 if ((alphatype = esl_abc_EncodeType(tok1)) == eslUNKNOWN) ESL_XFAIL(status, cmfp->errbuf, "Unrecognized alphabet type %s", tok1);
1636 if (*ret_abc == NULL) {
1637 if ((abc = esl_alphabet_Create(alphatype)) == NULL) ESL_XFAIL(eslEMEM, cmfp->errbuf, "Failed to create alphabet");
1638 } else {
1639 if ((*ret_abc)->type != alphatype) ESL_XFAIL(eslEINCOMPAT,cmfp->errbuf,"Alphabet type mismatch: was %s, but current CM says %s", esl_abc_DecodeType( (*ret_abc)->type), tok1);
1640 abc = *ret_abc;
1641 }
1642 }
1643
1644 else if (strcmp(tag, "RF") == 0) {
1645 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No yes/no found for RF line");
1646 if (strcasecmp(tok1, "yes") == 0) cm->flags |= CMH_RF;
1647 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "RF header line must say yes/no, not %s", tok1);
1648 }
1649
1650 else if (strcmp(tag, "CONS") == 0) {
1651 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No yes/no found for CONS line");
1652 if (strcasecmp(tok1, "yes") == 0) cm->flags |= CMH_CONS;
1653 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "CONS header line must say yes/no, not %s", tok1);
1654 }
1655
1656 else if (strcmp(tag, "MAP") == 0) {
1657 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No yes/no found for MAP line");
1658 if (strcasecmp(tok1, "yes") == 0) cm->flags |= CMH_MAP;
1659 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MAP header line must say yes/no, not %s", tok1);
1660 }
1661
1662 else if (strcmp(tag, "DATE") == 0) {
1663 if ((status = esl_fileparser_GetRemainingLine(cmfp->efp, &tok1)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No date found on DATE line");
1664 if (esl_strdup(tok1, -1, &(cm->ctime)) != eslOK) ESL_XFAIL(eslEMEM, cmfp->errbuf, "strdup() failed to set date");
1665 }
1666
1667 else if (strcmp(tag, "COM") == 0) {
1668 /* just skip the first token; it's something like [1], numbering the command lines */
1669 if ((status = esl_fileparser_GetTokenOnLine (cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No command number on COM line");
1670 if ((status = esl_fileparser_GetRemainingLine(cmfp->efp, &tok1)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "No command on COM line");
1671 if (cm->comlog == NULL) {
1672 if (esl_strdup(tok1, -1, &(cm->comlog)) != eslOK) ESL_XFAIL(eslEMEM, cmfp->errbuf, "esl_strdup() failed");
1673 } else {
1674 if (esl_strcat(&(cm->comlog), -1, "\n", -1) != eslOK) ESL_XFAIL(eslEMEM, cmfp->errbuf, "esl_strcat() failed");
1675 if (esl_strcat(&(cm->comlog), -1, tok1, -1) != eslOK) ESL_XFAIL(eslEMEM, cmfp->errbuf, "esl_strcat() failed");
1676 }
1677 }
1678
1679 else if (strcmp(tag, "PBEGIN") == 0) {
1680 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows PBEGIN tag");
1681 if ((cm->pbegin = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on PBEGIN line: should be a positive real number, not %s", tok1);
1682 }
1683
1684 else if (strcmp(tag, "PEND") == 0) {
1685 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows PEND tag");
1686 if ((cm->pend = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on PEND line: should be a positive real number, not %s", tok1);
1687 }
1688
1689 else if (strcmp(tag, "WBETA") == 0) {
1690 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows WBETA tag");
1691 if ((cm->beta_W = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on WBETA line: should be a positive real number, not %s", tok1);
1692 }
1693
1694 else if (strcmp(tag, "QDBBETA1") == 0) {
1695 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows QDBBETA tag");
1696 if ((tmp_qdbbeta1 = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on QDBBETA line: should be a positive real number, not %s", tok1);
1697 }
1698
1699 else if (strcmp(tag, "QDBBETA2") == 0) {
1700 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows QDBBETA tag");
1701 if ((tmp_qdbbeta2 = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on QDBBETA line: should be a positive real number, not %s", tok1);
1702 }
1703
1704 else if (strcmp(tag, "N2OMEGA") == 0) {
1705 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows N2OMEGA tag");
1706 if ((cm->null2_omega = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid omega on N2OMEGA line: should be a positive real number, not %s", tok1);
1707 }
1708
1709 else if (strcmp(tag, "N3OMEGA") == 0) {
1710 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows N3OMEGA tag");
1711 if ((cm->null3_omega = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid omega on N3OMEGA line: should be a positive real number, not %s", tok1);
1712 }
1713
1714 else if (strcmp(tag, "ELSELF") == 0) {
1715 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows ELSELF tag");
1716 cm->el_selfsc = atof(tok1);
1717 read_el_selfsc = TRUE;
1718 }
1719
1720 else if (strcmp(tag, "NSEQ") == 0) {
1721 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows NSEQ tag");
1722 if ((cm->nseq = atoi(tok1)) == 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid nseq on NSEQ line: should be integer, not %s", tok1);
1723 }
1724
1725 else if (strcmp(tag, "EFFN") == 0) {
1726 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows EFFN tag");
1727 if ((cm->eff_nseq = atof(tok1)) < (-1.*eslSMALLX1)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid eff_nseq on EFFN line: should be a positive real number, not %s", tok1);
1728 }
1729
1730 else if (strcmp(tag, "CKSUM") == 0) {
1731 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Nothing follows CKSUM tag");
1732 cm->checksum = atoll(tok1); /* if atoi(), then you may truncate uint32_t checksums > 2^31-1 */
1733 cm->flags |= CMH_CHKSUM;
1734 }
1735
1736 else if (strcmp(tag, "NULL") == 0) {
1737 if(abc == NULL) ESL_XFAIL(status, cmfp->errbuf, "Read NULL line before ALPH line, ALPH line must come first");
1738 ESL_ALLOC(tmp_null, sizeof(float) * abc->K);
1739 for (x = 0; x < abc->K; x++) {
1740 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on NULL line");
1741 tmp_null[x] = ascii2prob(tok1, (1./(float) abc->K));
1742 }
1743 }
1744
1745 else if (strcmp(tag, "GA") == 0) {
1746 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on GA line");
1747 cm->ga = atof(tok1);
1748 cm->flags |= CMH_GA;
1749 }
1750
1751 else if (strcmp(tag, "TC") == 0) {
1752 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on TC line");
1753 cm->tc = atof(tok1);
1754 cm->flags |= CMH_TC;
1755 }
1756
1757 else if (strcmp(tag, "NC") == 0) {
1758 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on NC line");
1759 cm->nc = atof(tok1);
1760 cm->flags |= CMH_NC;
1761 }
1762
1763 else if (strcmp(tag, "EFP7GF") == 0) {
1764 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on EFP7GF line"); /* tau */
1765 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on EFP7GF line"); /* lambda */
1766 tmp_fp7_gfmu = atof(tok1);
1767 tmp_fp7_gflambda = atof(tok2);
1768 read_fp7_stats = TRUE;
1769 }
1770
1771 else if (strncmp(tag, "ECM", 3) == 0) { /* one of 4 possible CM E-value lines */
1772 /* determine which one */
1773 if (strncmp(tag+3, "LC", 2) == 0) { exp_mode = EXP_CM_LC; cm_statstracker += 1; }
1774 else if (strncmp(tag+3, "GC", 2) == 0) { exp_mode = EXP_CM_GC; cm_statstracker += 2; }
1775 else if (strncmp(tag+3, "LI", 2) == 0) { exp_mode = EXP_CM_LI; cm_statstracker += 4; }
1776 else if (strncmp(tag+3, "GI", 2) == 0) { exp_mode = EXP_CM_GI; cm_statstracker += 8; }
1777 else { ESL_XFAIL(status, cmfp->errbuf, "Invalid tag beginning with ECM"); }
1778 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on ECM.. line"); /* lambda */
1779 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on ECM.. line"); /* mu_extrap */
1780 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok3, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on ECM.. line"); /* mu_orig */
1781 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok4, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on ECM.. line"); /* dbsize */
1782 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok5, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on ECM.. line"); /* nrandhits */
1783 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok6, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on ECM.. line"); /* tailp */
1784 if (cm->expA == NULL) {
1785 ESL_ALLOC(cm->expA, sizeof(ExpInfo_t *) * EXP_NMODES);
1786 for(x = 0; x < EXP_NMODES; x++) { cm->expA[x] = CreateExpInfo(); }
1787 }
1788 cm->expA[exp_mode]->lambda = atof(tok1);
1789 cm->expA[exp_mode]->mu_extrap = atof(tok2);
1790 cm->expA[exp_mode]->mu_orig = atof(tok3);
1791 cm->expA[exp_mode]->dbsize = atof(tok4); /* store as double, even though it was written as a long */
1792 cm->expA[exp_mode]->nrandhits = atoi(tok5);
1793 cm->expA[exp_mode]->tailp = atof(tok6);
1794 cm->expA[exp_mode]->is_valid = TRUE;
1795 }
1796 else if (strcmp(tag, "CM") == 0) {
1797 /* skip the remainder of this line */
1798 if ((status = esl_fileparser_NextLine(cmfp->efp)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of data before main model section");
1799 break;
1800 }
1801 } /* end, loop over possible header tags */
1802 if (status != eslOK) goto ERROR;
1803
1804 /* Done reading the header information.
1805 * Check that everything is ok and mandatory info is present before moving on.
1806 */
1807 if (cm->M < 1) ESL_XFAIL(status, cmfp->errbuf, "Failed to read STATES line in header section");
1808 if (cm->nodes < 1) ESL_XFAIL(status, cmfp->errbuf, "Failed to read NODES line in header section");
1809 if (cm->clen < 1) ESL_XFAIL(status, cmfp->errbuf, "Failed to read CLEN line in header section");
1810 if (cm->W < 1) ESL_XFAIL(status, cmfp->errbuf, "Failed to read W line in header section");
1811 if (! read_el_selfsc) ESL_XFAIL(status, cmfp->errbuf, "Failed to read ELSELF line in header section");
1812 if (! read_fp7_stats) ESL_XFAIL(status, cmfp->errbuf, "Failed to read EFP7GF line in header section");
1813 if (cm->name == NULL) ESL_XFAIL(status, cmfp->errbuf, "Failed to read NAME line in header section");
1814 if (abc == NULL) ESL_XFAIL(status, cmfp->errbuf, "Failed to read ALPH line in header section");
1815 if (tmp_null == NULL) ESL_XFAIL(status, cmfp->errbuf, "Failed to read NULL line in header section");
1816
1817 /* Check to make sure we parsed CM E-value stats correctly.
1818 */
1819 if (cm->expA != NULL) {
1820 if (cm_statstracker == 15) cm->flags |= CMH_EXPTAIL_STATS;
1821 else if (cm_statstracker != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Missing one or more ECM.. parameter lines");
1822 }
1823
1824 /* Allocate body of CM now that # states (M) and # nodes (nnodes) are known */
1825 CreateCMBody(cm, cm->nodes, cm->M, cm->clen, abc);
1826
1827 /* Copy values we stored in temp parameters awaiting CM allocation in CreateCMBody() */
1828 esl_vec_FCopy(tmp_null, abc->K, cm->null); /* cm->null allocated in CreateCMBody */
1829 cm->qdbinfo->beta1 = tmp_qdbbeta1;
1830 cm->qdbinfo->beta2 = tmp_qdbbeta2;
1831
1832 /* Allocate and initialize temporary parameters for values we can't store
1833 * until after we've read the full model and are able to construct an emit map */
1834 ESL_ALLOC(tmp_map_left, sizeof(int) * cm->nodes);
1835 ESL_ALLOC(tmp_map_right, sizeof(int) * cm->nodes);
1836 esl_vec_ISet(tmp_map_left, cm->nodes, -1);
1837 esl_vec_ISet(tmp_map_right, cm->nodes, -1);
1838
1839 ESL_ALLOC(tmp_rf_left, sizeof(char) * (cm->nodes+1));
1840 ESL_ALLOC(tmp_rf_right, sizeof(char) * (cm->nodes+1));
1841 tmp_rf_left[cm->nodes] = '\0';
1842 tmp_rf_right[cm->nodes] = '\0';
1843
1844 ESL_ALLOC(tmp_cons_left, sizeof(char) * (cm->nodes+1));
1845 ESL_ALLOC(tmp_cons_right, sizeof(char) * (cm->nodes+1));
1846 tmp_cons_left[cm->nodes] = '\0';
1847 tmp_cons_right[cm->nodes] = '\0';
1848
1849 nd = -1;
1850 cm->clen = 0;
1851
1852 for (v = 0; v < cm->M; v++)
1853 {
1854 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of data before main model section");
1855
1856 /* Ah, a node line. Process it and get the following line.
1857 */
1858 if (*tok1 == '[')
1859 {
1860 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 1);
1861 if ((x = NodeCode(tok1)) == -1) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid node type %s", tok1);
1862 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 2);
1863 if (!is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid node index on node line: should be integer >= 0, not %s", tok1);
1864 nd = atoi(tok1);
1865 if (nd < 0) ESL_XFAIL(status, cmfp->errbuf, "Invalid node index on node line: should be integer >= 0, not %s", tok1);
1866 if (nd >= cm->nodes) ESL_XFAIL(status, cmfp->errbuf, "Invalid node index on node line: should not exceed %d, read %s", cm->nodes, tok1);
1867 cm->ndtype[nd] = x;
1868 if (cm->ndtype[nd] == MATP_nd) cm->clen+=2;
1869 else if(cm->ndtype[nd] == MATL_nd) cm->clen++;
1870 else if(cm->ndtype[nd] == MATR_nd) cm->clen++;
1871 cm->nodemap[nd] = v;
1872
1873 /* chew up ']' */
1874 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 3);
1875
1876 /* read annotation: MAP, consensus sequence and RF. Proper format depends on node type.
1877 */
1878 /* MAP (optional: CMH_MAP? yes, else no */
1879 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 4);
1880 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 5);
1881 if ((cm->flags & CMH_MAP) && cm->ndtype[nd] == MATP_nd) {
1882 if (!is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st MAP value on MATP node line: should be positive integer, not %s", tok1);
1883 if (!is_integer(tok2)) ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd MAP value on MATP node line: should be positive integer, not %s", tok2);
1884 tmp_map_left[nd] = atoi(tok1);
1885 tmp_map_right[nd] = atoi(tok2);
1886 }
1887 else if((cm->flags & CMH_MAP) && cm->ndtype[nd] == MATL_nd) {
1888 if (!is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st MAP value on MATL node line: should be positive integer, not %s", tok1);
1889 if (*tok2 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd MAP value on MATL node line: should be '-', not %s", tok2);
1890 tmp_map_left[nd] = atoi(tok1);
1891 tmp_map_right[nd] = -1;
1892 }
1893 else if((cm->flags & CMH_MAP) && cm->ndtype[nd] == MATR_nd) {
1894 if (*tok1 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st MAP value on MATR node line: should be '-', not %s", tok1);
1895 if (!is_integer(tok2)) ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd MAP value on MATR node line: should be positive integer, not %s", tok2);
1896 tmp_map_left[nd] = -1;
1897 tmp_map_right[nd] = atoi(tok2);
1898 }
1899 else { /* either (! (cm->flags & CMH_MAP)) or ndtype is not MATP, MATL nor MATR */
1900 if (*tok1 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st MAP value on node line: should be '-', not %s", tok1);
1901 if (*tok2 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd MAP value on node line: should be '-', not %s", tok2);
1902 tmp_map_left[nd] = -1;
1903 tmp_map_right[nd] = -1;
1904 }
1905
1906 /* consensus sequence (optional: CMH_CONS? yes, else no */
1907 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 6);
1908 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 7);
1909 if ((cm->flags & CMH_CONS) && (cm->ndtype[nd] == MATP_nd)) {
1910 tmp_cons_left[nd] = *tok1;
1911 tmp_cons_right[nd] = *tok2;
1912 }
1913 else if((cm->flags & CMH_CONS) && cm->ndtype[nd] == MATL_nd) {
1914 if (*tok2 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd consensus character on MATL node line: should be '-', not %s", tok2);
1915 tmp_cons_left[nd] = *tok1;
1916 tmp_cons_right[nd] = *tok2;
1917 }
1918 else if((cm->flags & CMH_CONS) && cm->ndtype[nd] == MATR_nd) {
1919 if (*tok1 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st consensus character on MATR node line: should be '-', not %s", tok1);
1920 tmp_cons_left[nd] = *tok1;
1921 tmp_cons_right[nd] = *tok2;
1922 }
1923 else { /* either (! (cm->flags & CMH_CONS) or ndtype is not MATP, MATL nor MATR */
1924 if (*tok1 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st consensus character on node line: should be '-', not %s", tok1);
1925 if (*tok2 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd consensus character on node line: should be '-', not %s", tok2);
1926 tmp_cons_left[nd] = *tok1;
1927 tmp_cons_right[nd] = *tok2;
1928 }
1929
1930 /* RF (optional: CMH_RF? yes, else no */
1931 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 8);
1932 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 9);
1933 if ((cm->flags & CMH_RF) && cm->ndtype[nd] == MATP_nd) {
1934 tmp_rf_left[nd] = *tok1;
1935 tmp_rf_right[nd] = *tok2;
1936 }
1937 else if((cm->flags & CMH_RF) && cm->ndtype[nd] == MATL_nd) {
1938 if (*tok2 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd RF character on MATL node line: should be '-', not %s", tok2);
1939 tmp_rf_left[nd] = *tok1;
1940 tmp_rf_right[nd] = *tok2;
1941 }
1942 else if((cm->flags & CMH_RF) && cm->ndtype[nd] == MATR_nd) {
1943 if (*tok1 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st RF character on MATR node line: should be '-', not %s", tok1);
1944 tmp_rf_left[nd] = *tok1;
1945 tmp_rf_right[nd] = *tok2;
1946 }
1947 else { /* either (! (cm->flags & CMH_RF)) or ndtype is not MATP, MATL nor MATR */
1948 if (*tok1 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 1st RF character on node line: should be '-', not %s", tok1);
1949 if (*tok2 != '-') ESL_XFAIL(status, cmfp->errbuf, "Invalid 2nd RF character on node line: should be '-', not %s", tok2);
1950 tmp_rf_left[nd] = *tok1;
1951 tmp_rf_right[nd] = *tok2;
1952 }
1953 if ((status = esl_fileparser_NextLine(cmfp->efp)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of data in main model: no state %d line", v);
1954 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state line: expected at least %d, got %d", 8, 0);
1955 } /* done with node line */
1956
1957 /* Process state line.
1958 * <statecode> <v> <plast> <pnum> <cfirst> <cnum> <dmin2> <dmin1> <dmax1> <dmax2> <transition probs (variable number)> <emission probs (variable number)>
1959 */
1960
1961 /* <statecode> */
1962 if((cm->sttype[v] = StateCode(tok1)) == -1) ESL_XFAIL(status, cmfp->errbuf, "Invalid state type %s\n", tok1);
1963
1964 /* <v> */
1965 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 1);
1966 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid state index on state line: should be integer >= 0, not %s", tok1);
1967 if (atoi(tok1) != v) ESL_XFAIL(status, cmfp->errbuf, "Invalid state index on state line: should be %d, not %s", v, tok1);
1968
1969 /* <plast> */
1970 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 2);
1971 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid plast value on state line: should be integer, not %s", tok1);
1972 cm->plast[v] = atoi(tok1);
1973
1974 /* <pnum> */
1975 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 3);
1976 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid pnum value on state line: should be integer, not %s", tok1);
1977 cm->pnum[v] = atoi(tok1);
1978
1979 /* <cfirst> */
1980 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 4);
1981 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid cfirst value on state line: should be integer, not %s", tok1);
1982 cm->cfirst[v] = atoi(tok1);
1983
1984 /* <cnum> */
1985 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 5);
1986 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid cnum value on state line: should be integer, not %s", tok1);
1987 cm->cnum[v] = atoi(tok1);
1988
1989 /* <dmin2> */
1990 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 6);
1991 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid dmin2 value on state line: should be integer, not %s", tok1);
1992 cm->qdbinfo->dmin2[v] = atoi(tok1);
1993
1994 /* <dmin1> */
1995 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 6);
1996 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid dmin1 value on state line: should be integer, not %s", tok1);
1997 cm->qdbinfo->dmin1[v] = atoi(tok1);
1998
1999 /* <dmax1> */
2000 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 7);
2001 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid dmax1 value on state line: should be integer, not %s", tok1);
2002 cm->qdbinfo->dmax1[v] = atoi(tok1);
2003
2004 /* <dmax2> */
2005 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 7);
2006 if (! is_integer(tok1)) ESL_XFAIL(status, cmfp->errbuf, "Invalid dmax2 value on state line: should be integer, not %s", tok1);
2007 cm->qdbinfo->dmax2[v] = atoi(tok1);
2008
2009 /* Transition probabilities. */
2010 if (cm->sttype[v] != B_st) {
2011 for (x = 0; x < cm->cnum[v]; x++) {
2012 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8+cm->cnum[v], 8+x);
2013 if ((! is_real(tok1) && (*tok1 != '*'))) ESL_XFAIL(status, cmfp->errbuf, "Invalid transition score %d on state line: should be real number or '*', not %s", x+1, tok1);
2014 cm->t[v][x] = ascii2prob(tok1, 1.);
2015 }
2016 }
2017 /* Emission probabilities. */
2018 if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st || cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
2019 for (x = 0; x < cm->abc->K; x++) {
2020 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected %d, got %d", v, 8+cm->cnum[v]+cm->abc->K, 8+cm->cnum[v]+x);
2021 if ((! is_real(tok1) && (*tok1 != '*'))) ESL_XFAIL(status, cmfp->errbuf, "Invalid emission score %d on state line: should be real number or '*', not %s", x+1, tok1);
2022 cm->e[v][x] = ascii2prob(tok1, cm->null[x]);
2023 }
2024 }
2025 else if (cm->sttype[v] == MP_st) {
2026 for (x = 0; x < cm->abc->K; x++) {
2027 for (y = 0; y < cm->abc->K; y++) {
2028 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Too few fields on state %d line: expected %d, got %d", v, 8+cm->cnum[v]+(cm->abc->K*cm->abc->K), 8+cm->cnum[v]+(x*cm->abc->K)+y);
2029 if ((! is_real(tok1) && (*tok1 != '*'))) ESL_XFAIL(status, cmfp->errbuf, "Invalid emission score %d on state line: should be real number or '*', not %s", (x*cm->abc->K)+y+1, tok1);
2030 cm->e[v][x*cm->abc->K+y] = ascii2prob(tok1, cm->null[x]*cm->null[y]);
2031 }
2032 }
2033 }
2034 cm->ndidx[v] = nd;
2035 cm->stid[v] = DeriveUniqueStateCode(cm->ndtype[nd], cm->sttype[v]);
2036 if ((status = esl_fileparser_NextLine(cmfp->efp)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of data in main model: no state %d line", v+1);
2037 } /* end of loop over states */
2038 /* The closing // */
2039 if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of data: missing //?");
2040 if (strcmp(tok1, "//") != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Expected closing //; found %s instead", tok1);
2041
2042 /* Finally, read the filter HMM for this CM, unless we're explicitly told not to */
2043 if(read_fp7) {
2044 if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2045 if((fp7_offset = ftello(cmfp->f)) < 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "ftello() failed");
2046 }
2047 else {
2048 fp7_offset = -1; /* this is irrelevant if stdin or gzip mode, cm_p7_hmmfile_Read will ignore it */
2049 }
2050 if((status = cm_p7_hmmfile_Read(cmfp, abc, fp7_offset, &hmm)) != eslOK) goto ERROR;
2051 if((status = cm_SetFilterHMM(cm, hmm, tmp_fp7_gfmu, tmp_fp7_gflambda)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Unable to set filter HMM for CM");
2052 if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2053 /* now position the CM file to the end of the HMM we just read */
2054 if (fseeko(cmfp->f, ftello(cmfp->hfp->f), SEEK_SET) != 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "Failed to set position for CM file parser after reading filter HMMs");
2055 }
2056 }
2057
2058 CMRenormalize(cm);
2059 cm->qdbinfo->setby = CM_QDBINFO_SETBY_CMFILE;
2060 cm->W_setby = CM_W_SETBY_CMFILE;
2061
2062 /* Create emit map now that we know the model architecture */
2063 cm->emap = CreateEmitMap(cm);
2064 if(cm->emap == NULL) ESL_XFAIL(eslEINVAL, cmfp->errbuf, "After reading complete model, failed to create an emit map");
2065
2066 /* Use emit map to map the per-node annotation to per-consensus position */
2067 if (cm->flags & CMH_RF) {
2068 cm->rf[0] = ' ';
2069 for(nd = 0; nd < cm->nodes; nd++) {
2070 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cm->rf[cm->emap->lpos[nd]] = tmp_rf_left[nd];
2071 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cm->rf[cm->emap->rpos[nd]] = tmp_rf_right[nd];
2072 }
2073 cm->rf[cm->clen+1] = '\0';
2074 }
2075 if (cm->flags & CMH_CONS) {
2076 cm->consensus[0] = ' ';
2077 for(nd = 0; nd < cm->nodes; nd++) {
2078 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cm->consensus[cm->emap->lpos[nd]] = tmp_cons_left[nd];
2079 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cm->consensus[cm->emap->rpos[nd]] = tmp_cons_right[nd];
2080 }
2081 cm->consensus[cm->clen+1] = '\0';
2082 }
2083 if (cm->flags & CMH_MAP) {
2084 cm->map[0] = 0;
2085 for(nd = 0; nd < cm->nodes; nd++) {
2086 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cm->map[cm->emap->lpos[nd]] = tmp_map_left[nd];
2087 if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cm->map[cm->emap->rpos[nd]] = tmp_map_right[nd];
2088 }
2089 }
2090
2091 if(tmp_null != NULL) free(tmp_null);
2092
2093 /* these get allocated regardless of flag status, free them */
2094 free(tmp_rf_left);
2095 free(tmp_rf_right);
2096 free(tmp_cons_left);
2097 free(tmp_cons_right);
2098 free(tmp_map_left);
2099 free(tmp_map_right);
2100
2101 if (*ret_abc == NULL) *ret_abc = abc;
2102 if ( opt_cm != NULL) *opt_cm = cm; else FreeCM(cm);
2103 return eslOK;
2104
2105 ERROR:
2106 if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc);
2107 if (cm != NULL) FreeCM(cm);
2108 if (opt_cm != NULL) *opt_cm = NULL;
2109 if (status == eslEMEM || status == eslESYS) return status;
2110 else if (status == eslEOF) return status;
2111 else if (status == eslEINCOMPAT) return status;
2112 else return eslEFORMAT; /* anything else is a format error: includes premature EOF, EOL, EOD */
2113 }
2114
2115
2116 static int
read_bin_1p1_cm(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)2117 read_bin_1p1_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm)
2118 {
2119 ESL_ALPHABET *abc = NULL;
2120 CM_t *cm = NULL;
2121 P7_HMM *hmm = NULL;
2122 uint32_t magic;
2123 int alphabet_type;
2124 int v, x;
2125 off_t offset = 0;
2126 off_t fp7_offset = 0;
2127 int status;
2128 float tmp_fp7_gfmu;
2129 float tmp_fp7_gflambda;
2130
2131 cmfp->errbuf[0] = '\0';
2132 if (feof(cmfp->f)) { status = eslEOF; goto ERROR; }
2133
2134 if (cmfp->newly_opened)
2135 {
2136 offset = 0;
2137 cmfp->newly_opened = FALSE;
2138 }
2139 else
2140 { /* Check magic. */
2141 if ((!cmfp->do_stdin) && (! cmfp->do_gzip)) {
2142 if ((offset = ftello(cmfp->f)) < 0) ESL_XEXCEPTION(eslESYS, "ftello() failed");
2143 }
2144 if (! fread((char *) &magic, sizeof(uint32_t), 1, cmfp->f)) { status = eslEOF; goto ERROR; }
2145
2146 if (cmfp->format == CM_FILE_1a) { if (magic != v1a_magic) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "bad magic number at start of CM"); }
2147 else ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "no such CM file format code");
2148 }
2149
2150 /* Allocate shell of the new CM.
2151 * Two-step allocation lets us read/set the flags first;
2152 * then the later CreateBody() call will allocate optional internal fields we need.
2153 */
2154 if ((cm = CreateCMShell()) == NULL) ESL_XFAIL(eslEMEM, cmfp->errbuf, "allocation failure, CM shell");
2155 cm->offset = offset;
2156
2157 /* Get sizes of things */
2158 if (! fread((char *) &(cm->flags), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read flags");
2159
2160 /* Important step that's easy to overlook: lower flags that may have
2161 * been up when CM was output to file, but that won't be true of the
2162 * CM we're about to read (since not all CM parameters go into the
2163 * file).
2164 */
2165 cm->flags &= ~CMH_BITS;
2166 cm->flags &= ~CMH_CP9;
2167 cm->flags &= ~CMH_CP9_TRUNC;
2168 cm->flags &= ~CMH_MLP7;
2169 cm->flags &= ~CM_IS_CONFIGURED;
2170
2171 if (! fread((char *) &(cm->M), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read number of states");
2172 if (! fread((char *) &(cm->nodes), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read number of nodes");
2173 if (! fread((char *) &(cm->clen), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read consensus length");
2174 if (! fread((char *) &alphabet_type, sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read alphabet_type");
2175
2176 /* Set or verify alphabet. */
2177 if (*ret_abc == NULL) { /* still unknown: set it, pass control of it back to caller */
2178 if ((abc = esl_alphabet_Create(alphabet_type)) == NULL) ESL_XFAIL(eslEMEM, cmfp->errbuf, "allocation failed, alphabet");
2179 } else { /* already known: check it */
2180 abc = *ret_abc;
2181 if (abc->type != alphabet_type) ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "Alphabet type mismatch: was %s, but current HMM says %s", esl_abc_DecodeType( abc->type), esl_abc_DecodeType(alphabet_type));
2182 }
2183
2184 /* Finish the allocation of the CM
2185 */
2186 CreateCMBody(cm, cm->nodes, cm->M, cm->clen, abc);
2187
2188 /* Core model probabilities. */
2189 if (! fread((char *) cm->sttype, sizeof(char), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read sttype array");
2190 if (! fread((char *) cm->ndidx, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read ndidx array");
2191 if (! fread((char *) cm->stid, sizeof(char), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read stid array");
2192 if (! fread((char *) cm->cfirst, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read cfirst array");
2193 if (! fread((char *) cm->cnum, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read cnum array");
2194 if (! fread((char *) cm->plast, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read plast array");
2195 if (! fread((char *) cm->pnum, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read pnum array");
2196 if (! fread((char *) cm->nodemap, sizeof(int), cm->nodes, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read nodemap array");
2197 if (! fread((char *) cm->ndtype, sizeof(char), cm->nodes, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read ndtype array");
2198 if (! fread((char *) cm->qdbinfo->dmin1, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmin1 array");
2199 if (! fread((char *) cm->qdbinfo->dmax1, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmax1 array");
2200 if (! fread((char *) cm->qdbinfo->dmin2, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmin2 array");
2201 if (! fread((char *) cm->qdbinfo->dmax2, sizeof(int), cm->M, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmax2 array");
2202
2203 cm->qdbinfo->setby = CM_QDBINFO_SETBY_CMFILE;
2204 cm->W_setby = CM_W_SETBY_CMFILE;
2205
2206 /* Create emit map now that we know the model architecture */
2207 cm->emap = CreateEmitMap(cm);
2208 if(cm->emap == NULL) ESL_XFAIL(eslEINVAL, cmfp->errbuf, "After reading complete model, failed to create an emit map");
2209
2210 for (v = 0; v < cm->M; v++) {
2211 if (! fread((char *) cm->t[v], sizeof(float), MAXCONNECT, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read transitions for state %d", v);
2212 if (! fread((char *) cm->e[v], sizeof(float), cm->abc->K*cm->abc->K,cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read emissions for state %d", v);
2213 }
2214
2215 /* Annotations. */
2216 if (read_bin_string(cmfp->f, &(cm->name)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read name");
2217 if ((cm->flags & CMH_ACC) && read_bin_string(cmfp->f, &(cm->acc)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read acc");
2218 if ((cm->flags & CMH_DESC) && read_bin_string(cmfp->f, &(cm->desc)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read desc");
2219 if ((cm->flags & CMH_RF) && ! fread((char *) cm->rf, sizeof(char), cm->clen+2, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read rf"); /* +2: 1..M and trailing \0 */
2220 if ((cm->flags & CMH_CONS) && ! fread((char *) cm->consensus, sizeof(char), cm->clen+2, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read consensus"); /* don't need to test for >=3e format, because the flag is sufficient (didn't exist pre-3e) */
2221 if ((cm->flags & CMH_MAP) && ! fread((char *) cm->map, sizeof(int), cm->clen+1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read map");
2222 if (! fread((char *) &(cm->W), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read nseq");
2223
2224 if (read_bin_string(cmfp->f, &(cm->ctime)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read ctime");
2225 if (read_bin_string(cmfp->f, &(cm->comlog)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read comlog");
2226
2227 if (! fread((char *) &(cm->pbegin), sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read beta_W");
2228 if (! fread((char *) &(cm->pend), sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read beta_W");
2229 if (! fread((char *) &(cm->beta_W), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read beta_W");
2230 if (! fread((char *) &(cm->qdbinfo->beta1), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read betaqdb1");
2231 if (! fread((char *) &(cm->qdbinfo->beta2), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read betaqdb2");
2232 if (! fread((char *) &(cm->null2_omega), sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read null2_omega");
2233 if (! fread((char *) &(cm->null3_omega), sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read null3_omega");
2234 if (! fread((char *) &(cm->el_selfsc), sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read el_selfsc");
2235 if (! fread((char *) &(cm->nseq), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read nseq");
2236 if (! fread((char *) &(cm->eff_nseq), sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read eff_nseq");
2237 if (! fread((char *) &(cm->checksum), sizeof(uint32_t), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read checksum");
2238 if (! fread((char *) cm->null, sizeof(float), cm->abc->K, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read null vector");
2239
2240 /* Rfam cutoffs */
2241 if ((cm->flags & CMH_GA) && (! fread((char *) &(cm->ga), sizeof(float), 1, cmfp->f))) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read GA cutoff");
2242 if ((cm->flags & CMH_TC) && (! fread((char *) &(cm->tc), sizeof(float), 1, cmfp->f))) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read TC cutoff");
2243 if ((cm->flags & CMH_NC) && (! fread((char *) &(cm->nc), sizeof(float), 1, cmfp->f))) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read NC cutoff");
2244
2245 /* E-value parameters */
2246 if (cm->flags & CMH_FP7) { /* should always be true */
2247 if (! fread((char *) &tmp_fp7_gfmu, sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read additional P7 E-value stats");
2248 if (! fread((char *) &tmp_fp7_gflambda, sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read additional P7 E-value stats");
2249 }
2250 if (cm->flags & CMH_EXPTAIL_STATS) {
2251 long dbsize_long;
2252 ESL_ALLOC(cm->expA, sizeof(ExpInfo_t *) * EXP_NMODES);
2253 for(x = 0; x < EXP_NMODES; x++) {
2254 cm->expA[x] = CreateExpInfo();
2255 if (! fread((char *) &(cm->expA[x]->lambda), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2256 if (! fread((char *) &(cm->expA[x]->mu_extrap), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2257 if (! fread((char *) &(cm->expA[x]->mu_orig), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2258 if (! fread((char *) &(dbsize_long), sizeof(long), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2259 cm->expA[x]->dbsize = (double) dbsize_long;
2260 if (! fread((char *) &(cm->expA[x]->nrandhits), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2261 if (! fread((char *) &(cm->expA[x]->tailp), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2262 }
2263 }
2264
2265 /* Finally, read the filter HMM for this CM, unless we're explicitly asked not to. */
2266 if(read_fp7) {
2267 if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2268 if((fp7_offset = ftello(cmfp->f)) < 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "ftello() failed");
2269 }
2270 else {
2271 fp7_offset = -1; /* this is irrelevant if stdin or gzip mode, cm_p7_hmmfile_Read will ignore it */
2272 }
2273 if((status = cm_p7_hmmfile_Read(cmfp, abc, fp7_offset, &hmm)) != eslOK) goto ERROR;
2274 if((status = cm_SetFilterHMM(cm, hmm, tmp_fp7_gfmu, tmp_fp7_gflambda)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Unable to set filter HMM for CM");
2275 if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2276 /* now position the CM file to the end of the HMM we just read */
2277 if (fseeko(cmfp->f, ftello(cmfp->hfp->f), SEEK_SET) != 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "Failed to set position for CM file parser after reading filter HMMs");
2278 }
2279 }
2280
2281 if (*ret_abc == NULL) *ret_abc = abc; /* pass our new alphabet back to caller, if caller didn't know it already */
2282 if ( opt_cm != NULL) *opt_cm = cm; else FreeCM(cm);
2283 return eslOK;
2284
2285 ERROR:
2286 if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc); /* the test is for an alphabet created here, not passed */
2287 if (cm != NULL) FreeCM(cm);
2288 if (opt_cm != NULL) *opt_cm = NULL;
2289 return status;
2290 }
2291
2292 /* read_asc_1p0_cm(): inputting version 1.0-->1.0.2 CM file format.
2293 *
2294 * This function was stolen from Infernal v1.0.2's
2295 * cm_io.c:read_ascii_cm() (which is identical to the function of the
2296 * same time in v1.0 and v1.0.1.), and minmally changed to work in the
2297 * context of new CM_FILE data structure. This version also includes
2298 * better error message handling.
2299 *
2300 * Note that the value of <read_fp7> is irrelevant. It is only
2301 * in the prototype so that read_asc_1p0_cm is consistent with
2302 * other parser prototypes.
2303 */
2304 static int
read_asc_1p0_cm(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)2305 read_asc_1p0_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm)
2306 {
2307 int status;
2308 CM_t *cm;
2309 char *buf;
2310 int n; /* length of buf */
2311 char *s;
2312 int M,N; /* number of states, nodes in model */
2313 int v,x,y,nd; /* counters for states, events, nodes */
2314 char *tok;
2315 int toklen;
2316 int exp_flags[EXP_NMODES]; /* keep track of which exp tails we've read */
2317 int exp_mode; /* index of exp tail info */
2318 int have_exps; /* for checking we get 0 or all exp tails*/
2319 int have_ga = FALSE; /* we have GA cutoff, needed b/c we can't set cm->flags until after CreateCMBody() call */
2320 int have_tc = FALSE; /* we have TC cutoff, needed b/c we can't set cm->flags until after CreateCMBody() call */
2321 int have_nc = FALSE; /* we have NC cutoff, needed b/c we can't set cm->flags until after CreateCMBody() call */
2322 int p; /* counter for partitions */
2323 int i; /* counter over exp_modes for exp tails */
2324 int alphabet_type; /* type of ESL_ALPHABET */
2325 ESL_ALPHABET *abc = NULL;
2326 int read_nstates = FALSE; /* TRUE once we've read the number of states */
2327 int read_nnodes = FALSE; /* TRUE once we've read the number of nodes */
2328 int read_atype = FALSE; /* TRUE once we've read the alphabet type */
2329 int read_clen = FALSE;
2330 int evalues_are_invalid = FALSE; /* TRUE if the PART line reports more than 1 partition */
2331 int clen = 0;
2332 int npartitions = 0;
2333 off_t offset = 0;
2334
2335 cm = NULL;
2336 buf = NULL;
2337 n = 0;
2338 for(i = 0; i < EXP_NMODES; i++) exp_flags[i] = FALSE;
2339
2340 cmfp->errbuf[0] = '\0';
2341
2342 if (cmfp->newly_opened)
2343 {
2344 offset = 0;
2345 cmfp->newly_opened = FALSE;
2346 }
2347 else
2348 {
2349 /* Record where this CM starts on disk */
2350 if ((! cmfp->do_stdin) && (! cmfp->do_gzip) && (offset = ftello(cmfp->f)) < 0) ESL_XEXCEPTION(eslESYS, "ftello() failed");
2351
2352 /* First line of file: "INFERNAL-1" */
2353 if (feof(cmfp->f) || esl_fgets(&buf, &n, cmfp->f) != eslOK) { /* end of file, free buf and return eslEOF */
2354 if(buf != NULL) free(buf);
2355 return eslEOF;
2356 }
2357 if (cmfp->format == CM_FILE_1) { if (strncmp(buf, "INFERNAL-1", 10) != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Didn't find INFERNAL-1 tag: bad format or not an INFERNAL save file?"); }
2358 else ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No such CM file format code: this shouldn't happen");
2359 }
2360
2361
2362 /* Parse the header information
2363 * These are all tag/value.
2364 * Ignore unknown tags (forward compatibility).
2365 */
2366 cm = CreateCMShell();
2367 M = N = -1;
2368 while (esl_fgets(&buf, &n, cmfp->f) != eslEOF)
2369 {
2370 s = buf;
2371 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end of data, prior to MODEL section");
2372 else if (strcmp(tok, "NAME") == 0)
2373 {
2374 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No name found on NAME line");
2375 if ((status = esl_strdup(tok, toklen, &(cm->name))) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting name for CM");
2376 }
2377 else if (strcmp(tok, "ACC") == 0)
2378 {
2379 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No accession found on ACC line");
2380 if ((status = esl_strdup(tok, toklen, &(cm->acc))) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting accession for CM");
2381 }
2382 else if (strcmp(tok, "DESC") == 0)
2383 {
2384 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No description found on DESC line");
2385 if ((status = esl_strdup(tok, toklen, &(cm->desc))) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting description for CM");
2386 }
2387 else if (strcmp(tok, "GA") == 0)
2388 {
2389 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No GA threshold found on GA line");
2390 cm->ga = atof(tok);
2391 have_ga = TRUE;
2392 }
2393 else if (strcmp(tok, "TC") == 0)
2394 {
2395 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No TC threshold found on TC line");
2396 cm->tc = atof(tok);
2397 have_tc = TRUE;
2398 }
2399 else if (strcmp(tok, "NC") == 0)
2400 {
2401 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No NC threshold found on NC line");
2402 cm->nc = atof(tok);
2403 have_nc = TRUE;
2404 }
2405 else if (strcmp(tok, "STATES") == 0)
2406 {
2407 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No number of STATES found on STATES line");
2408 M = atoi(tok);
2409 read_nstates = TRUE;
2410 }
2411 else if (strcmp(tok, "NODES") == 0)
2412 {
2413 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No number of NODES found on NODES line");
2414 N = atoi(tok);
2415 read_nnodes = TRUE;
2416 }
2417 else if (strcmp(tok, "ALPHABET") == 0)
2418 {
2419 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No alphabet type found on ALPHABET line");
2420 alphabet_type = atoi(tok);
2421 /* Set or verify alphabet. */
2422 if (*ret_abc == NULL) { /* still unknown: set it, pass control of it back to caller */
2423 if ((abc = esl_alphabet_Create(alphabet_type)) == NULL) ESL_XFAIL(eslEMEM, cmfp->errbuf, "Failed to create alphabet");
2424 } else { /* already known: check it */
2425 abc = *ret_abc;
2426 if ((*ret_abc)->type != alphabet_type) ESL_XFAIL(eslEINCOMPAT,cmfp->errbuf,"Alphabet type mismatch");
2427 }
2428 read_atype = TRUE;
2429 }
2430 else if (strcmp(tok, "ELSELF") == 0)
2431 {
2432 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No ELSELF score found on ELSELF line");
2433 cm->el_selfsc = atof(tok);
2434 }
2435 else if (strcmp(tok, "WBETA") == 0)
2436 {
2437 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No WBETA found on WBETA line");
2438 cm->beta_W = (double) atof(tok);
2439 }
2440 else if (strcmp(tok, "NSEQ") == 0)
2441 {
2442 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No name found on NAME line");
2443 cm->nseq = atoi(tok);
2444 }
2445 else if (strcmp(tok, "EFFNSEQ") == 0)
2446 {
2447 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No effective seq number found on EFFNSEQ line");
2448 cm->eff_nseq = atof(tok);
2449 }
2450 else if (strcmp(tok, "CLEN") == 0)
2451 {
2452 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No consensus length found on CLEN line");
2453 clen = atoi(tok); /* we'll compare this to what we calculate at end of func */
2454 read_clen = TRUE;
2455 /* Now we have the clen and we should have the alphabet type and N and M, so we can build the
2456 * full model, and set the alphabet (which we need to do before alloc'ing/setting
2457 * the null model */
2458 if(! (read_nstates && read_nnodes && read_atype))
2459 {
2460 ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "ERROR, ALPHABET, STATES and NODES lines should precede CLEN line");
2461 }
2462 CreateCMBody(cm, N, M, clen, abc);
2463 }
2464 /* comlog and ctime info. Careful, we want the full line, so a token becomes a full line
2465 * Also we stored this data differently in version 1.0, we have to throw out the CDATE
2466 * info, we don't store that anymore.
2467 */
2468 else if (strcmp(tok, "BCOM") == 0)
2469 {
2470 while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2471 if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No build command found on BCOM line");
2472 if(cm->comlog == NULL) {
2473 if((status = esl_strdup(tok, toklen, &(cm->comlog))) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting build date");
2474 }
2475 else {
2476 if((status = esl_strcat(&(cm->comlog), -1,"\n", 1)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting build date");
2477 if((status = esl_strcat(&(cm->comlog), -1, tok, toklen)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting build date");
2478 }
2479 }
2480 else if (strcmp(tok, "BDATE") == 0)
2481 {
2482 while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2483 if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No date found on BDATE line");
2484 if (esl_strdup(tok, toklen, &(cm->ctime)) != eslOK) ESL_XFAIL(eslEMEM, cmfp->errbuf, "strdup() failed to set date");
2485 }
2486 else if (strcmp(tok, "CCOM") == 0)
2487 {
2488 while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2489 if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No calibrate command found on CCOM line");
2490 if(cm->comlog == NULL) {
2491 if((status = esl_strdup(tok, toklen, &(cm->comlog))) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting calibrate date");
2492 }
2493 else {
2494 if((status = esl_strcat(&(cm->comlog), -1,"\n", 1)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting calibrate date");
2495 if((status = esl_strcat(&(cm->comlog), -1, tok, toklen)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting calibrate date");
2496 }
2497 }
2498 else if (strcmp(tok, "CDATE") == 0)
2499 {
2500 while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2501 if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No date found on CDATE line");
2502 /* we don't store this anymore */
2503 }
2504 else if (strcmp(tok, "NULL") == 0)
2505 {
2506 if(cm->abc == NULL) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "NULL line must be preceded by ALPHABET line");
2507 /* cm-> null already allocated in CreateCMBody() */
2508 for (x = 0; x < abc->K; x++)
2509 {
2510 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Problem reading NULL line");
2511 cm->null[x] = ascii2prob(tok, (1./(float) abc->K));
2512 }
2513 }
2514 /* exp tail distribution information */
2515 else if (strcmp(tok, "PART") == 0)
2516 {
2517 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No number of partitions on PART line");
2518 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "number of partitions is not an integer on PART line");
2519 npartitions = atoi(tok);
2520 if(npartitions != 1) {
2521 evalues_are_invalid = TRUE;
2522 /* we can't deal with more than 1 partition in the current codebase, if there are more, throw away any E-value parameters we read */
2523 }
2524 /* else 1 partition, we can handle this (nearly all infernal
2525 * 1.0--1.0.2 files should have 1 partitions, you could
2526 * only create a multi-partition file with the undocument
2527 * --exp-pfile option to cmcalibrate). */
2528
2529 /* Ignore the rest of this line, it includes partition start/stop info that's no longer parsed */
2530 if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid PART- line");
2531 }
2532 /* exp tail info */
2533 else if ((strncmp(tok, "E-", 2) == 0) && (! evalues_are_invalid)) /* skip the E- lines if we've read more than one partition above */
2534 {
2535 /* determine which exp tail we're reading */
2536 if (strncmp(tok+2, "LC", 2) == 0) exp_mode = EXP_CM_LC;
2537 else if (strncmp(tok+2, "GC", 2) == 0) exp_mode = EXP_CM_GC;
2538 else if (strncmp(tok+2, "LI", 2) == 0) exp_mode = EXP_CM_LI;
2539 else if (strncmp(tok+2, "GI", 2) == 0) exp_mode = EXP_CM_GI;
2540 else if (strncmp(tok+2, "LV", 2) == 0) continue; /* cp9 HMM local viterbi, irrelevant in current format */
2541 else if (strncmp(tok+2, "GV", 2) == 0) continue; /* cp9 HMM glocal viterbi, irrelevant in current format */
2542 else if (strncmp(tok+2, "LF", 2) == 0) continue; /* cp9 HMM local forward, irrelevant in current format */
2543 else if (strncmp(tok+2, "GF", 2) == 0) continue; /* cp9 HMM glocal forward, irrelevant in current format */
2544 else ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid xx on E-xx line");
2545
2546 /* create the expA array if this is the first E- line we've read */
2547 if (cm->expA == NULL) {
2548 ESL_ALLOC(cm->expA, sizeof(ExpInfo_t *) * EXP_NMODES);
2549 for(x = 0; x < EXP_NMODES; x++) { cm->expA[x] = CreateExpInfo(); }
2550 }
2551
2552 /* now we know what exp tail we're reading, read it */
2553 /* chew up partition */
2554 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No partition read on E-xx line");
2555 p = atoi(tok);
2556 if (p != 0) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid partition on E-xx line");
2557
2558 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No lambda read on E-xx line");
2559 if (! is_real(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "lambda not real number on E-xx line");
2560 cm->expA[exp_mode]->lambda = atof(tok);
2561
2562 if ((esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No mu_extrap read on E-xx line");
2563 if (! is_real(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "mu_extrap not real number on E-xx line");
2564 cm->expA[exp_mode]->mu_extrap = atof(tok);
2565
2566 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No mu_orig read on E-xx line");
2567 if (! is_real(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "mu_orign is not real number on E-xx line");
2568 cm->expA[exp_mode]->mu_orig = atof(tok);
2569
2570 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No dbsize read on E-xx line");
2571 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "dbsize is not integer on E-xx line");
2572 cm->expA[exp_mode]->dbsize = atof(tok); /* read it as a double, even though it's written as a long */
2573
2574 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No nrandhits read on E-xx line");
2575 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "nrandhits is not integer on E-xx line");
2576 cm->expA[exp_mode]->nrandhits = atoi(tok);
2577
2578 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No tailp read on E-xx line");
2579 if (! is_real(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "tailp is not real number on E-xx line");
2580 cm->expA[exp_mode]->tailp = atof(tok);
2581
2582 cm->expA[exp_mode]->cur_eff_dbsize = (double) (cm->expA[exp_mode]->nrandhits);
2583 /* Previous line is to set cur_eff_dbsize as if database was of size cm->stats->expAA[p]->dbsize, we
2584 * act as if the max hits we'll see is nrandhits, the number of hits we saw in cmcalibrate,
2585 * so this is the highest possible E-value we can get.
2586 * cur_eff_dbsize will be updated in cmsearch for whatever the target database size is. */
2587 cm->expA[exp_mode]->is_valid = TRUE; /* set valid flag */
2588 exp_flags[exp_mode] = TRUE;
2589 }
2590 else if (strncmp(tok, "FT-", 3) == 0)
2591 {
2592 /* filter thresholds statistics are deprecated, chew up the rest ot this line, and following two lines */
2593 if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid FT- line");
2594 if (esl_fgets(&buf, &n, cmfp->f) == eslEOF) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end to file, while reading FT- line 2");
2595 if (esl_fgets(&buf, &n, cmfp->f) == eslEOF) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end to file, while reading FT- line 3");
2596 }
2597 else if (strcmp(tok, "MODEL:") == 0)
2598 break;
2599 }
2600
2601 /* Done reading the header information.
2602 * Check that everything is ok and mandatory info is present before moving on.
2603 */
2604 if (feof(cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end to CM file, file truncated?");
2605 if (! read_nstates) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes STATES line");
2606 if (! read_nnodes) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes NODES line");
2607 if (! read_clen) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes CLEN line");
2608 if (! read_atype) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes ALPHABET line");
2609 if (cm->name == NULL) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes NAME line");
2610
2611 /* if we have any exp tail stats, we (currently) require all of them */
2612 have_exps = exp_flags[0];
2613 for(exp_mode = 1; exp_mode < EXP_NMODES; exp_mode++)
2614 if(((have_exps && (!exp_flags[exp_mode]))) ||
2615 ((!have_exps) && (exp_flags[exp_mode])))
2616 ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Some but not all E-xx lines read, expected 8");
2617
2618 /* Main model section.
2619 */
2620 CMZero(cm);
2621 if(have_exps) cm->flags |= CMH_EXPTAIL_STATS;
2622 if(have_ga) cm->flags |= CMH_GA;
2623 if(have_tc) cm->flags |= CMH_TC;
2624 if(have_nc) cm->flags |= CMH_NC;
2625 nd = -1;
2626 clen = 0;
2627 for (v = 0; v < cm->M; v++)
2628 {
2629 if ((status = esl_fgets(&buf, &n, cmfp->f)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of data before main model section");
2630 s = buf;
2631 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Premature end of data before main model section");
2632
2633 /* Ah, a node line. Process it and get the following line.
2634 */
2635 if (*tok == '[')
2636 {
2637 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on node line");
2638 if ((x = NodeCode(tok)) == -1) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid node type %s", tok);
2639 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on node line");
2640 if (!is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid node index on node line");
2641 nd = atoi(tok);
2642 cm->ndtype[nd] = x;
2643 if(cm->ndtype[nd] == MATP_nd) clen+=2;
2644 else if(cm->ndtype[nd] == MATL_nd) clen++;
2645 else if(cm->ndtype[nd] == MATR_nd) clen++;
2646 cm->nodemap[nd] = v;
2647
2648 if ((status = esl_fgets(&buf, &n, cmfp->f)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on NODE line");
2649 s = buf;
2650 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on NODE line");
2651 }
2652
2653 /* Process state line.
2654 */
2655 cm->sttype[v] = StateCode(tok);
2656 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2657 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2658 if (atoi(tok) != v) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2659 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2660 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2661 cm->plast[v] = atoi(tok);
2662 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2663 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2664 cm->pnum[v] = atoi(tok);
2665 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2666 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2667 cm->cfirst[v] = atoi(tok);
2668 if ((status= esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2669 if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2670 cm->cnum[v] = atoi(tok);
2671 /* Transition probabilities. */
2672 if (cm->sttype[v] != B_st)
2673 {
2674 for (x = 0; x < cm->cnum[v]; x++)
2675 {
2676 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2677 if (! is_real(tok) && *tok != '*') ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2678 cm->t[v][x] = ascii2prob(tok, 1.);
2679 }
2680 }
2681 /* Emission probabilities. */
2682 if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st ||
2683 cm->sttype[v] == IL_st || cm->sttype[v] == IR_st)
2684 {
2685 for (x = 0; x < cm->abc->K; x++)
2686 {
2687 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2688 if (! is_real(tok) && *tok != '*') ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2689 cm->e[v][x] = ascii2prob(tok, cm->null[x]);
2690 }
2691 }
2692 else if (cm->sttype[v] == MP_st)
2693 {
2694 for (x = 0; x < cm->abc->K; x++)
2695 for (y = 0; y < cm->abc->K; y++)
2696 {
2697 if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2698 if (! is_real(tok) && *tok != '*') ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2699 cm->e[v][x*cm->abc->K+y] = ascii2prob(tok, cm->null[x]*cm->null[y]);
2700 }
2701 }
2702
2703 cm->ndidx[v] = nd;
2704 cm->stid[v] = DeriveUniqueStateCode(cm->ndtype[nd], cm->sttype[v]);
2705 } /* end of loop over states */
2706
2707 /* Advance to record separator
2708 */
2709 while (esl_fgets(&buf, &n, cmfp->f) != eslEOF)
2710 if (strncmp(buf, "//", 2) == 0)
2711 break;
2712
2713 /* EPN 10.29.06 Remove the sole source of CM ambiguities. Find and detach insert states
2714 * that are 1 state before an END_E. */
2715 cm_find_and_detach_dual_inserts(cm,
2716 FALSE, /* Don't check END_E-1 states have 0 counts, they may not if
2717 * an old version (0.7 or earlier) of cmbuild was used, or
2718 * cmbuild --nodetach was used to build the CM */
2719 TRUE); /* Detach the states by setting trans probs into them as 0.0 */
2720
2721 /* check that the clen we calc'ed is the same as the CLEN line said */
2722 if (read_clen && clen != cm->clen)
2723 {
2724 ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "calculated consensus length %d does not equal read CLEN: %d.\n", clen, cm->clen);
2725 }
2726
2727 /* Success.
2728 * Renormalize the CM, and return.
2729 */
2730 CMRenormalize(cm);
2731
2732 if (buf != NULL) free(buf);
2733 if (*ret_abc == NULL) *ret_abc = abc;
2734 if ( opt_cm != NULL) *opt_cm = cm; else FreeCM(cm);
2735 return eslOK;
2736
2737 ERROR:
2738 if (buf != NULL) free(buf);
2739 if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc);
2740 if (cm != NULL) FreeCM(cm);
2741 if (opt_cm != NULL) *opt_cm = NULL;
2742 if (status == eslEMEM || status == eslESYS) return status;
2743 else if (status == eslEOF) return status;
2744 else if (status == eslEINCOMPAT) return status;
2745 else return eslEFORMAT; /* anything else is a format error: includes premature EOF, EOL, EOD */
2746 }
2747
2748
2749 /* Below, read_asc30hmm(), etc. are EXACT copies from H3 p7_hmmfile.c
2750 * This isn't the right way to do it, but it's what we did. Do NOT
2751 * edit these here. If you edit them in H3, you MUST bring those edits
2752 * here. [SRE:24-Jan-17]
2753 */
2754
2755 /* Parsing save files from HMMER 3.x
2756 * All parsers follow the same API.
2757 *
2758 * Returns <eslOK> on success, and if <opt_hmm> is non-NULL,
2759 * <*opt_hmm> points at a newly allocated HMM.
2760 *
2761 * Additionally, if <*ret_abc> was NULL, then a new alphabet is
2762 * allocated according to the alphabet type of this HMM, and returned
2763 * thru <ret_abc>. This allocation mechanism allows a main()
2764 * application that doesn't yet know its alphabet to determine the
2765 * alphabet when the first HMM is read, while also allowing an
2766 * application to allocate its own alphabet and assure that the
2767 * input HMMs are appropriate for that alphabet.
2768 *
2769 * Returns <eslEOF> when no HMM remains in the file, indicating a
2770 * normal end-of-file.
2771 *
2772 * Two types of "normal error" may happen, which the caller must check
2773 * for. Returns <eslEFORMAT> on any save file format error, including
2774 * bad magic (i.e. this is not a HMMER file at all). Returns
2775 * <eslEINCOMPAT> if the expected alphabet (a non-<NULL> alphabet
2776 * specified by <*ret_abc>) does not match the alphabet type of the
2777 * HMM.
2778 *
2779 * When these normal errors occur, the caller can construct its error
2780 * message from:
2781 * <hfp->errbuf>: contains an informative error message
2782 * <hfp->fname>: name of the HMM file (or '-' if STDIN)
2783 * and if <hfp->efp> is non-<NULL>, the HMM file is in ASCII text,
2784 * and the caller may also use:
2785 * <hfp->efp->linenumber>: line on which the parse error occurred.
2786 *
2787 * Throws: <eslEMEM> on allocation error.
2788 * <eslESYS> if a system i/o call fails.
2789 * In cases of error (including both thrown error and normal error), <*ret_abc>
2790 * is left in its original state as passed by the caller, and <*ret_hmm> is
2791 * returned <NULL>.
2792 */
2793 static int
read_asc30hmm(P7_HMMFILE * hfp,ESL_ALPHABET ** ret_abc,P7_HMM ** opt_hmm)2794 read_asc30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm)
2795 {
2796 ESL_ALPHABET *abc = NULL;
2797 P7_HMM *hmm = NULL;
2798 char *tag = NULL;
2799 char *tok1 = NULL;
2800 char *tok2 = NULL;
2801 char *tok3 = NULL;
2802 char *tok4 = NULL;
2803 int alphatype;
2804 int k,x;
2805 off_t offset = 0;
2806 int status;
2807 uint32_t statstracker = 0;
2808
2809 hfp->errbuf[0] = '\0';
2810
2811 if (hfp->newly_opened)
2812 {
2813 offset = 0;
2814 hfp->newly_opened = FALSE;
2815 }
2816 else
2817 {
2818 /* Record where this HMM starts on disk */
2819 if ((! hfp->do_stdin) && (! hfp->do_gzip) && (offset = ftello(hfp->f)) < 0) ESL_XEXCEPTION(eslESYS, "ftello() failed");
2820
2821 /* First line of file: "HMMER3/f". Allocate shell for HMM annotation information (we don't know K,M yet) */
2822 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) goto ERROR; /* EOF here is normal; could also be a thrown EMEM */
2823 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tag, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "unexpected absence of tokens on data line");
2824
2825 if (hfp->format == p7_HMMFILE_3f) { if (strcmp(tag, "HMMER3/f") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/f tag: bad format or not a HMMER save file?"); }
2826 else if (hfp->format == p7_HMMFILE_3e) { if (strcmp(tag, "HMMER3/e") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/e tag: bad format or not a HMMER save file?"); }
2827 else if (hfp->format == p7_HMMFILE_3d) { if (strcmp(tag, "HMMER3/d") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/d tag: bad format or not a HMMER save file?"); }
2828 else if (hfp->format == p7_HMMFILE_3c) { if (strcmp(tag, "HMMER3/c") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/c tag: bad format or not a HMMER save file?"); }
2829 else if (hfp->format == p7_HMMFILE_3b) { if (strcmp(tag, "HMMER3/b") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/b tag: bad format or not a HMMER save file?"); }
2830 else if (hfp->format == p7_HMMFILE_3a) { if (strcmp(tag, "HMMER3/a") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/a tag: bad format or not a HMMER save file?"); }
2831 else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No such HMM file format code: this shouldn't happen");
2832 }
2833
2834 if ((hmm = p7_hmm_CreateShell()) == NULL) ESL_XFAIL(eslEMEM, hfp->errbuf, "allocation failure, HMM shell");
2835 hmm->offset = offset;
2836
2837 /* Header section */
2838 while ((status = esl_fileparser_NextLine(hfp->efp)) == eslOK)
2839 {
2840 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tag, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of line");
2841
2842 if (strcmp(tag, "NAME") == 0) {
2843 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No name found on NAME line");
2844 p7_hmm_SetName(hmm, tok1);
2845 }
2846
2847 else if (strcmp(tag, "ACC") == 0) {
2848 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No accession found on ACC line");
2849 p7_hmm_SetAccession(hmm, tok1);
2850 }
2851
2852 else if (strcmp(tag, "DESC") == 0) {
2853 if ((status = esl_fileparser_GetRemainingLine(hfp->efp, &tok1)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No description found on DESC line");
2854 p7_hmm_SetDescription(hmm, tok1);
2855 }
2856
2857 else if (strcmp(tag, "LENG") == 0) {
2858 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No model length found on LENG line");
2859 if ((hmm->M = atoi(tok1)) == 0) ESL_XFAIL(status, hfp->errbuf, "Invalid model length %s on LENG line", tok1);
2860 }
2861
2862 else if (hfp->format >= p7_HMMFILE_3c && strcmp(tag, "MAXL") == 0) {
2863 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No max length found on MAXL line");
2864 if ((hmm->max_length = atoi(tok1)) == 0) ESL_XFAIL(status, hfp->errbuf, "Invalid max length %s on MAXL line", tok1);
2865 }
2866
2867 else if (strcmp(tag, "ALPH") == 0) {
2868 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No alphabet type found on ALPH");
2869 if ((alphatype = esl_abc_EncodeType(tok1)) == eslUNKNOWN) ESL_XFAIL(status, hfp->errbuf, "Unrecognized alphabet type %s", tok1);
2870 if (*ret_abc == NULL) {
2871 if ((abc = esl_alphabet_Create(alphatype)) == NULL) ESL_XFAIL(eslEMEM, hfp->errbuf, "Failed to create alphabet");
2872 } else {
2873 if ((*ret_abc)->type != alphatype) ESL_XFAIL(eslEINCOMPAT,hfp->errbuf,"Alphabet type mismatch: was %s, but current HMM says %s", esl_abc_DecodeType( (*ret_abc)->type), tok1);
2874 abc = *ret_abc;
2875 }
2876 }
2877
2878 else if (strcmp(tag, "RF") == 0) {
2879 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No yes/no found for RF line");
2880 if (strcasecmp(tok1, "yes") == 0)
2881 hmm->flags |= p7H_RF;
2882 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "RF header line must say yes/no, not %s", tok1);
2883 }
2884
2885 else if (strcmp(tag, "MM") == 0) {
2886 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No yes/no found for MM line");
2887 if (strcasecmp(tok1, "yes") == 0)
2888 hmm->flags |= p7H_MMASK;
2889 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "MM header line must say yes/no, not %s", tok1);
2890 }
2891
2892
2893 else if (strcmp(tag, "CONS") == 0) {
2894 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No yes/no found for CONS line");
2895 if (strcasecmp(tok1, "yes") == 0)
2896 hmm->flags |= p7H_CONS;
2897 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "CONS header line must say yes/no, not %s", tok1);
2898 }
2899
2900 else if (strcmp(tag, "CS") == 0) {
2901 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No yes/no found for CS line");
2902 if (strcasecmp(tok1, "yes") == 0)
2903 hmm->flags |= p7H_CS;
2904 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "CS header line must say yes/no, not %s", tok1);
2905 }
2906
2907 else if (strcmp(tag, "MAP") == 0) {
2908 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No yes/no found for MAP line");
2909 if (strcasecmp(tok1, "yes") == 0)
2910 hmm->flags |= p7H_MAP;
2911 else if (strcasecmp(tok1, "no") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "MAP header line must say yes/no, not %s", tok1);
2912 }
2913
2914 else if (strcmp(tag, "DATE") == 0) {
2915 if ((status = esl_fileparser_GetRemainingLine(hfp->efp, &tok1)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No date found on DATE line");
2916 if (esl_strdup(tok1, -1, &(hmm->ctime)) != eslOK) ESL_XFAIL(eslEMEM, hfp->errbuf, "strdup() failed to set date");
2917 }
2918
2919 else if (strcmp(tag, "COM") == 0) {
2920 /* just skip the first token; it's something like [1], numbering the command lines */
2921 if ((status = esl_fileparser_GetTokenOnLine (hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No command number on COM line");
2922 if ((status = esl_fileparser_GetRemainingLine(hfp->efp, &tok1)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "No command on COM line");
2923 if (hmm->comlog == NULL) {
2924 if (esl_strdup(tok1, -1, &(hmm->comlog)) != eslOK) ESL_XFAIL(eslEMEM, hfp->errbuf, "esl_strdup() failed");
2925 } else {
2926 if (esl_strcat(&(hmm->comlog), -1, "\n", -1) != eslOK) ESL_XFAIL(eslEMEM, hfp->errbuf, "esl_strcat() failed");
2927 if (esl_strcat(&(hmm->comlog), -1, tok1, -1) != eslOK) ESL_XFAIL(eslEMEM, hfp->errbuf, "esl_strcat() failed");
2928 }
2929 }
2930
2931 else if (strcmp(tag, "NSEQ") == 0) {
2932 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Nothing follows NSEQ tag");
2933 if ((hmm->nseq = atoi(tok1)) == 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Invalid nseq on NSEQ line: should be integer, not %s", tok1);
2934 }
2935
2936 else if (strcmp(tag, "EFFN") == 0) {
2937 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Nothing follows EFFN tag");
2938 if ((hmm->eff_nseq = atof(tok1)) <= 0.0f) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Invalid eff_nseq on EFFN line: should be a real number, not %s", tok1);
2939 }
2940
2941 else if (strcmp(tag, "CKSUM") == 0) {
2942 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Nothing follows CKSUM tag");
2943 hmm->checksum = atoll(tok1); /* if atoi(), then you may truncate uint32_t checksums > 2^31-1 */
2944 hmm->flags |= p7H_CHKSUM;
2945 }
2946
2947 else if (strcmp(tag, "STATS") == 0) {
2948 if (hfp->format >= p7_HMMFILE_3b)
2949 {
2950 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on STATS line"); /* LOCAL */
2951 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on STATS line"); /* MSV | VITERBI | FORWARD */
2952 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok3, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on STATS line"); /* mu | tau */
2953 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok4, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on STATS line"); /* lambda */
2954 if (strcasecmp(tok1, "LOCAL") == 0)
2955 {
2956 if (strcasecmp(tok2, "MSV") == 0) { hmm->evparam[p7_MMU] = atof(tok3); hmm->evparam[p7_MLAMBDA] = atof(tok4); statstracker |= 0x1; }
2957 else if (strcasecmp(tok2, "VITERBI") == 0) { hmm->evparam[p7_VMU] = atof(tok3); hmm->evparam[p7_VLAMBDA] = atof(tok4); statstracker |= 0x2; }
2958 else if (strcasecmp(tok2, "FORWARD") == 0) { hmm->evparam[p7_FTAU] = atof(tok3); hmm->evparam[p7_FLAMBDA] = atof(tok4); statstracker |= 0x4; }
2959 else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 3", tok2);
2960 } else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 2", tok1);
2961 }
2962 else if (hfp->format == p7_HMMFILE_3a) /* reverse compatibility with 30a */
2963 {
2964 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on STATS line"); /* LOCAL */
2965 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on STATS line"); /* VLAMBDA | VMU | FTAU */
2966 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok3, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on STATS line"); /* value */
2967 if (strcasecmp(tok1, "LOCAL") == 0)
2968 {
2969 if (strcasecmp(tok2, "VLAMBDA") == 0) { hmm->evparam[p7_MLAMBDA] = hmm->evparam[p7_VLAMBDA] = hmm->evparam[p7_FLAMBDA] = atof(tok3); statstracker |= 0x1; }
2970 else if (strcasecmp(tok2, "VMU") == 0) { hmm->evparam[p7_MMU] = hmm->evparam[p7_VMU] = atof(tok3); statstracker |= 0x2; }
2971 else if (strcasecmp(tok2, "FTAU") == 0) { hmm->evparam[p7_FTAU] = atof(tok3); statstracker |= 0x4; }
2972 else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 3", tok2);
2973 } else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 2", tok1);
2974 }
2975 }
2976
2977 else if (strcmp(tag, "GA") == 0) {
2978 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on GA line");
2979 hmm->cutoff[p7_GA1] = atof(tok1);
2980 if ( (abc->type == eslDNA || abc->type == eslRNA) ) { //if DNA, there's no need for a 2nd value (domain GA)
2981 hmm->cutoff[p7_GA2] = hmm->cutoff[p7_GA1];
2982 } else {
2983 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on GA line");
2984 hmm->cutoff[p7_GA2] = atof(tok2);
2985 }
2986 hmm->flags |= p7H_GA;
2987 }
2988
2989 else if (strcmp(tag, "TC") == 0) {
2990 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on TC line");
2991 hmm->cutoff[p7_TC1] = atof(tok1);
2992 if ( (abc->type == eslDNA || abc->type == eslRNA) ) { //if DNA, there's no need for a 2nd value (domain GA)
2993 hmm->cutoff[p7_TC2] = hmm->cutoff[p7_TC1];
2994 } else {
2995 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on TC line");
2996 hmm->cutoff[p7_TC2] = atof(tok2);
2997 }
2998 hmm->flags |= p7H_TC;
2999 }
3000
3001 else if (strcmp(tag, "NC") == 0) {
3002 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on NC line");
3003 hmm->cutoff[p7_NC1] = atof(tok1);
3004 if ( (abc->type == eslDNA || abc->type == eslRNA) ) { //if DNA, there's no need for a 2nd value (domain GA)
3005 hmm->cutoff[p7_NC2] = hmm->cutoff[p7_NC1];
3006 } else {
3007 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on NC line");
3008 hmm->cutoff[p7_NC2] = atof(tok2);
3009 }
3010 hmm->flags |= p7H_NC;
3011 }
3012
3013 else if (strcmp(tag, "HMM") == 0)
3014 break;
3015 } /* end, loop over possible header tags */
3016
3017 if (status != eslOK) goto ERROR;
3018
3019 /* If we saw one STATS line, we need all 3. (True for both 3/a and 3/b formats) */
3020 if (statstracker == 0x7) hmm->flags |= p7H_STATS;
3021 else if (statstracker != 0x0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Missing one or more STATS parameter lines");
3022
3023
3024 /* Skip main model header lines; allocate body of HMM now that K,M are known */
3025 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data before main model section");
3026 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data before main model section");
3027 if ((status = p7_hmm_CreateBody(hmm, hmm->M, abc)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Failed to allocate body of the new HMM");
3028 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data before main model section");
3029
3030 /* Optional model composition (filter null model) may immediately follow headers */
3031 if (strcmp(tok1, "COMPO") == 0) {
3032 for (x = 0; x < abc->K; x++) {
3033 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on COMPO line");
3034 hmm->compo[x] = (*tok1 == '*' ? 0.0 : expf(-1.0 * atof(tok1)));
3035 }
3036 hmm->flags |= p7H_COMPO;
3037 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data after COMPO line");
3038 if ((esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data after COMPO line");
3039 }
3040
3041 /* First two lines are node 0: insert emissions, then transitions from node 0 (begin) */
3042
3043 hmm->ins[0][0] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3044 for (x = 1; x < abc->K; x++) {
3045 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on insert line, node 0: expected %d, got %d\n", abc->K, x);
3046 hmm->ins[0][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3047 }
3048 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data in main model: no node 0 transition line");
3049 for (x = 0; x < p7H_NTRANSITIONS; x++) {
3050 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few fields on begin (0) transition line");
3051 hmm->t[0][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3052 }
3053
3054 /* The main model section. */
3055 for (k = 1; k <= hmm->M; k++)
3056 {
3057
3058 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data in main model section, at node %d (expected %d)", k, hmm->M);
3059 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data in main model section, at node %d (expected %d)", k, hmm->M);
3060 if (atoi(tok1) != k) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Expected match line to start with %d (of %d); saw %s", k, hmm->M, tok1);
3061
3062 for (x = 0; x < abc->K; x++) {
3063 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few probability fields on match line, node %d: expected %d, got %d\n", k, abc->K, x);
3064 hmm->mat[k][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3065 }
3066
3067 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Missing MAP field on match line for node %d: should at least be -", k);
3068 if (hmm->flags & p7H_MAP) hmm->map[k] = atoi(tok1);
3069
3070 if (hfp->format >= p7_HMMFILE_3e) {
3071 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Missing CONS field on match line for node %d: should at least be -", k);
3072 if (hmm->flags & p7H_CONS) hmm->consensus[k] = *tok1;
3073 }
3074 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Missing RF field on match line for node %d: should at least be -", k);
3075 if (hmm->flags & p7H_RF) hmm->rf[k] = *tok1;
3076
3077 if (hfp->format >= p7_HMMFILE_3f) {
3078 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Missing MM field on match line for node %d: should at least be -", k);
3079 if (hmm->flags & p7H_MMASK) hmm->mm[k] = *tok1;
3080 }
3081
3082
3083 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Missing CS field on match line for node %d: should at least be -", k);
3084 if (hmm->flags & p7H_CS) hmm->cs[k] = *tok1;
3085
3086
3087 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data in main model: no insert emission line, node %d", k);
3088 for (x = 0; x < abc->K; x++) {
3089 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few probability fields on insert line, node %d: expected %d, got %d\n", k, abc->K, x);
3090 hmm->ins[k][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3091 }
3092 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data in main model: no transition line, node %d", k);
3093 for (x = 0; x < p7H_NTRANSITIONS; x++) {
3094 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Too few probability fields on transition line, node %d: expected %d, got %d\n", k, abc->K, x);
3095 hmm->t[k][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3096 }
3097 }
3098
3099 /* The closing // */
3100 if ((status = esl_fileparser_NextLine(hfp->efp)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data: missing //?");
3101 if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Premature end of data: missing //?");
3102 if (strcmp(tok1, "//") != 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Expected closing //; found %s instead", tok1);
3103
3104 /* legacy issues */
3105 if (hfp->format < p7_HMMFILE_3e && (status = p7_hmm_SetConsensus(hmm, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Failed to set consensus on legacy HMM format");
3106
3107 /* Finish up. */
3108 if (hmm->flags & p7H_RF) { hmm->rf[0] = ' '; hmm->rf[hmm->M+1] = '\0'; }
3109 if (hmm->flags & p7H_MMASK){ hmm->mm[0] = ' '; hmm->mm[hmm->M+1] = '\0'; }
3110 if (hmm->flags & p7H_CONS) { hmm->consensus[0] = ' '; hmm->consensus[hmm->M+1] = '\0'; }
3111 if (hmm->flags & p7H_CS) { hmm->cs[0] = ' '; hmm->cs[hmm->M+1] = '\0'; }
3112 if (hmm->flags & p7H_MAP) { hmm->map[0] = 0; }
3113 if (hmm->name == NULL) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No NAME found for HMM");
3114 if (hmm->M <= 0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No LENG found for HMM (or LENG <= 0)");
3115 if (abc == NULL) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No ALPH found for HMM");
3116
3117 if (*ret_abc == NULL) *ret_abc = abc;
3118 if ( opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm);
3119 return eslOK;
3120
3121 ERROR:
3122 if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc);
3123 if (hmm != NULL) p7_hmm_Destroy(hmm);
3124 if (opt_hmm != NULL) *opt_hmm = NULL;
3125 if (status == eslEMEM || status == eslESYS) return status;
3126 else if (status == eslEOF) return status;
3127 else if (status == eslEINCOMPAT) return status;
3128 else return eslEFORMAT; /* anything else is a format error: includes premature EOF, EOL, EOD */
3129 }
3130
3131
3132 static int
read_bin30hmm(P7_HMMFILE * hfp,ESL_ALPHABET ** ret_abc,P7_HMM ** opt_hmm)3133 read_bin30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm)
3134 {
3135 ESL_ALPHABET *abc = NULL;
3136 P7_HMM *hmm = NULL;
3137 uint32_t magic;
3138 int alphabet_type;
3139 int k;
3140 off_t offset = 0;
3141 int status;
3142
3143 hfp->errbuf[0] = '\0';
3144 if (feof(hfp->f)) { status = eslEOF; goto ERROR; }
3145
3146 if (hfp->newly_opened)
3147 {
3148 offset = 0;
3149 hfp->newly_opened = FALSE;
3150 }
3151 else
3152 { /* Check magic. */
3153 if ((!hfp->do_stdin) && (! hfp->do_gzip)) {
3154 if ((offset = ftello(hfp->f)) < 0) ESL_XEXCEPTION(eslESYS, "ftello() failed");
3155 }
3156 if (! fread((char *) &magic, sizeof(uint32_t), 1, hfp->f)) { status = eslEOF; goto ERROR; }
3157
3158 if (hfp->format == p7_HMMFILE_3f) { if (magic != v3f_magic) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM"); }
3159 else if (hfp->format == p7_HMMFILE_3e) { if (magic != v3e_magic) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM"); }
3160 else if (hfp->format == p7_HMMFILE_3d) { if (magic != v3d_magic) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM"); }
3161 else if (hfp->format == p7_HMMFILE_3c) { if (magic != v3c_magic) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM"); }
3162 else if (hfp->format == p7_HMMFILE_3b) { if (magic != v3b_magic) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM"); }
3163 else if (hfp->format == p7_HMMFILE_3a) { if (magic != v3a_magic) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM"); }
3164 else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "no such HMM file format code");
3165 }
3166
3167 /* Allocate shell of the new HMM.
3168 * Two-step allocation lets us read/set the flags first;
3169 * then the later CreateBody() call will allocate optional internal fields we need.
3170 */
3171 if ((hmm = p7_hmm_CreateShell()) == NULL) ESL_XFAIL(eslEMEM, hfp->errbuf, "allocation failed, HMM shell");
3172 hmm->offset = offset;
3173
3174 /* Get sizes of things */
3175 /* xref J5/114 for a legacy use of <flags> for optional acc, desc annotation */
3176 if (! fread((char *) &(hmm->flags), sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read flags");
3177 if (! fread((char *) &(hmm->M), sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read model size M");
3178 if (! fread((char *) &alphabet_type, sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read alphabet_type");
3179
3180 /* Set or verify alphabet. */
3181 if (*ret_abc == NULL) { /* still unknown: set it, pass control of it back to caller */
3182 if ((abc = esl_alphabet_Create(alphabet_type)) == NULL) ESL_XFAIL(eslEMEM, hfp->errbuf, "allocation failed, alphabet");
3183 } else { /* already known: check it */
3184 abc = *ret_abc;
3185 if (abc->type != alphabet_type) ESL_XFAIL(eslEINCOMPAT, hfp->errbuf, "Alphabet type mismatch: was %s, but current HMM says %s", esl_abc_DecodeType( abc->type), esl_abc_DecodeType(alphabet_type));
3186 }
3187
3188 /* Finish the allocation of the HMM
3189 */
3190 if ((status = p7_hmm_CreateBody(hmm, hmm->M, abc)) != eslOK) ESL_XFAIL(eslEMEM, hfp->errbuf, "allocation failed, HMM body");
3191
3192 /* Core model probabilities. */
3193 for (k = 1; k <= hmm->M; k++)
3194 if (! fread((char *) hmm->mat[k], sizeof(float), hmm->abc->K, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read mat[%d]", k);
3195 for (k = 0; k <= hmm->M; k++)
3196 if (! fread((char *) hmm->ins[k], sizeof(float), hmm->abc->K, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read ins[%d]", k);
3197 for (k = 0; k <= hmm->M; k++)
3198 if (! fread((char *) hmm->t[k], sizeof(float), p7H_NTRANSITIONS, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read t[%d]", k);
3199
3200 /* Annotations. */
3201 if (read_bin_string(hfp->f, &(hmm->name)) != eslOK) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read name");
3202 if ((hmm->flags & p7H_ACC) && read_bin_string(hfp->f, &(hmm->acc)) != eslOK) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read acc");
3203 if ((hmm->flags & p7H_DESC) && read_bin_string(hfp->f, &(hmm->desc)) != eslOK) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read desc");
3204 if ((hmm->flags & p7H_RF) && ! fread((char *) hmm->rf, sizeof(char), hmm->M+2, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read rf"); /* +2: 1..M and trailing \0 */
3205 if ((hmm->flags & p7H_MMASK)&& ! fread((char *) hmm->mm, sizeof(char), hmm->M+2, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read mm"); /* +2: 1..M and trailing \0 */
3206 if ((hmm->flags & p7H_CONS) && ! fread((char *) hmm->consensus, sizeof(char), hmm->M+2, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read consensus"); /* don't need to test for >=3e format, because the flag is sufficient (didn't exist pre-3e) */
3207 if ((hmm->flags & p7H_CS) && ! fread((char *) hmm->cs, sizeof(char), hmm->M+2, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read cs");
3208 if ((hmm->flags & p7H_CA) && ! fread((char *) hmm->ca, sizeof(char), hmm->M+2, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read ca");
3209 if (read_bin_string(hfp->f, &(hmm->comlog)) != eslOK) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read comlog");
3210 if (! fread((char *) &(hmm->nseq), sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read nseq");
3211 if (! fread((char *) &(hmm->eff_nseq), sizeof(float), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read eff_nseq");
3212 if (hfp->format >= p7_HMMFILE_3c) {
3213 if (! fread((char *) &(hmm->max_length), sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read max_length");
3214 }
3215 if (read_bin_string(hfp->f, &(hmm->ctime)) != eslOK) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read ctime");
3216 if ((hmm->flags & p7H_MAP) && ! fread((char *) hmm->map, sizeof(int), hmm->M+1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read map");
3217 if (! fread((char *) &(hmm->checksum), sizeof(uint32_t),1,hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read checksum");
3218
3219 /* E-value parameters and Pfam cutoffs */
3220 if (hfp->format >= p7_HMMFILE_3b) {
3221 if (! fread((char *) hmm->evparam, sizeof(float), p7_NEVPARAM, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read statistical params");
3222 } else if (hfp->format == p7_HMMFILE_3a) {
3223 /* a backward compatibility mode. 3/a files stored 3 floats: LAMBDA, MU, TAU. Read 3 #'s and carefully copy/rearrange them into new 6 format */
3224 if (! fread((char *) hmm->evparam, sizeof(float), 3, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read statistical params");
3225 hmm->evparam[p7_FLAMBDA] = hmm->evparam[0];
3226 hmm->evparam[p7_FTAU] = hmm->evparam[2];
3227 hmm->evparam[p7_VLAMBDA] = hmm->evparam[0];
3228 hmm->evparam[p7_VMU] = hmm->evparam[1];
3229 hmm->evparam[p7_MLAMBDA] = hmm->evparam[p7_VLAMBDA];
3230 hmm->evparam[p7_MMU] = hmm->evparam[p7_VMU];
3231 }
3232 if (! fread((char *) hmm->cutoff, sizeof(float), p7_NCUTOFFS, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read Pfam score cutoffs");
3233 if ((hmm->flags & p7H_COMPO) && ! fread((char *) hmm->compo, sizeof(float), hmm->abc->K, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read model composition");
3234
3235 /* other legacy issues */
3236 if (hfp->format < p7_HMMFILE_3e && (status = p7_hmm_SetConsensus(hmm, NULL)) != eslOK) ESL_XFAIL(status, hfp->errbuf, "Failed to set consensus on legacy HMM format");
3237
3238 if (*ret_abc == NULL) *ret_abc = abc; /* pass our new alphabet back to caller, if caller didn't know it already */
3239 if ( opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm);
3240 return eslOK;
3241
3242 ERROR:
3243 if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc); /* the test is for an alphabet created here, not passed */
3244 if (hmm != NULL) p7_hmm_Destroy(hmm);
3245 if (opt_hmm != NULL) *opt_hmm = NULL;
3246 return status;
3247 }
3248 /*--------------- end, private format parsers -------------------*/
3249
3250
3251
3252
3253
3254 /*****************************************************************
3255 * 6. Other private functions involved in i/o
3256 *****************************************************************/
3257
3258 /*****************************************************************
3259 * Some miscellaneous utility functions
3260 *****************************************************************/
3261
3262 /* Function: read_bin_string()
3263 * Date: SRE, Wed Oct 29 14:03:23 1997 [TWA 721]
3264 *
3265 * Purpose: Read in a string from a binary file, where
3266 * the first integer is the length (including '\0').
3267 * If the length is 0, <*ret_s> is set to <NULL>.
3268 *
3269 * This is a reasonable convention for storing/ reading
3270 * strings in binary files. Note that because the length is
3271 * inclusive of '\0', there's a difference between a NULL
3272 * string and an empty string.
3273 *
3274 * Args: fp - FILE to read from
3275 * ret_s - string to read into
3276 *
3277 * Return: <eslOK> on success. ret_s is malloc'ed here.
3278 * <eslEOD> if a read fails - likely because no more
3279 * data in file.
3280 *
3281 * Throws <eslEMEM> on allocation error.
3282 */
3283 static int
read_bin_string(FILE * fp,char ** ret_s)3284 read_bin_string(FILE *fp, char **ret_s)
3285 {
3286 int status;
3287 char *s = NULL;
3288 int len;
3289
3290 if (! fread((char *) &len, sizeof(int), 1, fp)) { status = eslEOD; goto ERROR; }
3291 if (len > 0) {
3292 ESL_ALLOC(s, (sizeof(char) * len));
3293 if (! fread((char *) s, sizeof(char), len, fp)) { status = eslEOD; goto ERROR; }
3294 }
3295 *ret_s = s;
3296 return eslOK;
3297
3298 ERROR:
3299 if (s != NULL) free(s);
3300 *ret_s = NULL;
3301 return status;
3302 }
3303
3304 /* Function: prob2ascii()
3305 *
3306 * Purpose: Format a probability for output to an ASCII save
3307 * file. Returns a ptr to a static internal buffer.
3308 *
3309 */
3310 static char *
prob2ascii(float p,float null)3311 prob2ascii(float p, float null)
3312 {
3313 static char buffer[32];
3314
3315 if (p == 0.0) return "*";
3316 sprintf(buffer, "%.3f", sreLOG2(p/null));
3317 return buffer;
3318 }
3319
3320
3321 /* Function: ascii2prob()
3322 *
3323 * Purpose: Convert a saved string back to a probability.
3324 */
3325 static float
ascii2prob(char * s,float null)3326 ascii2prob(char *s, float null)
3327 {
3328 return (*s == '*') ? 0. : exp(atof(s)/1.44269504)*null;
3329 }
3330
3331 /* EPN, Tue Aug 7 15:54:15 2007
3332 * is_integer() and is_real(), savagely ripped verbatim out
3333 * of Easel's esl_getopts.c, where they were private.
3334 */
3335 /* Function: is_integer()
3336 *
3337 * Returns TRUE if <s> points to something that atoi() will parse
3338 * completely and convert to an integer.
3339 */
3340 static int
is_integer(char * s)3341 is_integer(char *s)
3342 {
3343 int hex = 0;
3344
3345 if (s == NULL) return 0;
3346 while (isspace((int) (*s))) s++; /* skip whitespace */
3347 if (*s == '-' || *s == '+') s++; /* skip leading sign */
3348 /* skip leading conversion signals */
3349 if ((strncmp(s, "0x", 2) == 0 && (int) strlen(s) > 2) ||
3350 (strncmp(s, "0X", 2) == 0 && (int) strlen(s) > 2))
3351 {
3352 s += 2;
3353 hex = 1;
3354 }
3355 else if (*s == '0' && (int) strlen(s) > 1)
3356 s++;
3357 /* examine remainder for garbage chars */
3358 if (!hex)
3359 while (*s != '\0')
3360 {
3361 if (!isdigit((int) (*s))) return 0;
3362 s++;
3363 }
3364 else
3365 while (*s != '\0')
3366 {
3367 if (!isxdigit((int) (*s))) return 0;
3368 s++;
3369 }
3370 return 1;
3371 }
3372
3373
3374 /* is_real()
3375 *
3376 * Returns TRUE if <s> is a string representation
3377 * of a valid floating point number, convertable
3378 * by atof().
3379 */
3380 static int
is_real(char * s)3381 is_real(char *s)
3382 {
3383 int gotdecimal = 0;
3384 int gotexp = 0;
3385 int gotreal = 0;
3386
3387 if (s == NULL) return 0;
3388
3389 while (isspace((int) (*s))) s++; /* skip leading whitespace */
3390 if (*s == '-' || *s == '+') s++; /* skip leading sign */
3391
3392 /* Examine remainder for garbage. Allowed one '.' and
3393 * one 'e' or 'E'; if both '.' and e/E occur, '.'
3394 * must be first.
3395 */
3396 while (*s != '\0')
3397 {
3398 if (isdigit((int) (*s))) gotreal++;
3399 else if (*s == '.')
3400 {
3401 if (gotdecimal) return 0; /* can't have two */
3402 if (gotexp) return 0; /* e/E preceded . */
3403 else gotdecimal++;
3404 }
3405 else if (*s == 'e' || *s == 'E')
3406 {
3407 if (gotexp) return 0; /* can't have two */
3408 else gotexp++;
3409 }
3410 else if (isspace((int) (*s)))
3411 break;
3412 s++;
3413 }
3414
3415 while (isspace((int) (*s))) s++; /* skip trailing whitespace */
3416 if (*s == '\0' && gotreal) return 1;
3417 else return 0;
3418 }
3419
3420 /*---------------- end, private utilities -----------------------*/
3421
3422 /*****************************************************************
3423 * 7. Legacy v1.0 ascii file format output.
3424 *****************************************************************/
3425
3426 /* Function: cm_file_Write1p0Ascii()
3427 * Incept: EPN, Tue Feb 28 14:35:47 2012
3428 *
3429 * Purpose: Write a CM in version 1.0 format.
3430 * cm_io.c:write_ascii_cm() from Infernal 1.0.2
3431 * was used as the starting point for this function.
3432 *
3433 * Calibration parameters: E-values and HMM filter
3434 * thresholds are not written here. This is because 1.0
3435 * expects either both E-values and HMM filter thresholds
3436 * or neither and HMM filter thresholds don't exist
3437 * in the new format.
3438 *
3439 * Returns: eslOK on success;
3440 */
3441 int
cm_file_Write1p0ASCII(FILE * fp,CM_t * cm)3442 cm_file_Write1p0ASCII(FILE *fp, CM_t *cm)
3443 {
3444 int status;
3445 int v,x,y,nd;
3446 char *comlog2print = NULL;
3447
3448 fprintf(fp, "INFERNAL-1 [converted from %s]\n", INFERNAL_VERSION);
3449
3450 fprintf(fp, "NAME %s\n", cm->name);
3451 if (cm->acc != NULL) fprintf(fp, "ACC %s\n", cm->acc);
3452 if (cm->desc != NULL) fprintf(fp, "DESC %s\n", cm->desc);
3453 /* Rfam cutoffs */
3454 if (cm->flags & CMH_GA) fprintf(fp, "GA %.2f\n", cm->ga);
3455 if (cm->flags & CMH_TC) fprintf(fp, "TC %.2f\n", cm->tc);
3456 if (cm->flags & CMH_NC) fprintf(fp, "NC %.2f\n", cm->nc);
3457 fprintf(fp, "STATES %d\n", cm->M);
3458 fprintf(fp, "NODES %d\n", cm->nodes);
3459 fprintf(fp, "ALPHABET %d\n", cm->abc->type);
3460 fprintf(fp, "ELSELF %.8f\n", cm->el_selfsc);
3461 fprintf(fp, "WBETA %g\n", cm->beta_W);
3462 fprintf(fp, "NSEQ %d\n", cm->nseq);
3463 fprintf(fp, "EFFNSEQ %.3f\n", cm->eff_nseq);
3464 fprintf(fp, "CLEN %d\n", cm->clen);
3465
3466 /* Print out the BCOM line as the cm->comlog string up to the first
3467 * '\n' or the end of the string. Print cm->ctime as BDATE. We
3468 * don't print CCOM and CDATE. This should be okay because we won't
3469 * likely have E-value and HMM filter stats since we're probably
3470 * converting from a 1.1 or later file. But it is possible we're
3471 * converting from a 1.0 file to a 1.0 file in which case we output
3472 * E-value stats and filter threshold stats but no CCOM and CDATE
3473 * lines. This won't affect the parsing of the file by 1.0 though.
3474 */
3475 if(cm->comlog != NULL) {
3476 ESL_ALLOC(comlog2print, sizeof(char) * (strlen(cm->comlog)+1));
3477 x = 0;
3478 while(x < strlen(cm->comlog) && cm->comlog[x] != '\n') { comlog2print[x] = cm->comlog[x]; x++; }
3479 comlog2print[x] = '\0';
3480 fprintf(fp, "BCOM %s\n", comlog2print);
3481 free(comlog2print);
3482 }
3483 if(cm->ctime != NULL) fprintf(fp, "BDATE %s\n", cm->ctime);
3484
3485 fputs( "NULL ", fp);
3486 for (x = 0; x < cm->abc->K; x++)
3487 fprintf(fp, "%6s ", prob2ascii(cm->null[x], 1/(float)(cm->abc->K)));
3488 fputs("\n", fp);
3489
3490 /* E-value statistics skipped in converted output
3491 * mainly because HMM filter thresholds no longer exist
3492 * in current version so we can't output them here.
3493 */
3494 /* main model section */
3495 fputs("MODEL:\n", fp);
3496 for (v = 0; v < cm->M; v++)
3497 {
3498 nd = cm->ndidx[v];
3499
3500 /* Node line.
3501 */
3502 if (cm->nodemap[nd] == v)
3503 fprintf(fp, "\t\t\t\t[ %-4s %4d ]\n", Nodetype(cm->ndtype[nd]), nd);
3504
3505 /* State line, w/ parents, children, and transitions
3506 */
3507 fprintf(fp, " %2s %5d %5d %1d %5d %5d ",
3508 Statetype(cm->sttype[v]), v,
3509 cm->plast[v], cm->pnum[v],
3510 cm->cfirst[v], cm->cnum[v]);
3511 if (cm->sttype[v] != B_st)
3512 for (x = 0; x < cm->cnum[v]; x++)
3513 fprintf(fp, "%7s ", prob2ascii(cm->t[v][x], 1.));
3514 else x = 0;
3515 for (; x < 6; x++)
3516 fprintf(fp, "%7s ", "");
3517
3518 /* Emission line
3519 */
3520 if (cm->sttype[v] == MP_st)
3521 {
3522 for (x = 0; x < cm->abc->K; x++)
3523 for (y = 0; y < cm->abc->K; y++)
3524 fprintf(fp, "%6s ", prob2ascii(cm->e[v][x*cm->abc->K+y], cm->null[x]*cm->null[y]));
3525 }
3526 else if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st || cm->sttype[v] == IL_st || cm->sttype[v] == IR_st)
3527 {
3528 for (x = 0; x < cm->abc->K; x++)
3529 fprintf(fp, "%6s ", prob2ascii(cm->e[v][x], cm->null[x]));
3530 }
3531 fputs("\n", fp);
3532 }
3533 fputs("//\n", fp);
3534 return eslOK;
3535
3536 ERROR:
3537 ESL_EXCEPTION(eslEMEM, "out of memory");
3538 return status;
3539 }
3540
3541 /* multiline()
3542 *
3543 * Stolen from HMMER3, verbatim.
3544 *
3545 * Used to print the command log to ASCII save files.
3546 *
3547 * Given a record (like the comlog) that contains
3548 * multiple lines, print it as multiple lines with
3549 * a given prefix. e.g.:
3550 *
3551 * given: "COM ", "foo\nbar\nbaz"
3552 * print: COM 1 foo
3553 * COM 2 bar
3554 * COM 3 baz
3555 *
3556 * If <s> is NULL, no-op. Otherwise <s> must be a <NUL>-terminated
3557 * string. It does not matter if it ends in <\n> or not. <pfx>
3558 * must be a valid <NUL>-terminated string; it may be empty.
3559 *
3560 * Args: fp: FILE to print to
3561 * pfx: prefix for each line
3562 * s: line to break up and print; tolerates a NULL
3563 *
3564 * Returns: <eslOK> on success.
3565 *
3566 * Throws: <eslEWRITE> on write error.
3567 */
3568 static int
multiline(FILE * fp,const char * pfx,char * s)3569 multiline(FILE *fp, const char *pfx, char *s)
3570 {
3571 char *sptr = s;
3572 char *end = NULL;
3573 int n = 0;
3574 int nline = 1;
3575
3576 do {
3577 end = strchr(sptr, '\n');
3578
3579 if (end != NULL) /* if there's no \n left, end == NULL */
3580 {
3581 n = end - sptr; /* n chars exclusive of \n */
3582 if (fprintf(fp, "%s [%d] ", pfx, nline++) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed");
3583 if (fwrite(sptr, sizeof(char), n, fp) != n) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed"); /* using fwrite lets us write fixed # of chars */
3584 if (fprintf(fp, "\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed"); /* while writing \n w/ printf allows newline conversion */
3585 sptr += n + 1; /* +1 to get past \n */
3586 }
3587 else
3588 {
3589 if (fprintf(fp, "%s [%d] %s\n", pfx, nline++, sptr) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed"); /* last line */
3590 }
3591 } while (end != NULL && *sptr != '\0'); /* *sptr == 0 if <s> terminates with a \n */
3592 return eslOK;
3593 }
3594
3595
3596 /*****************************************************************
3597 * 8. Benchmark driver.
3598 *****************************************************************/
3599 #ifdef CM_FILE_BENCHMARK
3600 /*
3601 gcc -pthread -std=gnu99 -g -Wall -static -o cm_file_benchmark -I. -L. -I../hmmer/src -L../hmmer/src -I../easel -L../easel -DCM_FILE_BENCHMARK cm_file.c -linfernal -lhmmer -leasel -lm
3602 icc -pthread -O3 -static -o cm_file_benchmark -I. -L. -I../hmmer/src -L../hmmer/src -I../easel -L../easel -DCM_FILE_BENCHMARK cm_file.c -linfernal -lhmmer -leasel -lm
3603 ./cm_file_benchmark Rfam.cm
3604 */
3605 #include "esl_config.h"
3606 #include "p7_config.h"
3607 #include "config.h"
3608
3609 #include <stdlib.h>
3610 #include <stdio.h>
3611
3612 #include "easel.h"
3613 #include "esl_getopts.h"
3614 #include "esl_stopwatch.h"
3615
3616 #include "hmmer.h"
3617
3618 #include "infernal.h"
3619
3620 static ESL_OPTIONS options[] = {
3621 /* name type default env range toggles reqs incomp help docgroup*/
3622 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
3623 { "-a", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "include time of CM configuration", 0 },
3624 { "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "verbose: print model info as they're read", 0 },
3625 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3626 };
3627 static char usage[] = "[-options] <CM file>";
3628 static char banner[] = "benchmark driver for CM input";
3629
3630 int
main(int argc,char ** argv)3631 main(int argc, char **argv)
3632 {
3633 ESL_GETOPTS *go = cm_CreateDefaultApp(options, 1, argc, argv, banner, usage);
3634 ESL_STOPWATCH *w = esl_stopwatch_Create();
3635 ESL_STOPWATCH *w2 = esl_stopwatch_Create();
3636 ESL_ALPHABET *abc = NULL;
3637 char *cmfile = esl_opt_GetArg(go, 1);
3638 CM_FILE *cmfp = NULL;
3639 CM_t *cm = NULL;
3640 int nmodel = 0;
3641 uint64_t tot_clen = 0;
3642 int status;
3643 int be_verbose = esl_opt_GetBoolean(go, "-v");
3644 char errbuf[eslERRBUFSIZE];
3645
3646 esl_stopwatch_Start(w);
3647
3648 status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf);
3649 if (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
3650 else if (status == eslEFORMAT) cm_Fail("File format problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
3651 else if (status != eslOK) cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n", status, cmfile, errbuf);
3652 if (cmfp->do_gzip) cm_Fail("Reading gzipped CM files is not supported");
3653 if (cmfp->do_stdin) cm_Fail("Reading CM files from stdin is not supported");
3654
3655 if(be_verbose) esl_stopwatch_Start(w2);
3656 while ((status = cm_file_Read(cmfp, &abc, &cm)) == eslOK)
3657 {
3658 nmodel++;
3659 tot_clen += cm->clen;
3660
3661 if (esl_opt_GetBoolean(go, "-a")) {
3662 if((status = cm_Configure(cm, errbuf, -1)) != eslOK) cm_Fail(errbuf);
3663 }
3664
3665 if(be_verbose) {
3666 esl_stopwatch_Stop(w2);
3667 printf("%-30s ", cm->name);
3668 esl_stopwatch_Display(stdout, w2, "CPU time: ");
3669 esl_stopwatch_Start(w2);
3670 }
3671 FreeCM(cm);
3672 }
3673 if (status == eslEFORMAT) cm_Fail("bad file format in CM file %s\n%s", cmfile, cmfp->errbuf);
3674 else if (status == eslEINCOMPAT) cm_Fail("CM file %s contains different alphabets\n%s", cmfile, cmfp->errbuf);
3675 else if (status != eslEOF) cm_Fail("Unexpected error in reading CMs from %s\n%s", cmfile, cmfp->errbuf);
3676
3677 esl_stopwatch_Stop(w);
3678 esl_stopwatch_Display(stdout, w, "# CPU time: ");
3679 printf("# number of models: %d\n", nmodel);
3680 printf("# total clen: %" PRId64 "\n", tot_clen);
3681
3682 cm_file_Close(cmfp);
3683 esl_alphabet_Destroy(abc);
3684 esl_stopwatch_Destroy(w);
3685 esl_stopwatch_Destroy(w2);
3686 esl_getopts_Destroy(go);
3687 return 0;
3688 }
3689 #endif /*cm_FILE_BENCHMARK*/
3690 /*---------------- end, benchmark driver ------------------------*/
3691
3692
3693 /*****************************************************************
3694 * 9. Example.
3695 *****************************************************************/
3696 /* On using the example to test error messages from cm_file_Open():
3697 * Message
3698 * --------------
3699 * .gz file missing/not readable \rm test.cm.gz; touch test.cm.gz; src/cmfile_example test.cm.gz
3700 * gzip -dc doesn't exist \cp testsuite/20aa.cm test.cm; gzip test.cm; sudo mv /usr/bin/gzip /usr/bin/gzip.old; src/cmfile_example test.cm.gz
3701 * cm file not found \rm test.cm; src/cmfile_example test.cm
3702 * bad SSI file format \cp testsuite/20aa.cm test.cm; \rm test.cm.ssi; touch test.cm.ssi; src/cmfile_example test.cm
3703 * 64-bit SSI on 32-bit sys
3704 * empty file \rm test.cm; touch test.cm
3705 * unrecognized format (binary) cat testsuite/20aa.cm > test.cm; src/cmpress test.cm; \rm test.cm; [edit test.cm.h3m, delete first byte]
3706 * unrecognized format (ascii) cat testsuite/20aa.cm | sed -e 's/^HMMER3\/b/HMMER3\/x/' > test.hmm
3707 *
3708 */
3709
3710 #ifdef CMFILE_EXAMPLE
3711 /* gcc -g -Wall -DCMFILE_EXAMPLE -I. -I../easel -L. -L../easel -o cmfile_example cm_file.c -linfernal -lhmmer -leasel -lm
3712 */
3713 #include "esl_config.h"
3714 #include "p7_config.h"
3715 #include "config.h"
3716
3717 #include "easel.h"
3718 #include "esl_alphabet.h"
3719
3720 #include "hmmer.h"
3721
3722 #include "infernal.h"
3723
3724 int
main(int argc,char ** argv)3725 main(int argc, char **argv)
3726 {
3727 char *cmfile = argv[1];
3728 CM_FILE *cmfp = NULL;
3729 CM_t *cm = NULL;
3730 ESL_ALPHABET *abc = NULL;
3731 char errbuf[eslERRBUFSIZE];
3732 int status;
3733
3734 /* An example of reading a single CM from a file, and checking that it is the only one. */
3735 status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf);
3736 if (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
3737 else if (status == eslEFORMAT) cm_Fail("File format problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
3738 else if (status != eslOK) cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n", status, cmfile, errbuf);
3739
3740 status = cm_file_Read(cmfp, TRUE, &abc, &cm);
3741 if (status == eslEFORMAT) cm_Fail("Bad file format in CM file %s:\n%s\n", cmfp->fname, cmfp->errbuf);
3742 else if (status == eslEINCOMPAT) cm_Fail("CM in %s is not in the expected %s alphabet\n", cmfp->fname, esl_abc_DecodeType(abc->type));
3743 else if (status == eslEOF) cm_Fail("Empty CM file %s? No CM data found.\n", cmfp->fname);
3744 else if (status != eslOK) cm_Fail("Unexpected error in reading CMs from %s\n", cmfp->fname);
3745
3746 status = cm_file_Read(cmfp, TRUE, &abc, NULL);
3747 if (status != eslEOF) cm_Fail("CM file %s does not contain just one CM\n", cmfp->fname);
3748
3749 cm_file_Close(cmfp);
3750
3751 cm_file_WriteASCII(stdout, -1, cm);
3752
3753 esl_alphabet_Destroy(abc);
3754 FreeCM(cm);
3755 return 0;
3756 }
3757 #endif /*CMFILE_EXAMPLE*/
3758 /*----------------------- end, example --------------------------*/
3759
3760