1 /* Input/output of CMs.
2  *
3  * EPN, Fri Jun 17 09:37:56 2011
4  * Based on p7_hmmfile.c copied from HMMER svn revision 3570.
5  *
6  * Contents:
7  *     1. The CM_FILE object for reading CMs
8  *     2. Writing CM files.
9  *     3. API for reading CM files in various formats.
10  *     4. API for reading/writing p7 HMMs/profile filters.
11  *     5. Private, specific CM file format parsers.
12  *     6. Other private functions involved in i/o.
13  *     7. Legacy v1.0 ascii file format output.
14  *     8. Benchmark driver.
15  *     9. Unit tests.
16  *    10. Test driver.
17  *    11. Example.
18  */
19 #include "esl_config.h"
20 #include "p7_config.h"
21 #include "config.h"
22 
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <string.h>
26 
27 #ifdef HMMER_THREADS
28 #include <pthread.h>
29 #endif
30 
31 #include "easel.h"
32 #include "esl_alphabet.h"
33 #include "esl_ssi.h" 		/* this gives us esl_byteswap */
34 #include "esl_vectorops.h" 	/* gives us esl_vec_FCopy()   */
35 
36 #include "hmmer.h"
37 
38 #include "infernal.h"
39 
40 /* Magic numbers identifying binary formats.
41  * Do not change the old magics! Necessary for backwards compatibility.
42  */
43 #if 0 /* temporarily remove all the magic; write backwards compat stuff later */
44 static unsigned int v01magic = 0xe3edb0b1; /* v0.1 binary: "cm01" + 0x80808080 */
45 #endif
46 
47 static uint32_t  v1a_magic  = 0xe3edb0b2; /* v1.1 binary: "cm02" + 0x80808080 */
48 static uint32_t  v1a_fmagic = 0xb1e1e6f3; /* 1/a binary MSV/SSV file: "1afs" = 0x 31 61 66 73  + 0x80808080 */
49 /* Note: 's' at end of 1afs is arbitrary. It is consistent with H3's
50  * trailing 's' iforSSE binary files, but in Infernal this is used
51  * whether we're building for SSE or VMX (since this code is neither
52  * SSE nor VMX-specific). I suppose we could say it stands for SSV,
53  * with 'f' for filter... Also, http://www.ascii-code.com/ and
54  * hmmer/src/impl_sse/io.c might be useful to determine how each code
55  * derives from its text string.
56  */
57 
58 static int read_asc_1p1_cm(CM_FILE *hfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm);
59 static int read_bin_1p1_cm(CM_FILE *hfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm);
60 static int read_asc_1p0_cm(CM_FILE *hfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm);
61 
62 static int   write_bin_string(FILE *fp, char *s);
63 static int   read_bin_string (FILE *fp, char **ret_s);
64 
65 static char *prob2ascii(float p, float null);
66 static float ascii2prob(char *s, float null);
67 static int is_integer(char *s);
68 static int is_real(char *s);
69 
70 /* from p7_hmmfile.c, for reading p7 filters */
71 static uint32_t  v3a_magic = 0xe8ededb6; /* 3/a binary: "hmm6" + 0x80808080 */
72 static uint32_t  v3b_magic = 0xe8ededb7; /* 3/b binary: "hmm7" + 0x80808080 */
73 static uint32_t  v3c_magic = 0xe8ededb8; /* 3/c binary: "hmm8" + 0x80808080 */
74 static uint32_t  v3d_magic = 0xe8ededb9; /* 3/d binary: "hmm9" + 0x80808080 */
75 static uint32_t  v3e_magic = 0xe8ededb0; /* 3/e binary: "hmm0" + 0x80808080 */
76 static uint32_t  v3f_magic = 0xe8ededba; /* 3/f binary: "hmma" + 0x80808080 */
77 
78 static int read_asc30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm);
79 static int read_bin30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm);
80 
81 /*****************************************************************
82  * 1. The CM_FILE object for reading CMs.
83  *****************************************************************/
84 static int open_engine(char *filename, char *env, CM_FILE **ret_cmfp, int do_ascii_only, int allow_1p0, char *errbuf);
85 
86 /* Function:  cm_file_Open()
87  * Synopsis:  Open an CM file <filename>.
88  * Incept:    EPN, Fri Jun 17 09:42:13 2011
89  *            SRE, Tue Dec 21 10:44:38 2010 [Zaragoza] (p7_hmmfile_OpenE())
90  *
91  * Purpose:   Open an CM file <filename>, and prepare to read the first
92  *            CM from it.
93  *
94  *            The format should be INFERNAL1/a or more recent.  If
95  *            <allow_1p0>, we also allow the file to be in Infernal
96  *            v1.0 to v1.0.2 ascii format.
97  *
98  *            We look for <filename> relative to the current working
99  *            directory. Additionally, if we don't find it in the cwd
100  *            and <env> is non-NULL, we will look for <filename>
101  *            relative to one or more directories in a colon-delimited
102  *            list obtained from the environment variable <env>. For
103  *            example, if we had <setenv INFERNALDB
104  *            /misc/db/Rfam:/misc/db/Pfam> in the environment, a
105  *            CM application might pass "INFERNALDB" as <env>.
106  *
107  *            As a special case, if <filename> is "-", then CMs will
108  *            be read from <stdin>. In this case, <env> has no effect.
109  *
110  *            As another special case, if <filename> ends in a <.gz>
111  *            suffix, the file is assumed to be compressed by GNU
112  *            <gzip>, and it is opened for reading from a pipe with
113  *            <gunzip -dc>. This feature is only available on
114  *            POSIX-compliant systems that have a <popen()> call, and
115  *            <HAVE_POPEN> is defined by the configure script at
116  *            compile time.
117  *
118  * Args:      filename  - CM file to open; or "-" for <stdin>
119  *            env       - list of paths to look for <cmfile> in, in
120  *                        addition to current working dir; or <NULL>
121  *            allow_1p0 - TRUE to allow 1.0 formatted files
122  *            ret_cmfp  - RETURN: opened <P7_CMFILE>.
123  *            errbuf    - error message buffer: <NULL>, or a ptr
124  *                        to <eslERRBUFSIZE> chars of allocated space.
125  *
126  * Returns:   <eslOK> on success, and the open <CM_FILE> is returned
127  *            in <*ret_cmfp>.
128  *
129  *            <eslENOTFOUND> if <filename> can't be opened for
130  *            reading, even after the list of directories in <env> (if
131  *            any) is checked.
132  *
133  *            <eslEFORMAT> if <filename> is not in a recognized Infernal
134  *            CM file format.
135  *
136  *            On either type of error, if a non-NULL <errbuf> was provided,
137  *            a useful user error message is left in it.
138  *
139  * Throws:    <eslEMEM> on allocation failure.
140  */
141 int
cm_file_Open(char * filename,char * env,int allow_1p0,CM_FILE ** ret_cmfp,char * errbuf)142 cm_file_Open(char *filename, char *env, int allow_1p0, CM_FILE **ret_cmfp, char *errbuf)
143 {
144   return open_engine(filename, env, ret_cmfp, FALSE, allow_1p0, errbuf);
145 }
146 
147 /* Function:  cm_file_OpenNoDB()
148  * Synopsis:  Open only a CM flatfile, even if pressed db exists.
149  * Incept:    SRE, Tue Dec 21 10:52:35 2010 [Zaragoza]
150  *
151  * Purpose:   Same as <cm_file_OpenE()> except that if a pressed
152  *            database exists for <filename>, it is ignored. Only
153  *            <filename> itself is opened.
154  *
155  *            hmmpress needs this call. Otherwise, it opens a press'ed
156  *            database that it may be about to overwrite.
157  */
158 int
cm_file_OpenNoDB(char * filename,char * env,int allow_1p0,CM_FILE ** ret_cmfp,char * errbuf)159 cm_file_OpenNoDB(char *filename, char *env, int allow_1p0, CM_FILE **ret_cmfp, char *errbuf)
160 {
161   return open_engine(filename, env, ret_cmfp, TRUE, allow_1p0, errbuf);
162 }
163 
164 
165 /* Function:  cm_file_OpenBuffer()
166  * Incept:    EPN, Fri Jun 17 09:47:34 2011
167  *            MSF, Thu Aug 19 2010 [Janelia] (p7_hmmfile_OpenBuffer())
168  *
169  * Purpose:   Perparse a buffer containing an ascii CM for parsing.
170  *
171  *            As another special case, if <filename> ends in a <.gz>
172  *            suffix, the file is assumed to be compressed by GNU
173  *            <gzip>, and it is opened for reading from a pipe with
174  *            <gunzip -dc>. This feature is only available on
175  *            POSIX-compliant systems that have a <popen()> call, and
176  *            <HAVE_POPEN> is defined by the configure script at
177  *            compile time.
178  *
179  * Args:      filename - CM file to open; or "-" for <stdin>
180  *            env      - list of paths to look for <cmfile> in, in
181  *                       addition to current working dir; or <NULL>
182  *            ret_cmfp  - RETURN: opened <CM_FILE>.
183  *
184  * Returns:   <eslOK> on success, and the open <CM_FILE> is returned
185  *            in <*ret_cmfp>.
186  *
187  *            <eslENOTFOUND> if <filename> can't be opened for
188  *            reading, even after the list of directories in <env> (if
189  *            any) is checked.
190  *
191  *            <eslEFORMAT> if <filename> is not in a recognized HMMER
192  *            CM file format.
193  *
194  * Throws:    <eslEMEM> on allocation failure.
195  */
196 int
cm_file_OpenBuffer(char * buffer,int size,int allow_1p0,CM_FILE ** ret_cmfp)197 cm_file_OpenBuffer(char *buffer, int size, int allow_1p0, CM_FILE **ret_cmfp)
198 {
199   CM_FILE    *cmfp = NULL;
200   int         status;
201   char       *tok;
202   int         toklen;
203 
204   ESL_ALLOC(cmfp, sizeof(CM_FILE));
205   cmfp->f            = NULL;
206   cmfp->fname        = NULL;
207   cmfp->do_gzip      = FALSE;
208   cmfp->do_stdin     = FALSE;
209   cmfp->newly_opened = TRUE;	/* well, it will be, real soon now */
210   cmfp->is_pressed   = FALSE;
211 #ifdef HMMER_THREADS
212   cmfp->syncRead     = FALSE;
213 #endif
214   cmfp->parser       = NULL;
215   cmfp->efp          = NULL;
216   cmfp->ffp          = NULL;
217   cmfp->pfp          = NULL;
218   cmfp->ssi          = NULL;
219   cmfp->errbuf[0]    = '\0';
220 
221   if ((cmfp->efp = esl_fileparser_CreateMapped(buffer, size))         == NULL)   { status = eslEMEM; goto ERROR; }
222   if ((status = esl_fileparser_SetCommentChar(cmfp->efp, '#'))        != eslOK)  goto ERROR;
223   if ((status = esl_fileparser_GetToken(cmfp->efp, &tok, &toklen))    != eslOK)  goto ERROR;
224 
225   if      (             strcmp("INFERNAL1/a", tok) == 0) { cmfp->format = CM_FILE_1a; cmfp->parser = read_asc_1p1_cm; }
226   else if (allow_1p0 && strcmp("INFERNAL-1",  tok) == 0) { cmfp->format = CM_FILE_1;  cmfp->parser = read_asc_1p0_cm; }
227 
228   if (cmfp->parser == NULL) { status = eslEFORMAT; goto ERROR; }
229 
230   *ret_cmfp = cmfp;
231   return eslOK;
232 
233  ERROR:
234   if (cmfp != NULL) cm_file_Close(cmfp);
235   *ret_cmfp = NULL;
236   if      (status == eslEMEM)       return status;
237   else if (status == eslENOTFOUND)  return status;
238   else                              return eslEFORMAT;
239 }
240 
241 
242 /* open_engine()
243  *
244  * Implements the file opening functions:
245  * <cm_file_Open()>, <cm_file_OpenNoDB()>,
246  * See their comments above.
247  *
248  * Only returns three types of errors:
249  *    eslENOTFOUND - file (the CM file) or program (gzip, for .gz files) not found
250  *    eslEFORMAT   - bad CM file format (or format of associated file)
251  *    eslEMEM      - allocation failure somewhere
252  * <errbuf>, if non-NULL, will contain a useful error message.
253  *
254  */
255 static int
open_engine(char * filename,char * env,CM_FILE ** ret_cmfp,int do_ascii_only,int allow_1p0,char * errbuf)256 open_engine(char *filename, char *env, CM_FILE **ret_cmfp, int do_ascii_only, int allow_1p0, char *errbuf)
257 {
258   CM_FILE    *cmfp     = NULL;
259   char       *envfile  = NULL;	/* full path to filename after using environment  */
260   char       *dbfile   = NULL;	/* constructed name of an index or binary db file */
261   char       *cmd      = NULL;	/* constructed gzip -dc pipe command              */
262   int         status;
263   int         n       = strlen(filename);
264   union { char c[4]; uint32_t n; } magic;
265   char       *tok;
266   int         toklen;
267 
268   ESL_ALLOC(cmfp, sizeof(CM_FILE));
269   cmfp->f            = NULL;
270   cmfp->fname        = NULL;
271   cmfp->do_gzip      = FALSE;
272   cmfp->do_stdin     = FALSE;
273   cmfp->newly_opened = TRUE;	/* well, it will be, real soon now */
274   cmfp->is_pressed   = FALSE;
275   cmfp->is_binary    = FALSE;
276 #ifdef HMMER_THREADS
277   cmfp->syncRead     = FALSE;
278 #endif
279   cmfp->parser       = NULL;
280   cmfp->efp          = NULL;
281   cmfp->hfp          = NULL;
282   cmfp->ffp          = NULL;
283   cmfp->pfp          = NULL;
284   cmfp->ssi          = NULL;
285   cmfp->errbuf[0]    = '\0';
286 
287   /* 1. There's two special reading modes that have limited indexing
288    *    and optimization capability: reading from standard input, and
289    *    reading a gzip'ped file. Once we've set one of these up and set
290    *    either the <do_stdin> or <do_gzip> flag, we won't try to open
291    *    any associated indexes or binary database files.
292    */
293   if (strcmp(filename, "-") == 0) /* "-" means read from stdin */
294     {
295       cmfp->f        = stdin;
296       cmfp->do_stdin = TRUE;
297       if ((status = esl_strdup("[STDIN]", -1, &(cmfp->fname))) != eslOK)   ESL_XFAIL(status, errbuf, "esl_strdup failed; shouldn't happen");
298     }
299 #ifdef HAVE_POPEN
300   else if (n > 3 && strcmp(filename+n-3, ".gz") == 0) /* a <*.gz> filename means read via gunzip pipe */
301     {
302       if (! esl_FileExists(filename))	                                   ESL_XFAIL(eslENOTFOUND, errbuf, ".gz file %s not found or not readable", filename);
303       if ((status = esl_sprintf(&cmd, "gzip -dc %s", filename)) != eslOK)  ESL_XFAIL(status,       errbuf, "when setting up .gz pipe: esl_sprintf() failed");
304       if ((cmfp->f = popen(cmd, "r")) == NULL)                             ESL_XFAIL(eslENOTFOUND, errbuf, "gzip -dc %s failed; gzip not installed or not in PATH?", filename);
305       if ((status = esl_strdup(filename, n, &(cmfp->fname))) != eslOK)     ESL_XFAIL(status,       errbuf, "esl_strdup() failed, shouldn't happen");
306       cmfp->do_gzip  = TRUE;
307       free(cmd); cmd = NULL;
308     }
309 #endif /*HAVE_POPEN: gzip mode */
310 
311 
312   /* 2. If <cmfp->f> is still NULL, then we're in the usual situation
313    *    of looking for a file on disk. It may either be in the cwd, or
314    *    in one of the directories listed in the <env> string. Find it,
315    *    open it to <cmfp->f>, and set <cmfp->filename>. The
316    *    <cmfp->filename> string will be used later to construct the
317    *    names of expected index and binary database files.
318    */
319   if (cmfp->f == NULL)
320     {
321       if ((cmfp->f = fopen(filename, "r")) != NULL)
322 	{
323 	  if ((status = esl_strdup(filename, n, &(cmfp->fname)))    != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
324 	}
325       else if (esl_FileEnvOpen(filename, env, &(cmfp->f), &envfile) == eslOK)
326 	{
327 	  n = strlen(envfile);
328 	  if ((status = esl_strdup(envfile, n, &(cmfp->fname)))     != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
329 	  free(envfile); envfile = NULL;
330 	}
331       else
332 	{ /* temporarily copy filename over to cmfp->fname, even though we haven't opened anything: we'll next try to open <filename>.h3m  */
333 	  if ((status = esl_strdup(filename, n, &(cmfp->fname)))    != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
334 	}
335     }
336   /* <cmfp->f> may *still* be NULL, if <filename> is a press'ed database and ASCII file is deleted */
337 
338 
339   /* 3. Look for the binary model file component of a press'ed CM database.
340    *
341    *    If <cmfp->f> is still NULL, this is our last chance to find it.
342    *    (The ASCII base file may have been deleted to save space, leaving
343    *    binary press'ed files.)
344    *
345    * If we've been asked to open only an ASCII file -- because we're being
346    * called by cmpress, for example! -- then don't do this.
347    */
348   if (! do_ascii_only && ! cmfp->do_stdin && ! cmfp->do_gzip)
349     {
350       FILE *tmpfp;
351       /* if we opened an ASCII file in the INFERNALDB directory, cmfp->fname contains fully qualified name of file including the path */
352       if ((status = esl_sprintf(&dbfile, "%s.i1m", cmfp->fname) != eslOK)) ESL_XFAIL(status, errbuf, "esl_sprintf() failed; shouldn't happen");
353 
354       if ((tmpfp = fopen(dbfile, "rb")) != NULL)
355 	{
356 	  if (cmfp->f != NULL) fclose(cmfp->f); /* preferentially read the .i1m file, not the original */
357 	  cmfp->f = tmpfp;
358 	  cmfp->is_pressed = TRUE;
359 	  free(cmfp->fname);
360 	  if ((status = esl_strdup(dbfile, -1, &(cmfp->fname)))    != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed, shouldn't happen");
361 	}
362       else if (cmfp->f == NULL && esl_FileEnvOpen(dbfile, env, &(cmfp->f), &envfile) == eslOK)
363 	{ /* found a binary-only press'ed db in one of the env directories. */
364 	  free(cmfp->fname);
365 	  if ((status = esl_strdup(envfile, -1, &(cmfp->fname)))    != eslOK) ESL_XFAIL(status, errbuf, "esl_strdup() failed; shouldn't happen");
366 	  cmfp->is_pressed = TRUE;
367 	}
368       free(dbfile); dbfile = NULL;
369     }
370 
371   /* 4. <cmfp->f> must now point to a valid model input stream: if not, we fail.
372    */
373   if (cmfp->f == NULL)
374     {
375       if (env) ESL_XFAIL(eslENOTFOUND, errbuf, "CM file %s not found (nor an .i1m binary of it); also looked in %s", filename, env);
376       else     ESL_XFAIL(eslENOTFOUND, errbuf, "CM file %s not found (nor an .i1m binary of it)",                    filename);
377     }
378 
379   /* 5. Set up the HMM file <cmfp->hfp> that we'll use to read p7
380    *     filter HMMs from within the CM file. We do this before we
381    *     handle press'd model files, so we can set <cmfp->hfp->ffp>
382    *     and <cmfp->hfp->pfp> if necessary.
383    *
384    */
385   ESL_ALLOC(cmfp->hfp, sizeof(P7_HMMFILE));
386 
387   if (!cmfp->do_stdin && !cmfp->do_gzip) {
388     if ((cmfp->hfp->f = fopen(cmfp->fname, "r")) == NULL) goto ERROR;
389   }
390   else if (cmfp->do_stdin) {
391     cmfp->hfp->f = stdin;
392   }
393 #ifdef HAVE_POPEN /* gzip functionality */
394   else if (cmfp->do_gzip)  {  /* will only possibly be TRUE if HAVE_POPEN */
395     /* we don't open the file separately for cmfp->hfp in gzip case,
396      * which works fine (since we're never a press'd db) but we have
397      * to be careful when closing cmfp in cm_file_Close().
398      */
399     cmfp->hfp->f = cmfp->f;
400   }
401 #endif
402   cmfp->hfp->do_gzip      = cmfp->do_gzip;
403   cmfp->hfp->do_stdin     = cmfp->do_stdin;
404   cmfp->hfp->newly_opened = TRUE;	/* well, it will be, real soon now */
405   cmfp->hfp->is_pressed   = cmfp->is_pressed;
406 #ifdef HMMER_THREADS
407   cmfp->hfp->syncRead     = FALSE;
408 #endif
409   cmfp->hfp->parser       = NULL;
410   cmfp->hfp->efp          = NULL;
411   cmfp->hfp->ffp          = NULL;
412   cmfp->hfp->pfp          = NULL;
413   cmfp->hfp->ssi          = NULL;      /* not sure if this should point to cmfp->ssi */
414   cmfp->hfp->errbuf[0]    = '\0';
415   if ((status = esl_strdup(cmfp->fname, -1, &(cmfp->hfp->fname))) != eslOK) goto ERROR;
416 
417   /* 6. If we found and opened a binary model file .i1m, open the rest of
418    *     the press'd model files. (this can't be true if do_ascii_only is set).
419    *     Note that we set cmfp->hfp->ffp and cmfp->hfp->pfp at the end of the
420    *     function after we've
421    */
422   if (cmfp->is_pressed)
423     {
424       /* here we rely on the fact that the suffixes are .i1{mfpi}, to construct other names from .i1m file name !! */
425       n = strlen(cmfp->fname); 	/* so, n = '\0', n-1 = 'm'  */
426       esl_strdup(cmfp->fname, n, &dbfile);
427 
428       dbfile[n-1] = 'f';	/* the MSV filter part of the optimized sequence profiles (HMMs) */
429       if ((cmfp->ffp      = fopen(dbfile, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1f file found", cmfp->fname);
430       if ((cmfp->hfp->ffp = fopen(dbfile, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1f file found", cmfp->fname);
431 
432       dbfile[n-1] = 'p';	/* the remainder of the optimized sequence profiles (HMMs) */
433       if ((cmfp->hfp->pfp = fopen(dbfile, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1p file found", cmfp->fname);
434 
435       dbfile[n-1] = 'i';	/* the SSI index for the .i1m file */
436       status = esl_ssi_Open(dbfile, &(cmfp->ssi));
437       if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "Opened %s, a pressed CM file; but no .i1i file found", cmfp->fname);
438       else if (status == eslEFORMAT)   ESL_XFAIL(eslEFORMAT,   errbuf, "Opened %s, a pressed CM file; but format of its .i1i file unrecognized", cmfp->fname);
439       else if (status == eslERANGE)    ESL_XFAIL(eslEFORMAT,   errbuf, "Opened %s, a pressed CM file; but its .i1i file is 64-bit and your system is 32-bit", cmfp->fname);
440       else if (status != eslOK)        ESL_XFAIL(eslEFORMAT,   errbuf, "Opened %s, a pressed CM file; but failed to open its .i1i file", cmfp->fname);
441 
442       free(dbfile); dbfile = NULL;
443     }
444   else
445     {
446       if ((status = esl_sprintf(&dbfile, "%s.ssi", cmfp->fname)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
447 
448       status = esl_ssi_Open(dbfile, &(cmfp->ssi)); /* not finding an SSI file is ok. we open it if we find it. */
449       if      (status == eslEFORMAT)   ESL_XFAIL(status, errbuf, "a %s.ssi file exists (an SSI index), but its SSI format is not recognized",     cmfp->fname);
450       else if (status == eslERANGE)    ESL_XFAIL(status, errbuf, "a %s.ssi file exists (an SSI index), but is 64-bit, and your system is 32-bit", cmfp->fname);
451       else if (status != eslOK && status != eslENOTFOUND) ESL_XFAIL(status, errbuf, "esl_ssi_Open() failed");
452       free(dbfile); dbfile = NULL;
453     }
454 
455 
456   /* 7. Check for binary file format. A pressed db is automatically binary: verify. */
457   if (! fread((char *) &(magic.n), sizeof(uint32_t), 1, cmfp->f))  ESL_XFAIL(eslEFORMAT, errbuf, "File exists, but appears to be empty?");
458   if      (magic.n == v1a_magic) { cmfp->format = CM_FILE_1a; cmfp->parser = read_bin_1p1_cm; cmfp->is_binary = TRUE; }
459   else if (cmfp->is_pressed) ESL_XFAIL(eslEFORMAT, errbuf, "Binary format tag in %s unrecognized\nCurrent Infernal format is INFERNAL1/a. Previous binary formats are not supported.", cmfp->fname);
460 
461   /* 8. Checks for ASCII file format */
462   if (cmfp->parser == NULL)
463     {
464       /* Does the magic appear to be binary, yet we didn't recognize it? */
465       if (magic.n & 0x80000000) ESL_XFAIL(eslEFORMAT, errbuf, "Format tag appears binary, but unrecognized\nCurrent Infernal format is INFERNAL1/a. Previous binary formats are not supported.");
466 
467       if ((cmfp->efp = esl_fileparser_Create(cmfp->f))                     == NULL)  ESL_XFAIL(eslEMEM, errbuf, "internal error in esl_fileparser_Create()");
468       if ((status = esl_fileparser_SetCommentChar(cmfp->efp, '#'))        != eslOK)  ESL_XFAIL(status,  errbuf, "internal error in esl_fileparser_SetCommentChar()");
469       if ((status = esl_fileparser_NextLinePeeked(cmfp->efp, magic.c, 4)) != eslOK)  ESL_XFAIL(status,  errbuf, "internal error in esl_fileparser_NextLinePeeked()");
470       if ((status = esl_fileparser_GetToken(cmfp->efp, &tok, &toklen))    != eslOK)  ESL_XFAIL(status,  errbuf, "internal error in esl_fileparser_GetToken()");
471 
472       if      (                 strcmp("INFERNAL1/a", tok) == 0) { cmfp->format = CM_FILE_1a; cmfp->parser = read_asc_1p1_cm; }
473       else if ((  allow_1p0) && strcmp("INFERNAL-1",  tok) == 0) { cmfp->format = CM_FILE_1;  cmfp->parser = read_asc_1p0_cm; }
474       else if ((! allow_1p0) && strcmp("INFERNAL-1",  tok) == 0) { ESL_XFAIL(eslEFORMAT, errbuf, "Format tag is '%s': use cmconvert to reformat Infernal v1.0 to v1.0.2 CM files to current format", tok); }
475       else                                                       { ESL_XFAIL(eslEFORMAT, errbuf, "Format tag is '%s': unrecognized or not supported.", tok); }
476     }
477 
478   *ret_cmfp = cmfp;
479   return eslOK;
480 
481  ERROR:
482   if (cmd     != NULL)  free(cmd);
483   if (dbfile  != NULL)  free(dbfile);
484   if (envfile != NULL)  free(envfile);
485   if (cmfp    != NULL)  cm_file_Close(cmfp);
486   *ret_cmfp = NULL;
487   if      (status == eslEMEM)       return status;
488   else if (status == eslENOTFOUND)  return status;
489   else                              return eslEFORMAT;
490 }
491 
492 /* Function:  cm_file_Close()
493  * Incept:    EPN, Fri Jun 17 10:06:42 2011
494  *            SRE, Wed Jan  3 18:48:44 2007 [Casa de Gatos] (p7_hmmfile_Close())
495  *
496  * Purpose:   Closes an open CM file <cmfp>.
497  *
498  * Returns:   (void)
499  */
500 void
cm_file_Close(CM_FILE * cmfp)501 cm_file_Close(CM_FILE *cmfp)
502 {
503   if (cmfp == NULL) return;
504 
505 #ifdef HAVE_POPEN /* gzip functionality */
506   if (cmfp->do_gzip && cmfp->f != NULL) {
507     pclose(cmfp->f);
508     /* careful here: in gzip mode we defined cmfp->hfp->f == cmfp->f,
509      * instead of reopening for cmfp->hfp->f, so we need to set
510      * cmfp->hfp->f to NULL so p7_hmmfile_Close() doesn't try to close
511      * it.
512      */
513     if(cmfp->f == cmfp->hfp->f) { /* this should be TRUE */
514       cmfp->hfp->f = NULL;
515     }
516     cmfp->f = NULL;
517   }
518 #endif
519   if (!cmfp->do_gzip && !cmfp->do_stdin && cmfp->f != NULL) fclose(cmfp->f);
520   if (cmfp->ffp   != NULL) fclose(cmfp->ffp);
521   if (cmfp->pfp   != NULL) fclose(cmfp->pfp);
522   if (cmfp->fname != NULL) free(cmfp->fname);
523   if (cmfp->efp   != NULL) esl_fileparser_Destroy(cmfp->efp);
524   if (cmfp->ssi   != NULL) esl_ssi_Close(cmfp->ssi);
525 #ifdef HMMER_THREADS
526   if (cmfp->syncRead)      pthread_mutex_destroy (&cmfp->readMutex);
527 #endif
528 
529   if(cmfp->hfp != NULL) p7_hmmfile_Close(cmfp->hfp);
530 
531   free(cmfp);
532 }
533 
534 #ifdef HMMER_THREADS
535 /* Function:  cm_file_CreateLock()
536  * Incept:    EPN, Fri Jun 17 10:07:06 2011
537  *            MSF, Wed July 15 2009 (p7_hmmfile_CreateLock())
538  *
539  * Purpose:   Create a lock to syncronize readers.
540  *
541  * Returns:   <eslOK> on success.
542  */
543 int
cm_file_CreateLock(CM_FILE * cmfp)544 cm_file_CreateLock(CM_FILE *cmfp)
545 {
546   int status;
547 
548   if (cmfp == NULL) return eslEINVAL;
549 
550   /* make sure the lock is not created twice */
551   if (!cmfp->syncRead)
552     {
553       cmfp->syncRead = TRUE;
554       status = pthread_mutex_init(&cmfp->readMutex, NULL);
555       if (status != 0) goto ERROR;
556     }
557 
558   /* create lock on hmm files as well */
559   if (cmfp->hfp != NULL)
560     if((status = p7_hmmfile_CreateLock(cmfp->hfp)) != eslOK) goto ERROR;
561 
562   return eslOK;
563 
564  ERROR:
565   cmfp->syncRead = FALSE;
566   return eslFAIL;
567 }
568 #endif
569 /*----------------- end, CM_FILE object ----------------------*/
570 
571 
572 /*****************************************************************
573  * 2. Writing CM files.
574  *****************************************************************/
575 static int multiline(FILE *fp, const char *pfx, char *s);
576 
577 /* Function:  cm_file_WriteASCII()
578  * Synopsis:  Write an Infernal 1.1 ASCII save file.
579  * Incept:    EPN, Fri Jun 17 10:07:46 2011
580  *            SRE, Tue May 19 09:39:31 2009 [Janelia] (p7_hmmfile_WriteASCII())
581  *
582  * Purpose:   Write a covariance model <cm> in an ASCII save file format to
583  *            an open stream <fp>.
584  *
585  *            Currently only outputs in the default standard format,
586  *            so format must be <CM_FILE_1a> or <-1> (which specifies
587  *            the current default format be used).
588  *            In the future other formats will be accepted.
589  *
590  * Args:      fp     - open stream for writing
591  *            format - -1 for default format, or a 1.x format code like <CM_FILE_1a>
592  *            cm     - CM to save
593  *
594  * Returns:   <eslOK> on success.
595  *
596  * Throws:    <eslEINVAL> if <format> isn't a valid 3.0 format code.
597  */
598 int
cm_file_WriteASCII(FILE * fp,int format,CM_t * cm)599 cm_file_WriteASCII(FILE *fp, int format, CM_t *cm)
600 {
601   int x, v, nd, y;
602 
603   if((cm->flags & CMH_LOCAL_BEGIN) || (cm->flags & CMH_LOCAL_END)) cm_Fail("cm_file_WriteASCII(): CM is in local mode");
604 
605   if (format == -1) format = CM_FILE_1a;
606 
607   if   (format == CM_FILE_1a) fprintf(fp, "INFERNAL1/a [%s | %s]\n", INFERNAL_VERSION, INFERNAL_DATE);
608   else ESL_EXCEPTION(eslEINVAL, "invalid CM file format code");
609 
610   fprintf(fp, "NAME     %s\n", cm->name);
611   if (cm->acc)  fprintf(fp, "ACC      %s\n", cm->acc);
612   if (cm->desc) fprintf(fp, "DESC     %s\n", cm->desc);
613   fprintf(fp, "STATES   %d\n", cm->M);
614   fprintf(fp, "NODES    %d\n", cm->nodes);
615   fprintf(fp, "CLEN     %d\n", cm->clen);
616   fprintf(fp, "W        %d\n", cm->W);
617   fprintf(fp, "ALPH     %s\n", esl_abc_DecodeType(cm->abc->type));
618   fprintf(fp, "RF       %s\n", (cm->flags & CMH_RF)   ? "yes" : "no");
619   fprintf(fp, "CONS     %s\n", (cm->flags & CMH_CONS) ? "yes" : "no");
620   fprintf(fp, "MAP      %s\n", (cm->flags & CMH_MAP)  ? "yes" : "no");
621   if (cm->ctime   != NULL) fprintf  (fp, "DATE     %s\n", cm->ctime);
622   if (cm->comlog  != NULL) multiline(fp, "COM     ",     cm->comlog);
623   fprintf(fp, "PBEGIN   %g\n", cm->pbegin);
624   fprintf(fp, "PEND     %g\n", cm->pend);
625   fprintf(fp, "WBETA    %g\n", cm->beta_W);
626   fprintf(fp, "QDBBETA1 %g\n", cm->qdbinfo->beta1);
627   fprintf(fp, "QDBBETA2 %g\n", cm->qdbinfo->beta2);
628   fprintf(fp, "N2OMEGA  %6g\n",cm->null2_omega);
629   fprintf(fp, "N3OMEGA  %6g\n",cm->null3_omega);
630   fprintf(fp, "ELSELF   %.8f\n",cm->el_selfsc);
631   fprintf(fp, "NSEQ     %d\n", cm->nseq);
632   fprintf(fp, "EFFN     %f\n",cm->eff_nseq);
633   if (cm->flags & CMH_CHKSUM)  fprintf(fp, "CKSUM    %u\n", cm->checksum); /* unsigned 32-bit */
634   fputs("NULL    ", fp);
635   for (x = 0; x < cm->abc->K; x++) { fprintf(fp, "%6s ", prob2ascii(cm->null[x], 1/(float)(cm->abc->K))); }
636   fputc('\n', fp);
637   if (cm->flags & CMH_GA)  fprintf(fp, "GA       %.2f\n", cm->ga);
638   if (cm->flags & CMH_TC)  fprintf(fp, "TC       %.2f\n", cm->tc);
639   if (cm->flags & CMH_NC)  fprintf(fp, "NC       %.2f\n", cm->nc);
640 
641   if (cm->flags & CMH_FP7) {
642     fprintf(fp, "EFP7GF   %.4f %.5f\n", cm->fp7_evparam[CM_p7_GFMU],  cm->fp7_evparam[CM_p7_GFLAMBDA]);
643   }
644   if (cm->flags & CMH_EXPTAIL_STATS)
645     {
646       /* make sure our dbsize values can be cast to a long reliably,
647        * even on 32 bit systems (<= 2 Gb), they certainly should be,
648        * max value in cmcalibrate is 160Mb. (This is related to bug
649        * i31.)
650        */
651       if(cm->expA[EXP_CM_LC]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_LC");
652       if(cm->expA[EXP_CM_GC]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_GC");
653       if(cm->expA[EXP_CM_LI]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_LI");
654       if(cm->expA[EXP_CM_GI]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_GI");
655 
656       fprintf(fp, "ECMLC    %.5f  %10.5f  %10.5f  %10ld  %10d  %.6f\n",
657 	      cm->expA[EXP_CM_LC]->lambda, cm->expA[EXP_CM_LC]->mu_extrap, cm->expA[EXP_CM_LC]->mu_orig,
658 	      (long) (cm->expA[EXP_CM_LC]->dbsize + 0.5), cm->expA[EXP_CM_LC]->nrandhits, cm->expA[EXP_CM_LC]->tailp);
659       fprintf(fp, "ECMGC    %.5f  %10.5f  %10.5f  %10ld  %10d  %.6f\n",
660 	      cm->expA[EXP_CM_GC]->lambda, cm->expA[EXP_CM_GC]->mu_extrap, cm->expA[EXP_CM_GC]->mu_orig,
661 	      (long) (cm->expA[EXP_CM_GC]->dbsize + 0.5), cm->expA[EXP_CM_GC]->nrandhits, cm->expA[EXP_CM_GC]->tailp);
662       fprintf(fp, "ECMLI    %.5f  %10.5f  %10.5f  %10ld  %10d  %.6f\n",
663 	      cm->expA[EXP_CM_LI]->lambda, cm->expA[EXP_CM_LI]->mu_extrap, cm->expA[EXP_CM_LI]->mu_orig,
664 	      (long) (cm->expA[EXP_CM_LI]->dbsize + 0.5), cm->expA[EXP_CM_LI]->nrandhits, cm->expA[EXP_CM_LI]->tailp);
665       fprintf(fp, "ECMGI    %.5f  %10.5f  %10.5f  %10ld  %10d  %.6f\n",
666 	      cm->expA[EXP_CM_GI]->lambda, cm->expA[EXP_CM_GI]->mu_extrap, cm->expA[EXP_CM_GI]->mu_orig,
667 	      (long) (cm->expA[EXP_CM_GI]->dbsize + 0.5), cm->expA[EXP_CM_GI]->nrandhits, cm->expA[EXP_CM_GI]->tailp);
668     }
669 
670   /* main model section */
671   fputs("CM\n", fp);
672 
673   /* Create emit map if nec, so we can output map, consensus and rf info appropriately */
674   if(cm->emap == NULL) {
675     cm->emap = CreateEmitMap(cm);
676     if(cm->emap == NULL) ESL_EXCEPTION(eslEINVAL, "unable to create an emit map");
677   }
678 
679   for (v = 0; v < cm->M; v++) {
680     nd = cm->ndidx[v];
681 
682     /* Node line. node type and additional per-consensus position annotation */
683     if (cm->nodemap[nd] == v) {
684       fprintf(fp, "%45s[ %-4s %4d ]", "", Nodetype(cm->ndtype[nd]), nd);
685 
686       /* additional annotation */
687       /* map (optional) */
688       if(cm->flags & CMH_MAP) {
689 	if     (cm->ndtype[nd] == MATP_nd) fprintf(fp, " %6d %6d", cm->map[cm->emap->lpos[nd]], cm->map[cm->emap->rpos[nd]]);
690 	else if(cm->ndtype[nd] == MATL_nd) fprintf(fp, " %6d %6s", cm->map[cm->emap->lpos[nd]], "-");
691 	else if(cm->ndtype[nd] == MATR_nd) fprintf(fp, " %6s %6d", "-", cm->map[cm->emap->rpos[nd]]);
692 	else 	                           fprintf(fp, " %6s %6s", "-", "-");
693       }
694       else { /* no map annotation */
695 	fprintf(fp, " %6s %6s", "-", "-");
696       }
697       /* consensus sequence (mandatory) */
698       if     (cm->ndtype[nd] == MATP_nd) fprintf(fp, " %c %c", cm->consensus[cm->emap->lpos[nd]], cm->consensus[cm->emap->rpos[nd]]);
699       else if(cm->ndtype[nd] == MATL_nd) fprintf(fp, " %c %c", cm->consensus[cm->emap->lpos[nd]], '-');
700       else if(cm->ndtype[nd] == MATR_nd) fprintf(fp, " %c %c", '-', cm->consensus[cm->emap->rpos[nd]]);
701       else 	                         fprintf(fp, " %c %c", '-', '-');
702       /* RF (optional) */
703       if(cm->flags & CMH_RF) {
704 	if     (cm->ndtype[nd] == MATP_nd) fprintf(fp, " %c %c", cm->rf[cm->emap->lpos[nd]], cm->rf[cm->emap->rpos[nd]]);
705 	else if(cm->ndtype[nd] == MATL_nd) fprintf(fp, " %c %c", cm->rf[cm->emap->lpos[nd]], '-');
706 	else if(cm->ndtype[nd] == MATR_nd) fprintf(fp, " %c %c", '-', cm->rf[cm->emap->rpos[nd]]);
707 	else 	                           fprintf(fp, " %c %c", '-', '-');
708       }
709       else { /* no RF annotation */
710 	fprintf(fp, " %c %c", '-', '-');
711       }
712       fputs("\n", fp);
713     }
714 
715     /* State line, w/ parents, children, dmin2, dmin1, dmax1, dmax2, transitions and emissions */
716     fprintf(fp, "    %2s %5d %5d %1d %5d %5d %5d %5d %5d %5d ",
717 	    Statetype(cm->sttype[v]), v,
718 	    cm->plast[v], cm->pnum[v],
719 	    cm->cfirst[v], cm->cnum[v],
720 	    cm->qdbinfo->dmin2[v], cm->qdbinfo->dmin1[v],
721 	    cm->qdbinfo->dmax1[v], cm->qdbinfo->dmax2[v]);
722 
723     /* Transitions */
724     if (cm->sttype[v] != B_st) {
725       for (x = 0; x < cm->cnum[v]; x++) {
726 	fprintf(fp, "%7s ", prob2ascii(cm->t[v][x], 1.));
727       }
728     }
729     else {
730       x = 0;
731     }
732     for (; x < 6; x++) {
733       fprintf(fp, "%7s ", "");
734     }
735 
736     /* Emissions */
737     if (cm->sttype[v] == MP_st) {
738       for (x = 0; x < cm->abc->K; x++) {
739 	for (y = 0; y < cm->abc->K; y++) {
740 	  fprintf(fp, "%6s ", prob2ascii(cm->e[v][x*cm->abc->K+y], cm->null[x]*cm->null[y]));
741 	}
742       }
743     }
744     else if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st || cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
745       for (x = 0; x < cm->abc->K; x++) {
746 	fprintf(fp, "%6s ", prob2ascii(cm->e[v][x], cm->null[x]));
747       }
748     }
749     fputs("\n", fp);
750   }
751   fputs("//\n", fp);
752 
753   /* print additional p7 hmms if any */
754   if(cm->flags & CMH_FP7 && cm->fp7 != NULL) {
755     p7_hmmfile_WriteASCII(fp, -1, cm->fp7);
756   }
757   return eslOK;
758 }
759 
760 
761 /* Function:  cm_file_WriteBinary()
762  * Incept:    EPN, Fri Jun 17 11:01:17 2011
763  *            SRE, Wed Jan  3 13:50:26 2007 [Janelia] (p7_hmmfile_WriteBinary())
764  * Purpose:   Writes an CM to a file in INFERNAL binary format.
765  *
766  *            Legacy binary file formats will eventually be supported
767  *            by specifying the <format> code, but currently only one
768  *            valid format exists. Passing <-1> as format specifies
769  *            the default current standard format; pass a valid
770  *            code such as <CM_FILE_1a> to select a specific
771  *            binary format.
772  *
773  * Returns:   <eslOK> on success. File position of start of fp7 is
774  *            sent back in <*opt_fp7_offset> if it is non-NULL. If no
775  *            fp7 is written, (<*opt_fp7_offset> is 0) and caller will
776  *            know no fp7 is written because (cm->fp7 == NULL) || (!
777  *            (cm->flags & CMH_FP7)).  <eslFAIL> if any writes fail
778  *            (for instance, if disk fills up, which did happen during
779  *            testing!).
780  *
781  * Throws:    <eslEINVAL> if <format> isn't a valid 3.0 format code.
782  */
783 int
cm_file_WriteBinary(FILE * fp,int format,CM_t * cm,off_t * opt_fp7_offset)784 cm_file_WriteBinary(FILE *fp, int format, CM_t *cm, off_t *opt_fp7_offset)
785 {
786   int v, z;
787   off_t fp7_offset;
788 
789   if((cm->flags & CMH_LOCAL_BEGIN) || (cm->flags & CMH_LOCAL_END)) cm_Fail("cm_file_WriteASCII(): CM is in local mode");
790 
791   if (format == -1) format = CM_FILE_1a;
792 
793   /* ye olde magic number */
794   if      (format == CM_FILE_1a) { if (fwrite((char *) &(v1a_magic), sizeof(uint32_t), 1, fp) != 1) return eslFAIL; }
795   else ESL_EXCEPTION(eslEINVAL, "invalid CM file format code");
796 
797   /* info necessary for sizes of things
798    */
799   if (fwrite((char *) &(cm->flags),      sizeof(int),  1,   fp) != 1) return eslFAIL;
800   if (fwrite((char *) &(cm->M),          sizeof(int),  1,   fp) != 1) return eslFAIL;
801   if (fwrite((char *) &(cm->nodes),      sizeof(int),  1,   fp) != 1) return eslFAIL;
802   if (fwrite((char *) &(cm->clen),       sizeof(int),  1,   fp) != 1) return eslFAIL;
803   if (fwrite((char *) &(cm->abc->type),  sizeof(int),  1,   fp) != 1) return eslFAIL;
804 
805   /* main model section
806    */
807 
808   if (fwrite((char *) cm->sttype,         sizeof(char), cm->M,     fp) != cm->M)     return eslFAIL;
809   if (fwrite((char *) cm->ndidx,          sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
810   if (fwrite((char *) cm->stid,           sizeof(char), cm->M,     fp) != cm->M)     return eslFAIL;
811   if (fwrite((char *) cm->cfirst,         sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
812   if (fwrite((char *) cm->cnum,           sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
813   if (fwrite((char *) cm->plast,          sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
814   if (fwrite((char *) cm->pnum,           sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
815   if (fwrite((char *) cm->nodemap,        sizeof(int),  cm->nodes, fp) != cm->nodes) return eslFAIL;
816   if (fwrite((char *) cm->ndtype,         sizeof(char), cm->nodes, fp) != cm->nodes) return eslFAIL;
817   if (fwrite((char *) cm->qdbinfo->dmin1, sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
818   if (fwrite((char *) cm->qdbinfo->dmax1, sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
819   if (fwrite((char *) cm->qdbinfo->dmin2, sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
820   if (fwrite((char *) cm->qdbinfo->dmax2, sizeof(int),  cm->M,     fp) != cm->M)     return eslFAIL;
821 
822   for (v = 0; v < cm->M; v++) {
823     if (fwrite((char *) cm->t[v], sizeof(float), MAXCONNECT,            fp) != MAXCONNECT)              return eslFAIL;
824     if (fwrite((char *) cm->e[v], sizeof(float), cm->abc->K*cm->abc->K, fp) != (cm->abc->K*cm->abc->K)) return eslFAIL;
825   }
826 
827   /* annotation section
828    */
829   if (                           write_bin_string(fp, cm->name) != eslOK)                                       return eslFAIL;
830   if ((cm->flags & CMH_ACC)  && (write_bin_string(fp, cm->acc)  != eslOK))                                      return eslFAIL;
831   if ((cm->flags & CMH_DESC) && (write_bin_string(fp, cm->desc) != eslOK))                                      return eslFAIL;
832   if ((cm->flags & CMH_RF)   && (fwrite((char *) cm->rf,          sizeof(char), cm->clen+2, fp) != cm->clen+2)) return eslFAIL; /* +2: 1..clen and trailing \0 */
833   if ((cm->flags & CMH_CONS) && (fwrite((char *) cm->consensus,   sizeof(char), cm->clen+2, fp) != cm->clen+2)) return eslFAIL; /* consensus is mandatory */
834   if ((cm->flags & CMH_MAP)  && (fwrite((char *) cm->map,         sizeof(int),  cm->clen+1, fp) != cm->clen+1)) return eslFAIL; /* +2: 1..clen and trailing \0 */
835   if (fwrite((char *) &(cm->W), sizeof(int),      1,   fp) != 1) return eslFAIL;
836 
837   if ((write_bin_string(fp, cm->ctime))  != eslOK) return eslFAIL;
838   if ((write_bin_string(fp, cm->comlog)) != eslOK) return eslFAIL;
839 
840   if (fwrite((char *) &(cm->pbegin),         sizeof(float),    1,   fp) != 1) return eslFAIL;
841   if (fwrite((char *) &(cm->pend),           sizeof(float),    1,   fp) != 1) return eslFAIL;
842   if (fwrite((char *) &(cm->beta_W),         sizeof(double),   1,   fp) != 1) return eslFAIL;
843   if (fwrite((char *) &(cm->qdbinfo->beta1), sizeof(double),   1,   fp) != 1) return eslFAIL;
844   if (fwrite((char *) &(cm->qdbinfo->beta2), sizeof(double),   1,   fp) != 1) return eslFAIL;
845   if (fwrite((char *) &(cm->null2_omega),    sizeof(float),    1,   fp) != 1) return eslFAIL;
846   if (fwrite((char *) &(cm->null3_omega),    sizeof(float),    1,   fp) != 1) return eslFAIL;
847   if (fwrite((char *) &(cm->el_selfsc),      sizeof(float),    1,   fp) != 1) return eslFAIL;
848   if (fwrite((char *) &(cm->nseq),           sizeof(int),      1,   fp) != 1) return eslFAIL;
849   if (fwrite((char *) &(cm->eff_nseq),       sizeof(float),    1,   fp) != 1) return eslFAIL;
850   if (fwrite((char *) &(cm->checksum),       sizeof(uint32_t), 1,   fp) != 1) return eslFAIL;
851   if (fwrite((char *) cm->null,              sizeof(float), cm->abc->K, fp) != cm->abc->K) return eslFAIL;
852 
853   /* Rfam cutoffs
854    */
855   if ((cm->flags & CMH_GA) && (fwrite((char *) &(cm->ga), sizeof(float),    1,   fp) != 1)) return eslFAIL;
856   if ((cm->flags & CMH_TC) && (fwrite((char *) &(cm->tc), sizeof(float),    1,   fp) != 1)) return eslFAIL;
857   if ((cm->flags & CMH_NC) && (fwrite((char *) &(cm->nc), sizeof(float),    1,   fp) != 1)) return eslFAIL;
858 
859   /* E-value parameters
860    */
861   if (cm->flags & CMH_FP7) { /* should always be true */
862     if (fwrite((char *) &(cm->fp7_evparam[CM_p7_GFMU]),     sizeof(float), 1, fp) != 1) return eslFAIL;
863     if (fwrite((char *) &(cm->fp7_evparam[CM_p7_GFLAMBDA]), sizeof(float), 1, fp) != 1) return eslFAIL;
864   }
865   if (cm->flags & CMH_EXPTAIL_STATS) {
866     long dbsize_long;
867     for(z = 0; z < EXP_NMODES; z++) {
868       /* make sure our dbsize values can be cast to a long reliably,
869        * even on 32 bit systems (<= 2 Gb), they certainly should be,
870        * max value in cmcalibrate is 160Mb. (This is related to bug
871        * i31.)
872        */
873       if(cm->expA[z]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big)");
874       dbsize_long = (long) cm->expA[z]->dbsize + 0.5;
875       if (fwrite((char *) &(cm->expA[z]->lambda),    sizeof(double), 1, fp) != 1) return eslFAIL;
876       if (fwrite((char *) &(cm->expA[z]->mu_extrap), sizeof(double), 1, fp) != 1) return eslFAIL;
877       if (fwrite((char *) &(cm->expA[z]->mu_orig),   sizeof(double), 1, fp) != 1) return eslFAIL;
878       if (fwrite((char *) &(dbsize_long),            sizeof(long),   1, fp) != 1) return eslFAIL;
879       if (fwrite((char *) &(cm->expA[z]->nrandhits), sizeof(int),    1, fp) != 1) return eslFAIL;
880       if (fwrite((char *) &(cm->expA[z]->tailp),     sizeof(double), 1, fp) != 1) return eslFAIL;
881     }
882   }
883 
884   /* finally, write the filter p7 HMM */
885   fp7_offset = 0;
886   /* fp7_offset remains 0 if we don't write a fp7, caller will know that no
887    * fp7 was written if (! (cm->flags & CMH_FP7)) || cm->fp7 == NULL
888    */
889   if(cm->flags & CMH_FP7 && cm->fp7 != NULL) {
890     if(opt_fp7_offset != NULL) {
891       /* figure out offset only if we are returning it, otherwise it's irrelevant and can cause a failure
892        * in some cases (e.g. cmconvert -b test.cm | cat > /dev/null), this was bug i41
893        */
894       if((fp7_offset = ftello(fp)) == -1) ESL_EXCEPTION(eslEINVAL, "failed to determine file position for p7 filter");
895     }
896     p7_hmmfile_WriteBinary(fp, -1, cm->fp7);
897   }
898 
899   if(opt_fp7_offset != NULL) *opt_fp7_offset = fp7_offset;
900   return eslOK;
901 }
902 /*----------------- end, save file output  ----------------------*/
903 
904 
905 /* Function: write_bin_string()
906  * Date:     SRE, Wed Oct 29 13:49:27 1997 [TWA 721 over Canada]
907  *
908  * Purpose:  Write a string in binary save format: an integer
909  *           for the string length (including \0), followed by
910  *           the string.
911  *
912  * Return:   <eslOK> on success;
913  *           <eslFAIL> if a write fails due to system error, such
914  *           as a filled disk (as happened in testing).
915  */
916 static int
write_bin_string(FILE * fp,char * s)917 write_bin_string(FILE *fp, char *s)
918 {
919   int len;
920   if (s != NULL)
921     {
922       len = strlen(s) + 1;
923       if (fwrite((char *) &len, sizeof(int),  1,   fp) != 1)   return eslFAIL;
924       if (fwrite((char *) s,    sizeof(char), len, fp) != len) return eslFAIL;
925     }
926   else
927     {
928       len = 0;
929       if (fwrite((char *) &len, sizeof(int), 1, fp) != 1)      return eslFAIL;
930     }
931   return eslOK;
932 }
933 
934 /*****************************************************************
935  * 3. API for reading CMs in various formats.
936  *****************************************************************/
937 
938 /* Function:  cm_file_Read()
939  * Incept:    EPN, Tue Jun 21 10:41:54 2011
940  *            SRE, Sat Jan  6 18:04:58 2007 [Casa de Gatos] (p7_hmmfile_Read())
941  *
942  * Purpose:   Read the next CM from open save file <cmfp>, and
943  *            optionally return this newly allocated CM in <opt_cm>.
944  *            (The optional return is so that an application is
945  *            only interested in whether the file contains a valid
946  *            CM or not -- for example, to verify that a file contains
947  *            only a single CM instead of a database of them.)
948  *
949  *            Read a p7 filter HMM (if one exists) for the CM if
950  *            <read_fp7> is TRUE, else, don't read one. This allows us
951  *            to read only the CM if we're using a pressed file in
952  *            cmscan and already have read the p7 filter HMM
953  *            previously. However be careful with this, because caller
954  *            will have problems if it tries to read the file
955  *            sequentially with <read_fp7> as FALSE because file
956  *            offset will not be at the beginning of the next CM upon
957  *            return, it will be at the beginning of the p7 filter for
958  *            the last CM read, if one exists.
959  *
960  *            Caller may or may not already know what alphabet the CM
961  *            is expected to be in.  A reference to the pointer to the
962  *            current alphabet is passed in <*ret_abc>. If the alphabet
963  *            is unknown, pass <*ret_abc = NULL>, and when the
964  *            new HMM is read, an appropriate new alphabet object is
965  *            allocated and passed back to the caller in <*ret_abc>.
966  *            If the alphabet is already known, <ret_abc> points to
967  *            that object ptr, and the new CM's alphabet type is
968  *            verified to agree with it. This mechanism allows an
969  *            application to let the first CM determine the alphabet
970  *            type for the application, while still keeping the
971  *            alphabet under the application's scope of control.
972  *
973  * Returns:   <eslOK> on success, and the newly allocated CM is
974  *            optionally returned via <opt_cm>. Additionally, if
975  *            <ret_abc> pointed to <NULL>, it now points to a newly
976  *            allocated alphabet.
977  *
978  *            Returns <eslEOF> if no CMs remain in the file; this may
979  *            indicate success or failure, depending on what the
980  *            caller is expecting.
981  *
982  *            Returns <eslEFORMAT> on any format problems, including
983  *            premature end of data or bad magic at the start of a
984  *            binary file. An informative error message is left in
985  *            <cmfp->errbuf>; the filename (fully qualified, if opened
986  *            in a directory specified by an <env> list) is in
987  *            <cmfp->fname>; and if <cmfp->efp> is non-<NULL>, the CM
988  *            file is in an ASCII text format, and the caller may also
989  *            obtain the line number at which the format error was
990  *            detected, in <cmfp->efp->linenumber>, and use it to
991  *            format informative output for a user.
992  *
993  *            Returns <eslEINCOMPAT> if the caller passed a known
994  *            alphabet (a non-<NULL> <*ret_abc>), but the alphabet
995  *            of the CM doesn't match this expectation.
996  *
997  *            Upon any return that is not <eslOK>, <*opt_cm> is
998  *            <NULL> and <*ret_abc> is left unchanged from what caller
999  *            passed it as.
1000  *
1001  * Throws:    <eslEMEM> upon an allocation error.
1002  *            <eslESYS> on failure of other system calls, such
1003  *            as file positioning functions (<fseeko()> or <ftello()>.
1004  */
1005 int
cm_file_Read(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)1006 cm_file_Read(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc,  CM_t **opt_cm)
1007 {
1008   /* A call to SSI to remember file position may eventually go here.  */
1009   return (*cmfp->parser)(cmfp, read_fp7, ret_abc, opt_cm);
1010 }
1011 
1012 /* Function:  cm_file_PositionByKey()
1013  * Synopsis:  Use SSI to reposition file to start of named CM.
1014  * Incept:    EPN, Tue Jun 21 10:46:33 2011
1015  *            SRE, Mon Jun 18 10:57:15 2007 [Janelia] (p7_hmmfile_PositionByKey())
1016  *
1017  * Purpose:   Reposition <cmfp> so the next CM we read will be the
1018  *            one named (or accessioned) <key>.
1019  *
1020  * Returns:   <eslOK> on success.
1021  *
1022  *            Returns <eslENOTFOUND> if <key> isn't found in the index for
1023  *            <cmfp>.
1024  *
1025  *            Returns <eslEFORMAT> is something goes wrong trying to
1026  *            read the index, indicating a file format problem in the
1027  *            SSI file.
1028  *
1029  *            In the event of either error, the state of <cmfp> is left
1030  *            unchanged.
1031  *
1032  * Throws:    <eslEMEM> on allocation failure, or <eslESYS> on system i/o
1033  *            call failure, or <eslEINVAL> if <cmfp> doesn't have an SSI
1034  *            index or is not a seekable stream.
1035  */
1036 int
cm_file_PositionByKey(CM_FILE * cmfp,const char * key)1037 cm_file_PositionByKey(CM_FILE *cmfp, const char *key)
1038 {
1039   uint16_t fh;
1040   off_t    offset;
1041   int      status;
1042 
1043   if (cmfp->ssi == NULL) ESL_EXCEPTION(eslEINVAL, "Need an open SSI index to call cm_file_PositionByKey()");
1044   if ((status = esl_ssi_FindName(cmfp->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status;
1045   if (fseeko(cmfp->f, offset, SEEK_SET) != 0)    ESL_EXCEPTION(eslESYS, "fseek failed");
1046 
1047   cmfp->newly_opened = FALSE;	/* because we're poised on the magic number, and must read it */
1048   return eslOK;
1049 }
1050 
1051 
1052 /* Function:  cm_file_Position()
1053  * Synopsis:  Reposition file to a given offset.
1054  * Incept:    EPN, Tue Jun 21 10:47:48 2011
1055  *            MSF Wed Nov 4, 2009 [Janelia] (p7_hmmfile_Position())
1056  *
1057  * Purpose:   Reposition <cmfp> to position <offset>.
1058  *
1059  * Returns:   <eslOK> on success.
1060  *
1061  *            In the event an error, the state of <cmfp> is left
1062  *            unchanged.
1063  *
1064  * Throws:    <eslESYS> on system i/o call failure, or <eslEINVAL> if
1065  *            <cmfp> is not a seekable stream.
1066  */
1067 int
cm_file_Position(CM_FILE * cmfp,const off_t offset)1068 cm_file_Position(CM_FILE *cmfp, const off_t offset)
1069 {
1070   if (fseeko(cmfp->f, offset, SEEK_SET) != 0)    ESL_EXCEPTION(eslESYS, "fseek failed");
1071 
1072   cmfp->newly_opened = FALSE;	/* because we're poised on the magic number, and must read it */
1073   return eslOK;
1074 }
1075 
1076 /*------------------- end, input API ----------------------------*/
1077 
1078 
1079 /*****************************************************************
1080  * 4. API for reading/writing p7 HMMs/profiles to filter for CMs.
1081  *****************************************************************/
1082 
1083 /* Function:  cm_p7_hmmfile_Read()
1084  * Incept:    EPN, Fri Jul 22 09:21:53 2011
1085  *
1086  * Purpose:   Position an HMM save file <cmfp->hfp> to <offset> and
1087  *            read an HMM from it at that position. Return the HMM
1088  *            in <ret_hmm>.
1089  *
1090  *            Caller must already know the alphabet the HMM is
1091  *            expected to be in, passed in as <abc>.  If the alphabet
1092  *            for the HMM is of a different type than <abc>, we return
1093  *            eslEINCOMPAT.
1094  *
1095  * Returns:   <eslOK> on success, and the newly allocated HMM is
1096  *            in <ret_hmm>.
1097  *
1098  *            Returns <eslEOF> if no HMMs remain in the file; this
1099  *            is a failure, as we expect there to be one at <offset>.
1100  *
1101  *            Returns <eslEFORMAT> on any format problems, including
1102  *            premature end of data or bad magic at the start of a
1103  *            binary file. An informative error message is left in
1104  *            <cmfp->errbuf>; the filename (fully qualified, if opened
1105  *            in a directory specified by an <env> list) is in
1106  *            <cmfp->fname>; and if <cmfp->hfp->efp> is non-<NULL>,
1107  *            the HMM file is in an ASCII text format, and the caller
1108  *            may also obtain the line number at which the format
1109  *            error was detected, in <cmfp->hfp->efp->linenumber>, and
1110  *            use it to format informative output for a user.
1111  *
1112  *            Returns <eslEINCOMPAT> if the alphabet of the HMM doesn't
1113  *            match the expectation in <abc>.
1114  *
1115  *            Upon any return that is not <eslOK>, <*ret_hmm> is
1116  *            <NULL>.
1117  *
1118  * Throws:    <eslEMEM> upon an allocation error.
1119  *            <eslESYS> on failure of other system calls, such
1120  *            as file positioning functions (<fseeko()> or <ftello()>.
1121  */
1122 int
cm_p7_hmmfile_Read(CM_FILE * cmfp,ESL_ALPHABET * abc,off_t offset,P7_HMM ** ret_hmm)1123 cm_p7_hmmfile_Read(CM_FILE *cmfp, ESL_ALPHABET *abc, off_t offset, P7_HMM **ret_hmm)
1124 {
1125   int      status;
1126   P7_HMM  *hmm  = NULL;
1127   char    *tok1 = NULL;
1128   uint32_t magic;
1129 
1130 #ifdef HMMER_THREADS
1131   if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
1132     /* lock the mutex to prevent other threads from reading the file at the same time */
1133     if (cmfp->hfp->syncRead) {
1134       if (pthread_mutex_lock (&cmfp->hfp->readMutex) != 0) ESL_EXCEPTION(eslESYS, "mutex lock failed");
1135     }
1136   }
1137 #endif
1138   if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
1139     if (fseeko(cmfp->hfp->f, offset, SEEK_SET) != 0) ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "Failed to set position for HMM file parser");
1140   }
1141 
1142   /* set the parser if it's unset (which it is only for first HMM read from cmfp->hfp) */
1143   if(cmfp->hfp->parser == NULL) {
1144     if(cmfp->is_binary) { /* CM file is binary, HMM file must be too */
1145       if (! fread((char *) &(magic), sizeof(uint32_t), 1, cmfp->hfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Failed to read magic number at start of p7 additional filters");
1146       if      (magic == v3a_magic) { cmfp->hfp->format = p7_HMMFILE_3a; cmfp->hfp->parser = read_bin30hmm; }
1147       else if (magic == v3b_magic) { cmfp->hfp->format = p7_HMMFILE_3b; cmfp->hfp->parser = read_bin30hmm; }
1148       else if (magic == v3c_magic) { cmfp->hfp->format = p7_HMMFILE_3c; cmfp->hfp->parser = read_bin30hmm; }
1149       else if (magic == v3d_magic) { cmfp->hfp->format = p7_HMMFILE_3d; cmfp->hfp->parser = read_bin30hmm; }
1150       else if (magic == v3e_magic) { cmfp->hfp->format = p7_HMMFILE_3e; cmfp->hfp->parser = read_bin30hmm; }
1151       else if (magic == v3f_magic) { cmfp->hfp->format = p7_HMMFILE_3f; cmfp->hfp->parser = read_bin30hmm; }
1152       else    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Unknown magic number for p7 HMM filter");
1153     }
1154     else { /* CM file is ascii, HMM file must be too */
1155       if(cmfp->hfp->efp != NULL) ESL_XFAIL(eslEINVAL, cmfp->errbuf, "HMM ascii file parsers are out of sync");
1156       if ((cmfp->hfp->efp = esl_fileparser_Create(cmfp->hfp->f)) == NULL)   { status = eslEMEM; goto ERROR; }
1157       if ((status = esl_fileparser_SetCommentChar(cmfp->hfp->efp, '#'))        != eslOK)  goto ERROR;
1158       if ((status = esl_fileparser_GetToken(cmfp->hfp->efp, &tok1, NULL))      != eslOK)  goto ERROR;
1159 
1160       if      (strcmp("HMMER3/f", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3f; cmfp->hfp->parser = read_asc30hmm; }
1161       else if (strcmp("HMMER3/e", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3e; cmfp->hfp->parser = read_asc30hmm; }
1162       else if (strcmp("HMMER3/d", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3d; cmfp->hfp->parser = read_asc30hmm; }
1163       else if (strcmp("HMMER3/c", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3c; cmfp->hfp->parser = read_asc30hmm; }
1164       else if (strcmp("HMMER3/b", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3b; cmfp->hfp->parser = read_asc30hmm; }
1165       else if (strcmp("HMMER3/a", tok1) == 0) { cmfp->hfp->format = p7_HMMFILE_3a; cmfp->hfp->parser = read_asc30hmm; }
1166       else    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Unknown format for p7 HMM filter");
1167     }
1168   }
1169 
1170   /* read the HMM */
1171   status = p7_hmmfile_Read(cmfp->hfp, &abc, &hmm);
1172   if      (status == eslEOF)       ESL_XFAIL(status, cmfp->errbuf, "failed to read profile HMM piece, CM file may be truncated?");
1173   else if (status == eslEFORMAT)   ESL_XFAIL(status, cmfp->errbuf, "bad file format for HMM filter");
1174   else if (status == eslEINCOMPAT) ESL_XFAIL(status, cmfp->errbuf, "HMM filters are of different alphabets");
1175   else if (status != eslOK)        ESL_XFAIL(status, cmfp->errbuf, "Unexpected error in reading HMM filters");
1176 
1177 #ifdef HMMER_THREADS
1178   if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
1179     if (cmfp->hfp->syncRead) {
1180       if (pthread_mutex_unlock (&cmfp->hfp->readMutex) != 0) ESL_EXCEPTION(eslESYS, "mutex unlock failed");
1181     }
1182   }
1183 #endif
1184 
1185   *ret_hmm = hmm;
1186   return status;
1187 
1188  ERROR:
1189 #ifdef HMMER_THREADS
1190   if(cmfp->hfp->f != NULL) {
1191     if (cmfp->hfp->syncRead) {
1192       if (pthread_mutex_unlock (&cmfp->hfp->readMutex) != 0) ESL_EXCEPTION(eslESYS, "mutex unlock failed");
1193     }
1194   }
1195 #endif
1196 
1197   if(hmm != NULL) p7_hmm_Destroy(hmm);
1198   *ret_hmm = NULL;
1199   return status;
1200 }
1201 
1202 /* Function:  cm_p7_oprofile_Write()
1203  * Synopsis:  Write an optimized p7 profile in two files.
1204  * Incept:    EPN, Fri Jul  8 06:57:50 2011
1205  *
1206  * Purpose:   Write the MSV filter part of <om> to open binary stream
1207  *            <ffp>, and the rest of the model to <pfp>. These two
1208  *            streams will typically be <.i1f> and <.i1p> files
1209  *            being created by cmpress.
1210  *
1211  *            Most of the work is done by p7_oprofile_Write(). This
1212  *            function is only necessary to write five pieces of data
1213  *            that are specific to CMs to the MSV file and one
1214  *            to the profile file:
1215  *
1216  *            1. <cm_offset>: the position in the corresponding <.i1m>
1217  *            file at which the CM corresponding to the optimized
1218  *            profile can be found.
1219  *
1220  *            2. <cm_clen>:  consensus length of the CM this p7 is a
1221  *            filter for.
1222  *
1223  *            3. <cm_W>:     window length of the CM this p7 is a
1224  *            filter for.
1225  *
1226  *            4. <cm_nbp>:   number of base pairs in CM (if 0, pipeline
1227  *            will be run in HMM only mode.
1228  *
1229  *            5. <gfmu>:     glocal forward mu for this <om>
1230  *
1231  *            6. <gflambda>: glocal forward lambda for this <om>
1232  *
1233  * Args:      ffp            - open binary stream for saving MSV filter part
1234  *            pfp            - open binary stream for saving rest of profile
1235  *            cm_offset      - disk offset for CM in <.i1m> file that corresponds
1236  *                             to this profile
1237  *            cm_clen        - consensus length of CM corresponding to this om (usually om->M)
1238  *            cm_W           - window length for the CM
1239  *            cm_nbp         - number of basepairs in the CM
1240  *            gfmu           - E value mu param for glocal forward for this om
1241  *            gflambda       - E value lambda param for glocal forward for this om
1242  *            om             - optimized profile to save
1243  *
1244  * Returns:   <eslOK> on success.
1245  *
1246  *            Returns <eslFAIL> on any write failure; for example,
1247  *            if disk is full.
1248  *
1249  * Throws:    (no abnormal error conditions)
1250  */
1251 int
cm_p7_oprofile_Write(FILE * ffp,FILE * pfp,off_t cm_offset,int cm_clen,int cm_W,int cm_nbp,float gfmu,float gflambda,P7_OPROFILE * om)1252 cm_p7_oprofile_Write(FILE *ffp, FILE *pfp, off_t cm_offset, int cm_clen, int cm_W, int cm_nbp, float gfmu, float gflambda, P7_OPROFILE *om)
1253 {
1254   /* <ffp> is the part of the oprofile that MSVFilter() needs */
1255   if (fwrite((char *) &(v1a_fmagic),     sizeof(uint32_t), 1,  ffp) != 1) return eslFAIL;
1256   if (fwrite((char *) &(cm_offset),      sizeof(off_t),    1,  ffp) != 1) return eslFAIL;
1257   if (fwrite((char *) &(cm_clen),        sizeof(int),      1,  ffp) != 1) return eslFAIL;
1258   if (fwrite((char *) &(cm_W),           sizeof(int),      1,  ffp) != 1) return eslFAIL;
1259   if (fwrite((char *) &(cm_nbp),         sizeof(int),      1,  ffp) != 1) return eslFAIL;
1260   if (fwrite((char *) &(gfmu),           sizeof(float),    1,  ffp) != 1) return eslFAIL;
1261   if (fwrite((char *) &(gflambda),       sizeof(float),    1,  ffp) != 1) return eslFAIL;
1262 
1263   /* <pfp> gets the rest of the oprofile, we don't need to write anything extra, so
1264    * p7_oprofile_Write handles it all */
1265 
1266   /* pass to p7_oprofile_Write to do the rest */
1267   return p7_oprofile_Write(ffp, pfp, om);
1268 }
1269 
1270 
1271 /* Function:  cm_p7_oprofile_Position()
1272  * Synopsis:  Reposition the hmm filter file part of a CM file to an offset.
1273  * Incept:    EPN, Fri Jul 22 10:49:41 2011
1274  *            MSF, Thu Oct 15, 2009 [Janelia] (p7_oprofile_Position())
1275  *
1276  * Purpose:   Reposition an open <cmfp->ffp> to offset <offset>.
1277  *            <offset> would usually be the first byte of a
1278  *            desired hmm record.
1279  *
1280  * Returns:   <eslOK>     on success;
1281  *            <eslEOF>    if no data can be read from this position.
1282  *
1283  * Throws:    <eslEINVAL>  if the <sqfp> is not positionable.
1284  *            <eslEFORMAT> if no msv profile opened.
1285  *            <eslESYS>    if the fseeko() call fails.
1286  */
1287 int
cm_p7_oprofile_Position(CM_FILE * cmfp,off_t offset)1288 cm_p7_oprofile_Position(CM_FILE *cmfp, off_t offset)
1289 {
1290   if (cmfp->ffp == NULL)  ESL_EXCEPTION(eslEFORMAT, cmfp->errbuf, "no MSV profile file; cmpress probably wasn't run");
1291   if (cmfp->do_stdin)     ESL_EXCEPTION(eslEINVAL, "can't Position() in standard input");
1292   if (cmfp->do_gzip)      ESL_EXCEPTION(eslEINVAL, "can't Position() in a gzipped file");
1293   if (offset < 0)         ESL_EXCEPTION(eslEINVAL, "bad offset");
1294 
1295   if (fseeko(cmfp->ffp, offset, SEEK_SET) != 0) ESL_EXCEPTION(eslESYS, "fseeko() failed");
1296 
1297   return eslOK;
1298 }
1299 
1300 
1301 /* Function:  cm_p7_oprofile_ReadMSV()
1302  * Synopsis:  Read MSV filter part of an optimized profile.
1303  * Incept:    EPN, Fri Jul  8 08:19:58 2011
1304  *
1305  * Purpose:   Read the MSV filter part of a p7 filter profile from the
1306  *            <.i1f> file associated with an open CM file <cmfp>.
1307  *            Allocate a new model, populate it with this minimal
1308  *            MSV filter information, and return a pointer to them
1309  *            in <*ret_om>. Return pointers to additional information
1310  *            on the model in <*ret_cm_offset>, <*ret_cm_clen> and
1311  *            <*ret_cm_W>.
1312  *
1313  *            Our alphabet may get set by the first HMM we read.  If
1314  *            <*byp_abc> is <NULL> at start, create a new alphabet and
1315  *            return a pointer to it in <*byp_abc>. If <*byp_abc> is
1316  *            non-<NULL>, it is assumed to be a pointer to an existing
1317  *            alphabet; we verify that the HMM's alphabet matches it
1318  *            and <*ret_abc> isn't changed.  This is the same
1319  *            convention used by <cm_file_Read()>.
1320  *
1321  *            The <.i1f> file was opened automatically, if it existed,
1322  *            when the CM file was opened with <cm_file_Open()>.
1323  *
1324  *            When no more HMMs remain in the file, return <eslEOF>.
1325  *
1326  *            Most of the work is done by p7_oprofile_ReadMSV(). This
1327  *            function is only necessary to read five pieces of data
1328  *            that are specific to CMs. See cm_p7_oprofile_Write()
1329  *            for more explanation.
1330  *
1331  *            If <read_scores> is TRUE: read all the MSV data
1332  *            (including scores) using p7_oprofile_ReadMSV().
1333  *            If <read_scores> is FALSE: read only the MSV info
1334  *            (no scores) using p7_oprofile_ReadInfoMSV().
1335  *
1336  *
1337  * Args:      cmfp    - open CM file, with associated .i1p file
1338  *            byp_abc - BYPASS: <*byp_abc == ESL_ALPHABET *> if known;
1339  *                              <*byp_abc == NULL> if desired;
1340  *                              <NULL> if unwanted.
1341  *            ret_cm_offset   - RETURN: offset of CM  each om corresponds to
1342  *            ret_cm_clen     - RETURN: clen of CM each om corresponds to
1343  *            ret_cm_W        - RETURN: W of CM each om corresponds to
1344  *            ret_cm_nbp      - RETURN: number of bps in CM each om corresponds to
1345  *            ret_gfmu        - RETURN: glocal fwd mu the om
1346  *            ret_gflambda    - RETURN: glocal forward lambda the om
1347  *            ret_om          - RETURN: the read <om> with MSV filter
1348  *                              data filled in.
1349  *
1350  * Returns:   <eslOK> on success. <*ret_om> is allocated here;
1351  *            caller free's with <p7_oprofile_Destroy()>.
1352  *            <*byp_abc> is allocated here if it was requested;
1353  *            caller free's with <esl_alphabet_Destroy()>.
1354  *
1355  *            Returns <eslEFORMAT> if <cmfp> has no <.i1f> file open,
1356  *            or on any parsing error.
1357  *
1358  *            Returns <eslEINCOMPAT> if the HMM we read is incompatible
1359  *            with the existing alphabet <*byp_abc> led us to expect.
1360  *
1361  *            On any returned error, <cmfp->errbuf> contains an
1362  *            informative error message.
1363  *
1364  * Throws:    <eslEMEM> on allocation error.
1365  */
1366 int
cm_p7_oprofile_ReadMSV(CM_FILE * cmfp,int read_scores,ESL_ALPHABET ** byp_abc,off_t * ret_cm_offset,int * ret_cm_clen,int * ret_cm_W,int * ret_cm_nbp,float * ret_gfmu,float * ret_gflambda,P7_OPROFILE ** ret_om)1367 cm_p7_oprofile_ReadMSV(CM_FILE *cmfp, int read_scores, ESL_ALPHABET **byp_abc, off_t *ret_cm_offset, int *ret_cm_clen, int *ret_cm_W, int *ret_cm_nbp, float *ret_gfmu, float *ret_gflambda, P7_OPROFILE **ret_om)
1368 {
1369   int status         = eslOK;      /* return status */
1370   uint32_t magic;                  /* magic number used to verify format */
1371   off_t cm_offset;                 /* offset of the corresponding CM  in cmfp->mfp */
1372   int   cm_clen;                   /* CM consensus length (will likely be om->M) */
1373   int   cm_W;                      /* CM window length (will likely differ from om->max_length) */
1374   int   cm_nbp;                    /* number of basepairs in CM (if 0 pipeline will be run in HMM only mode) */
1375   float gfmu;                      /* glocal fwd mu parameter for current hmm */
1376   float gflambda;                  /* glocal fwd lambda parameter for current hmm */
1377   P7_OPROFILE *om = NULL;          /* the om we've read */
1378 
1379   cmfp->errbuf[0] = '\0';
1380   if (cmfp->ffp == NULL)    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "no MSV profile file; cmpress probably wasn't run");
1381 
1382   if (feof(cmfp->ffp))                                           { status = eslEOF; goto ERROR; } /* normal EOF: no more profiles */
1383   if (! fread( (char *) &magic, sizeof(uint32_t), 1, cmfp->ffp)) { status = eslEOF; goto ERROR; }
1384   if (magic != v1a_fmagic)  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "bad magic; not a CM database?");
1385 
1386   if (! fread( (char *) &cm_offset,       sizeof(off_t),    1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM offset");
1387   if (! fread( (char *) &cm_clen,         sizeof(int),      1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM consensus length");
1388   if (! fread( (char *) &cm_W,            sizeof(int),      1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM window length (W)");
1389   if (! fread( (char *) &cm_nbp,          sizeof(int),      1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM number of bps");
1390   if (! fread( (char *) &gfmu,            sizeof(int),      1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read glocal fwd mu parameter");
1391   if (! fread( (char *) &gflambda,        sizeof(int),      1, cmfp->ffp)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read glocal fwd lambda parameter");
1392 
1393   /* move HMM file position (cmfp->hfp->ffp) to where we are in CM file (cmfp->ffp) */
1394   if (fseeko(cmfp->hfp->ffp, ftello(cmfp->ffp), SEEK_SET) != 0)               ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "failed to set position for HMM file parser (MSV)");
1395   if (read_scores) {
1396     if ((status = p7_oprofile_ReadMSV(cmfp->hfp, byp_abc, &om))     != eslOK) ESL_XFAIL(status,       cmfp->errbuf, "failed to read MSV filter model");
1397   }
1398   else {
1399     if ((status = p7_oprofile_ReadInfoMSV(cmfp->hfp, byp_abc, &om)) != eslOK) ESL_XFAIL(status,       cmfp->errbuf, "failed to read MSV info");
1400   }
1401   /* move CM file position (cmfp->ffp) to where we are in HMM file (cmfp->hfp->ffp) */
1402   if (fseeko(cmfp->ffp, ftello(cmfp->hfp->ffp), SEEK_SET) != 0) ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "Failed to set position for CM file parser (MSV)");
1403 
1404   if(ret_cm_offset != NULL) *ret_cm_offset  = cm_offset;
1405   if(ret_cm_clen   != NULL) *ret_cm_clen    = cm_clen;
1406   if(ret_cm_W      != NULL) *ret_cm_W       = cm_W;
1407   if(ret_cm_nbp    != NULL) *ret_cm_nbp     = cm_nbp;
1408   if(ret_gfmu      != NULL) *ret_gfmu       = gfmu;
1409   if(ret_gflambda  != NULL) *ret_gflambda   = gflambda;
1410   if(ret_om        != NULL) *ret_om         = om;
1411   return eslOK;
1412 
1413  ERROR:
1414   if (om != NULL) p7_oprofile_Destroy(om);
1415   if (ret_om != NULL) *ret_om = NULL;
1416   return status;
1417 }
1418 
1419 /* Function:  cm_p7_oprofile_ReadBlockMSV()
1420  * Synopsis:  Read the next block of MSV filter parts from a HMM file.
1421  * Incept:    EPN, Thu Jul 21 04:44:19 2011
1422  *
1423  * Purpose:   Reads a block of the MSV filter parts of optimized
1424  *            p7 profiles from open cm file <cmfp> into
1425  *            <hmmBlock>.
1426  *
1427  *            Most of the work is done by p7_oprofile_ReadMSV(). This
1428  *            function is only necessary to read five pieces of data
1429  *            that are specific to CMs. See cm_p7_oprofile_Write()
1430  *            for more explanation.
1431  *
1432  * Args:      cmfp    - open CM file, with associated .i1f file
1433  *            idx0    - the index of the next profile in the file (0 if first profile)
1434  *            byp_abc - BYPASS: <*byp_abc == ESL_ALPHABET *> if known;
1435  *                              <*byp_abc == NULL> if desired;
1436  *                              <NULL> if unwanted.
1437  *            hmmBlock- RETURN: the block of profiles.
1438  *
1439  * Returns:   <eslOK> on success; block in <hmmBlock>.
1440  *
1441  *            Returns <eslEFORMAT> if <cmfp> has no <.i1f> file open,
1442  *            or on any parsing error.
1443  *
1444  *            Returns <eslEINCOMPAT> if any HMM we read is incompatible
1445  *            with the existing alphabet <*byp_abc> led us to expect.
1446  *
1447  *            Returns <eslEOF> when there is no profiles left in the
1448  *            file (including first attempt to read an empty file).
1449  */
1450 int
cm_p7_oprofile_ReadBlockMSV(CM_FILE * cmfp,int64_t idx0,ESL_ALPHABET ** byp_abc,CM_P7_OM_BLOCK * hmmBlock)1451 cm_p7_oprofile_ReadBlockMSV(CM_FILE *cmfp, int64_t idx0, ESL_ALPHABET **byp_abc, CM_P7_OM_BLOCK *hmmBlock)
1452 {
1453   int status         = eslOK;      /* return status */
1454   int i;
1455 
1456   hmmBlock->count = 0;
1457   hmmBlock->idx0  = idx0;
1458   for (i = 0; i < hmmBlock->listSize; ++i)
1459     {
1460       status = cm_p7_oprofile_ReadMSV(cmfp, TRUE, byp_abc,
1461 				      &(hmmBlock->cm_offsetA[i]),
1462 				      &(hmmBlock->cm_clenA[i]),
1463 				      &(hmmBlock->cm_WA[i]),
1464 				      &(hmmBlock->cm_nbpA[i]),
1465 				      &(hmmBlock->gfmuA[i]),
1466 				      &(hmmBlock->gflambdaA[i]),
1467 				      &(hmmBlock->list[i]));
1468       if (status != eslOK) break;
1469       ++hmmBlock->count;
1470     }
1471   /* create the MSV data */
1472   for (i = 0; i < hmmBlock->count; ++i) {
1473     hmmBlock->msvdataA[i] = p7_hmm_ScoreDataCreate(hmmBlock->list[i], FALSE);
1474   }
1475   /* initiate the clan idx data, caller will need to fill this if nec */
1476   for (i = 0; i < hmmBlock->count; ++i) {
1477     hmmBlock->clan_idxA[i] = -1;
1478   }
1479 
1480   /* EOF will be returned only in the case were no profiles were read */
1481   if (status == eslEOF && i > 0) status = eslOK;
1482 
1483   return status;
1484 }
1485 
1486 /*----------------- end of API for p7 filters ----------------------*/
1487 
1488 
1489 /*****************************************************************
1490  * 5.  Private, specific profile CM file format parsers.
1491  *****************************************************************/
1492 
1493 /* Parsing save files from INFERNAL 1.x
1494  * All parsers follow the same API.
1495  *
1496  * Returns <eslOK> on success, and if <opt_cm> is non-NULL,
1497  * <*opt_cm> points at a newly allocated CM.
1498  *
1499  * Additionally, if <*ret_abc> was NULL, then a new alphabet is
1500  * allocated according to the alphabet type of this CM, and returned
1501  * thru <ret_abc>.  This allocation mechanism allows a main()
1502  * application that doesn't yet know its alphabet to determine the
1503  * alphabet when the first CM is read, while also allowing an
1504  * application to allocate its own alphabet and assure that the
1505  * input CMs are appropriate for that alphabet.
1506  *
1507  * Returns <eslEOF> when no CM remains in the file, indicating a
1508  * normal end-of-file.
1509  *
1510  * Two types of "normal error" may happen, which the caller must check
1511  * for. Returns <eslEFORMAT> on any save file format error, including
1512  * bad magic (i.e. this is not an INFERNAL file at all). Returns
1513  * <eslEINCOMPAT> if the expected alphabet (a non-<NULL> alphabet
1514  * specified by <*ret_abc>) does not match the alphabet type of the
1515  * HMM.
1516  *
1517  * When these normal errors occur, the caller can construct its error
1518  * message from:
1519  *    <cmfp->errbuf>:    contains an informative error message
1520  *    <cmfp->fname>:     name of the CM file (or '-' if STDIN)
1521  * and if <cmfp->efp> is non-<NULL>, the CM file is in ASCII text,
1522  * and the caller may also use:
1523  *    <cmfp->efp->linenumber>: line on which the parse error occurred.
1524  *
1525  * Throws:     <eslEMEM> on allocation error.
1526  *             <eslESYS> if a system i/o call fails.
1527  *             In cases of error (including both thrown error and normal error), <*ret_abc>
1528  *             is left in its original state as passed by the caller, and <*ret_hmm> is
1529  *             returned <NULL>.
1530  */
1531 static int
read_asc_1p1_cm(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)1532 read_asc_1p1_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm)
1533 {
1534   int           status;
1535   ESL_ALPHABET *abc  = NULL;
1536   CM_t         *cm   = NULL;
1537   P7_HMM       *hmm  = NULL;
1538   char         *tag  = NULL;
1539   char         *tok1 = NULL;
1540   char         *tok2 = NULL;
1541   char         *tok3 = NULL;
1542   char         *tok4 = NULL;
1543   char         *tok5 = NULL;
1544   char         *tok6 = NULL;
1545   int           alphatype;
1546   off_t         offset = 0;
1547   off_t         fp7_offset = 0;
1548   int           v, x, y, nd;            /* counters */
1549   int           read_fp7_stats = FALSE;
1550   uint32_t      cm_statstracker = 0; /* for making sure we have all CM E-value stats, if we have any */
1551   int           exp_mode;
1552   int           read_el_selfsc = FALSE; /* set to true when we read ELSELF line */
1553 
1554   /* temporary parameters, for storing values prior to their allocation in the CM */
1555   float *tmp_null          = NULL;
1556   float  tmp_fp7_gfmu;
1557   float  tmp_fp7_gflambda;
1558   double tmp_qdbbeta1;
1559   double tmp_qdbbeta2;
1560 
1561   /* temporary per-node annotation, will be converted to per-consensus position once the model
1562    * architecture is known, after the full model is read */
1563   char *tmp_rf_left    = NULL;
1564   char *tmp_rf_right   = NULL;
1565   char *tmp_cons_left  = NULL;
1566   char *tmp_cons_right = NULL;
1567   int  *tmp_map_left   = NULL;
1568   int  *tmp_map_right  = NULL;
1569 
1570   cmfp->errbuf[0] = '\0';
1571 
1572   if (cmfp->newly_opened)
1573     {
1574       offset            = 0;
1575       cmfp->newly_opened = FALSE;
1576     }
1577   else
1578     {
1579       /* Record where this CM starts on disk */
1580       if ((! cmfp->do_stdin) && (! cmfp->do_gzip) && (offset = ftello(cmfp->f)) < 0)   ESL_XEXCEPTION(eslESYS, "ftello() failed");
1581 
1582       /* First line of file: "INFERNAL1/a". Allocate shell for CM annotation information (we don't know M,nodes yet) */
1583       if ((status = esl_fileparser_NextLine(cmfp->efp))                   != eslOK)  goto ERROR;  /* EOF here is normal; could also be a thrown EMEM */
1584       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tag, NULL)) != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "unexpected absence of tokens on data line");
1585 
1586       if      (cmfp->format == CM_FILE_1a) { if (strcmp(tag, "INFERNAL1/a") != 0)    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Didn't find INFERNAL1/a tag: bad format or not an INFERNAL save file?"); }
1587       else                                                                           ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No such CM file format code: this shouldn't happen");
1588     }
1589 
1590   if ((cm = CreateCMShell()) == NULL)   ESL_XFAIL(eslEMEM,    cmfp->errbuf, "allocation failure, CM shell");
1591   cm->offset = offset;
1592 
1593   /* Header section */
1594   while ((status = esl_fileparser_NextLine(cmfp->efp)) == eslOK)
1595     {
1596       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tag, NULL))     != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "Premature end of line");
1597 
1598       if (strcmp(tag, "NAME") == 0) {
1599 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "No name found on NAME line");
1600 	cm_SetName(cm, tok1);
1601       }
1602 
1603       else if (strcmp(tag, "ACC") == 0)  {
1604 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "No accession found on ACC line");
1605 	cm_SetAccession(cm, tok1);
1606       }
1607 
1608       else if (strcmp(tag, "DESC") == 0) {
1609 	if ((status = esl_fileparser_GetRemainingLine(cmfp->efp, &tok1))      != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "No description found on DESC line");
1610 	cm_SetDescription(cm, tok1);
1611       }
1612 
1613       else if (strcmp(tag, "STATES") == 0) {
1614 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "Number of model states not found on STATES line");
1615 	if ((cm->M = atoi(tok1))                                              == 0)  	  ESL_XFAIL(status,    cmfp->errbuf, "Invalid number of states %s on STATES line", tok1);
1616       }
1617 
1618       else if (strcmp(tag, "NODES") == 0) {
1619 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "Number of model nodes not found on NODES line");
1620 	if ((cm->nodes = atoi(tok1))                                          == 0)  	  ESL_XFAIL(status,    cmfp->errbuf, "Invalid number of nodes %s on NODES line", tok1);
1621       }
1622 
1623       else if (strcmp(tag, "CLEN") == 0) {
1624 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "No consensus length found on CLEN line");
1625 	if ((cm->clen = atoi(tok1))                                           == 0)   	  ESL_XFAIL(status,    cmfp->errbuf, "Invalid consensus length %s on CLEN line", tok1);
1626       }
1627 
1628       else if (strcmp(tag, "W") == 0) {
1629 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "No consensus length found on W line");
1630 	if ((cm->W = atoi(tok1))                                              == 0)   	  ESL_XFAIL(status,    cmfp->errbuf, "Invalid consensus length %s on W line", tok1);
1631       }
1632 
1633       else if (strcmp(tag, "ALPH") == 0) {
1634 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    cmfp->errbuf, "No alphabet type found on ALPH");
1635 	if ((alphatype = esl_abc_EncodeType(tok1))                        == eslUNKNOWN)  ESL_XFAIL(status,    cmfp->errbuf, "Unrecognized alphabet type %s", tok1);
1636 	if (*ret_abc == NULL) {
1637 	  if ((abc = esl_alphabet_Create(alphatype))                        == NULL) 	  ESL_XFAIL(eslEMEM,   cmfp->errbuf, "Failed to create alphabet");
1638 	} else {
1639 	  if ((*ret_abc)->type != alphatype)	                                          ESL_XFAIL(eslEINCOMPAT,cmfp->errbuf,"Alphabet type mismatch: was %s, but current CM says %s", esl_abc_DecodeType( (*ret_abc)->type), tok1);
1640 	  abc = *ret_abc;
1641 	}
1642       }
1643 
1644       else if (strcmp(tag, "RF") == 0) {
1645 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,    cmfp->errbuf, "No yes/no found for RF line");
1646 	if      (strcasecmp(tok1, "yes") == 0) cm->flags |= CMH_RF;
1647 	else if (strcasecmp(tok1, "no")  != 0)                                            ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "RF header line must say yes/no, not %s", tok1);
1648       }
1649 
1650       else if (strcmp(tag, "CONS") == 0) {
1651 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "No yes/no found for CONS line");
1652 	if      (strcasecmp(tok1, "yes") == 0) cm->flags |= CMH_CONS;
1653 	else if (strcasecmp(tok1, "no")  != 0)                                           ESL_XFAIL(eslEFORMAT,  cmfp->errbuf, "CONS header line must say yes/no, not %s", tok1);
1654       }
1655 
1656       else if (strcmp(tag, "MAP") == 0) {
1657 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "No yes/no found for MAP line");
1658 	if      (strcasecmp(tok1, "yes") == 0) cm->flags |= CMH_MAP;
1659 	else if (strcasecmp(tok1, "no")  != 0)                                            ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MAP header line must say yes/no, not %s", tok1);
1660       }
1661 
1662       else if (strcmp(tag, "DATE") == 0) {
1663 	if ((status = esl_fileparser_GetRemainingLine(cmfp->efp, &tok1))       != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "No date found on DATE line");
1664 	if (esl_strdup(tok1, -1, &(cm->ctime))                                 != eslOK)  ESL_XFAIL(eslEMEM,    cmfp->errbuf, "strdup() failed to set date");
1665       }
1666 
1667       else if (strcmp(tag, "COM") == 0) {
1668 	/* just skip the first token; it's something like [1], numbering the command lines */
1669 	if ((status = esl_fileparser_GetTokenOnLine  (cmfp->efp, &tok1, NULL)) != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "No command number on COM line");
1670 	if ((status = esl_fileparser_GetRemainingLine(cmfp->efp, &tok1))       != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "No command on COM line");
1671 	if (cm->comlog == NULL) {
1672 	  if (esl_strdup(tok1, -1, &(cm->comlog))                              != eslOK)  ESL_XFAIL(eslEMEM,    cmfp->errbuf, "esl_strdup() failed");
1673 	} else {
1674 	  if (esl_strcat(&(cm->comlog), -1, "\n", -1)                          != eslOK)  ESL_XFAIL(eslEMEM,    cmfp->errbuf, "esl_strcat() failed");
1675 	  if (esl_strcat(&(cm->comlog), -1, tok1,  -1)                         != eslOK)  ESL_XFAIL(eslEMEM,    cmfp->errbuf, "esl_strcat() failed");
1676 	}
1677       }
1678 
1679       else if (strcmp(tag, "PBEGIN") == 0) {
1680 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows PBEGIN tag");
1681 	if ((cm->pbegin = atof(tok1)) <= 0.0f)                                            ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on PBEGIN line: should be a positive real number, not %s", tok1);
1682       }
1683 
1684       else if (strcmp(tag, "PEND") == 0) {
1685 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows PEND tag");
1686 	if ((cm->pend = atof(tok1)) <= 0.0f)                                              ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on PEND line: should be a positive real number, not %s", tok1);
1687       }
1688 
1689       else if (strcmp(tag, "WBETA") == 0) {
1690 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows WBETA tag");
1691 	if ((cm->beta_W = atof(tok1)) <= 0.0f)                                            ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on WBETA line: should be a positive real number, not %s", tok1);
1692       }
1693 
1694       else if (strcmp(tag, "QDBBETA1") == 0) {
1695 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows QDBBETA tag");
1696 	if ((tmp_qdbbeta1 = atof(tok1)) <= 0.0f)                                          ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on QDBBETA line: should be a positive real number, not %s", tok1);
1697       }
1698 
1699       else if (strcmp(tag, "QDBBETA2") == 0) {
1700 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows QDBBETA tag");
1701 	if ((tmp_qdbbeta2 = atof(tok1)) <= 0.0f)                                          ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid beta on QDBBETA line: should be a positive real number, not %s", tok1);
1702       }
1703 
1704       else if (strcmp(tag, "N2OMEGA") == 0) {
1705 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows N2OMEGA tag");
1706 	if ((cm->null2_omega = atof(tok1)) <= 0.0f)                                       ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid omega on N2OMEGA line: should be a positive real number, not %s", tok1);
1707       }
1708 
1709       else if (strcmp(tag, "N3OMEGA") == 0) {
1710 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows N3OMEGA tag");
1711 	if ((cm->null3_omega = atof(tok1)) <= 0.0f)                                       ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid omega on N3OMEGA line: should be a positive real number, not %s", tok1);
1712       }
1713 
1714       else if (strcmp(tag, "ELSELF") == 0) {
1715 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows ELSELF tag");
1716 	cm->el_selfsc = atof(tok1);
1717 	read_el_selfsc = TRUE;
1718       }
1719 
1720       else if (strcmp(tag, "NSEQ") == 0) {
1721 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows NSEQ tag");
1722 	if ((cm->nseq = atoi(tok1)) == 0)                                                 ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid nseq on NSEQ line: should be integer, not %s", tok1);
1723       }
1724 
1725       else if (strcmp(tag, "EFFN") == 0) {
1726 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows EFFN tag");
1727 	if ((cm->eff_nseq = atof(tok1)) < (-1.*eslSMALLX1))                               ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid eff_nseq on EFFN line: should be a positive real number, not %s", tok1);
1728       }
1729 
1730       else if (strcmp(tag, "CKSUM") == 0) {
1731 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Nothing follows CKSUM tag");
1732 	cm->checksum = atoll(tok1); /* if atoi(), then you may truncate uint32_t checksums > 2^31-1 */
1733 	cm->flags |= CMH_CHKSUM;
1734       }
1735 
1736       else if (strcmp(tag, "NULL") == 0) {
1737 	if(abc == NULL) ESL_XFAIL(status,     cmfp->errbuf, "Read NULL line before ALPH line, ALPH line must come first");
1738 	ESL_ALLOC(tmp_null, sizeof(float) * abc->K);
1739 	for (x = 0; x < abc->K; x++) {
1740 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on NULL line");
1741 	  tmp_null[x] = ascii2prob(tok1, (1./(float) abc->K));
1742 	}
1743       }
1744 
1745       else if (strcmp(tag, "GA") == 0) {
1746 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on GA line");
1747 	cm->ga     = atof(tok1);
1748 	cm->flags |= CMH_GA;
1749       }
1750 
1751       else if (strcmp(tag, "TC") == 0) {
1752 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on TC line");
1753 	cm->tc     = atof(tok1);
1754 	cm->flags |= CMH_TC;
1755       }
1756 
1757       else if (strcmp(tag, "NC") == 0) {
1758 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on NC line");
1759 	cm->nc     = atof(tok1);
1760 	cm->flags |= CMH_NC;
1761       }
1762 
1763       else if (strcmp(tag, "EFP7GF") == 0) {
1764 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on EFP7GF line"); /* tau */
1765 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on EFP7GF line"); /* lambda   */
1766 	tmp_fp7_gfmu     = atof(tok1);
1767 	tmp_fp7_gflambda = atof(tok2);
1768 	read_fp7_stats = TRUE;
1769       }
1770 
1771       else if (strncmp(tag, "ECM", 3) == 0) { /* one of 4 possible CM E-value lines */
1772 	/* determine which one */
1773 	if      (strncmp(tag+3, "LC", 2) == 0) { exp_mode = EXP_CM_LC; cm_statstracker += 1; }
1774 	else if (strncmp(tag+3, "GC", 2) == 0) { exp_mode = EXP_CM_GC; cm_statstracker += 2; }
1775 	else if (strncmp(tag+3, "LI", 2) == 0) { exp_mode = EXP_CM_LI; cm_statstracker += 4; }
1776 	else if (strncmp(tag+3, "GI", 2) == 0) { exp_mode = EXP_CM_GI; cm_statstracker += 8; }
1777 	else                                   { ESL_XFAIL(status, cmfp->errbuf, "Invalid tag beginning with ECM"); }
1778 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on ECM.. line"); /* lambda    */
1779 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on ECM.. line"); /* mu_extrap */
1780 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok3, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on ECM.. line"); /* mu_orig   */
1781 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok4, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on ECM.. line"); /* dbsize    */
1782 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok5, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on ECM.. line"); /* nrandhits */
1783 	if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok6, NULL))   != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on ECM.. line"); /* tailp     */
1784 	if (cm->expA == NULL) {
1785 	  ESL_ALLOC(cm->expA, sizeof(ExpInfo_t *) * EXP_NMODES);
1786 	  for(x = 0; x < EXP_NMODES; x++) { cm->expA[x] = CreateExpInfo(); }
1787 	}
1788 	cm->expA[exp_mode]->lambda    = atof(tok1);
1789 	cm->expA[exp_mode]->mu_extrap = atof(tok2);
1790 	cm->expA[exp_mode]->mu_orig   = atof(tok3);
1791 	cm->expA[exp_mode]->dbsize    = atof(tok4); /* store as double, even though it was written as a long */
1792 	cm->expA[exp_mode]->nrandhits = atoi(tok5);
1793 	cm->expA[exp_mode]->tailp     = atof(tok6);
1794 	cm->expA[exp_mode]->is_valid  = TRUE;
1795       }
1796       else if (strcmp(tag, "CM") == 0) {
1797 	/* skip the remainder of this line */
1798 	if ((status = esl_fileparser_NextLine(cmfp->efp)) != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Premature end of data before main model section");
1799 	break;
1800       }
1801     } /* end, loop over possible header tags */
1802   if (status != eslOK) goto ERROR;
1803 
1804   /* Done reading the header information.
1805    * Check that everything is ok and mandatory info is present before moving on.
1806    */
1807   if (cm->M     < 1)      ESL_XFAIL(status, cmfp->errbuf, "Failed to read STATES line in header section");
1808   if (cm->nodes < 1)      ESL_XFAIL(status, cmfp->errbuf, "Failed to read NODES line in header section");
1809   if (cm->clen  < 1)      ESL_XFAIL(status, cmfp->errbuf, "Failed to read CLEN line in header section");
1810   if (cm->W     < 1)      ESL_XFAIL(status, cmfp->errbuf, "Failed to read W line in header section");
1811   if (! read_el_selfsc)   ESL_XFAIL(status, cmfp->errbuf, "Failed to read ELSELF line in header section");
1812   if (! read_fp7_stats)   ESL_XFAIL(status, cmfp->errbuf, "Failed to read EFP7GF line in header section");
1813   if (cm->name == NULL)   ESL_XFAIL(status, cmfp->errbuf, "Failed to read NAME line in header section");
1814   if (abc      == NULL)   ESL_XFAIL(status, cmfp->errbuf, "Failed to read ALPH line in header section");
1815   if (tmp_null == NULL)   ESL_XFAIL(status, cmfp->errbuf, "Failed to read NULL line in header section");
1816 
1817   /* Check to make sure we parsed CM E-value stats correctly.
1818    */
1819   if (cm->expA != NULL) {
1820     if      (cm_statstracker == 15) cm->flags |= CMH_EXPTAIL_STATS;
1821     else if (cm_statstracker != 0)  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Missing one or more ECM.. parameter lines");
1822   }
1823 
1824   /* Allocate body of CM now that # states (M) and # nodes (nnodes) are known */
1825   CreateCMBody(cm, cm->nodes, cm->M, cm->clen, abc);
1826 
1827   /* Copy values we stored in temp parameters awaiting CM allocation in CreateCMBody() */
1828   esl_vec_FCopy(tmp_null, abc->K, cm->null); /* cm->null allocated in CreateCMBody */
1829   cm->qdbinfo->beta1 = tmp_qdbbeta1;
1830   cm->qdbinfo->beta2 = tmp_qdbbeta2;
1831 
1832   /* Allocate and initialize temporary parameters for values we can't store
1833    * until after we've  read the full model and are able to construct an emit map */
1834   ESL_ALLOC(tmp_map_left,  sizeof(int) * cm->nodes);
1835   ESL_ALLOC(tmp_map_right, sizeof(int) * cm->nodes);
1836   esl_vec_ISet(tmp_map_left,  cm->nodes, -1);
1837   esl_vec_ISet(tmp_map_right, cm->nodes, -1);
1838 
1839   ESL_ALLOC(tmp_rf_left,  sizeof(char) * (cm->nodes+1));
1840   ESL_ALLOC(tmp_rf_right, sizeof(char) * (cm->nodes+1));
1841   tmp_rf_left[cm->nodes]  = '\0';
1842   tmp_rf_right[cm->nodes] = '\0';
1843 
1844   ESL_ALLOC(tmp_cons_left,  sizeof(char) * (cm->nodes+1));
1845   ESL_ALLOC(tmp_cons_right, sizeof(char) * (cm->nodes+1));
1846   tmp_cons_left[cm->nodes]  = '\0';
1847   tmp_cons_right[cm->nodes] = '\0';
1848 
1849   nd = -1;
1850   cm->clen = 0;
1851 
1852   for (v = 0; v < cm->M; v++)
1853     {
1854       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))     != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Premature end of data before main model section");
1855 
1856       /* Ah, a node line. Process it and get the following line.
1857        */
1858       if (*tok1 == '[')
1859 	{
1860 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 1);
1861 	  if ((x = NodeCode(tok1)) == -1)                                                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid node type %s", tok1);
1862 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 2);
1863 	  if (!is_integer(tok1))                                                         ESL_XFAIL(status,     cmfp->errbuf, "Invalid node index on node line: should be integer >= 0, not %s", tok1);
1864 	  nd = atoi(tok1);
1865 	  if (nd <  0)                                                                   ESL_XFAIL(status,     cmfp->errbuf, "Invalid node index on node line: should be integer >= 0, not %s", tok1);
1866 	  if (nd >= cm->nodes)                                                           ESL_XFAIL(status,     cmfp->errbuf, "Invalid node index on node line: should not exceed %d, read %s", cm->nodes, tok1);
1867 	  cm->ndtype[nd]  = x;
1868 	  if     (cm->ndtype[nd] == MATP_nd) cm->clen+=2;
1869 	  else if(cm->ndtype[nd] == MATL_nd) cm->clen++;
1870 	  else if(cm->ndtype[nd] == MATR_nd) cm->clen++;
1871 	  cm->nodemap[nd] = v;
1872 
1873 	  /* chew up ']' */
1874 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 3);
1875 
1876 	  /* read annotation: MAP, consensus sequence and RF. Proper format depends on node type.
1877 	   */
1878 	  /* MAP (optional: CMH_MAP? yes, else no */
1879 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 4);
1880 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 5);
1881 	  if      ((cm->flags & CMH_MAP) && cm->ndtype[nd] == MATP_nd) {
1882 	    if (!is_integer(tok1))                                                       ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st MAP value on MATP node line: should be positive integer, not %s", tok1);
1883 	    if (!is_integer(tok2))                                                       ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd MAP value on MATP node line: should be positive integer, not %s", tok2);
1884 	    tmp_map_left[nd]  = atoi(tok1);
1885 	    tmp_map_right[nd] = atoi(tok2);
1886 	  }
1887 	  else if((cm->flags & CMH_MAP) && cm->ndtype[nd] == MATL_nd) {
1888 	    if (!is_integer(tok1))                                                       ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st MAP value on MATL node line: should be positive integer, not %s", tok1);
1889 	    if (*tok2 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd MAP value on MATL node line: should be '-', not %s", tok2);
1890 	    tmp_map_left[nd]  = atoi(tok1);
1891 	    tmp_map_right[nd] = -1;
1892 	  }
1893 	  else if((cm->flags & CMH_MAP) && cm->ndtype[nd] == MATR_nd) {
1894 	    if (*tok1 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st MAP value on MATR node line: should be '-', not %s", tok1);
1895 	    if (!is_integer(tok2))                                                       ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd MAP value on MATR node line: should be positive integer, not %s", tok2);
1896 	    tmp_map_left[nd]  = -1;
1897 	    tmp_map_right[nd] = atoi(tok2);
1898 	  }
1899 	  else { /* either (! (cm->flags & CMH_MAP)) or ndtype is not MATP, MATL nor MATR */
1900 	    if (*tok1 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st MAP value on node line: should be '-', not %s", tok1);
1901 	    if (*tok2 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd MAP value on node line: should be '-', not %s", tok2);
1902 	    tmp_map_left[nd]  = -1;
1903 	    tmp_map_right[nd] = -1;
1904 	  }
1905 
1906 	  /* consensus sequence (optional: CMH_CONS? yes, else no */
1907 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 6);
1908 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 7);
1909 	  if     ((cm->flags & CMH_CONS) && (cm->ndtype[nd] == MATP_nd)) {
1910 	    tmp_cons_left[nd]  = *tok1;
1911 	    tmp_cons_right[nd] = *tok2;
1912 	  }
1913 	  else if((cm->flags & CMH_CONS) && cm->ndtype[nd] == MATL_nd) {
1914 	    if (*tok2 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd consensus character on MATL node line: should be '-', not %s", tok2);
1915 	    tmp_cons_left[nd]  = *tok1;
1916 	    tmp_cons_right[nd] = *tok2;
1917 	  }
1918 	  else if((cm->flags & CMH_CONS) && cm->ndtype[nd] == MATR_nd) {
1919 	    if (*tok1 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st consensus character on MATR node line: should be '-', not %s", tok1);
1920 	    tmp_cons_left[nd]  = *tok1;
1921 	    tmp_cons_right[nd] = *tok2;
1922 	  }
1923 	  else { /* either (! (cm->flags & CMH_CONS) or ndtype is not MATP, MATL nor MATR */
1924 	    if (*tok1 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st consensus character on node line: should be '-', not %s", tok1);
1925 	    if (*tok2 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd consensus character on node line: should be '-', not %s", tok2);
1926 	    tmp_cons_left[nd]  = *tok1;
1927 	    tmp_cons_right[nd] = *tok2;
1928 	  }
1929 
1930 	  /* RF (optional: CMH_RF? yes, else no */
1931 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 8);
1932 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok2, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on node line: expected %d, got %d", 10, 9);
1933 	  if      ((cm->flags & CMH_RF) && cm->ndtype[nd] == MATP_nd) {
1934 	    tmp_rf_left[nd]  = *tok1;
1935 	    tmp_rf_right[nd] = *tok2;
1936 	  }
1937 	  else if((cm->flags & CMH_RF) && cm->ndtype[nd] == MATL_nd) {
1938 	    if (*tok2 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd RF character on MATL node line: should be '-', not %s", tok2);
1939 	    tmp_rf_left[nd]  = *tok1;
1940 	    tmp_rf_right[nd] = *tok2;
1941 	  }
1942 	  else if((cm->flags & CMH_RF) && cm->ndtype[nd] == MATR_nd) {
1943 	    if (*tok1 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st RF character on MATR node line: should be '-', not %s", tok1);
1944 	    tmp_rf_left[nd]  = *tok1;
1945 	    tmp_rf_right[nd] = *tok2;
1946 	  }
1947 	  else { /* either (! (cm->flags & CMH_RF)) or ndtype is not MATP, MATL nor MATR */
1948 	    if (*tok1 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 1st RF character on node line: should be '-', not %s", tok1);
1949 	    if (*tok2 != '-')                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid 2nd RF character on node line: should be '-', not %s", tok2);
1950 	    tmp_rf_left[nd]  = *tok1;
1951 	    tmp_rf_right[nd] = *tok2;
1952 	  }
1953 	  if ((status = esl_fileparser_NextLine(cmfp->efp))                    != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Premature end of data in main model: no state %d line", v);
1954 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state line: expected at least %d, got %d", 8, 0);
1955 	} /* done with node line */
1956 
1957       /* Process state line.
1958        * <statecode> <v> <plast> <pnum> <cfirst> <cnum> <dmin2> <dmin1> <dmax1> <dmax2> <transition probs (variable number)> <emission probs (variable number)>
1959        */
1960 
1961       /* <statecode> */
1962       if((cm->sttype[v] = StateCode(tok1)) == -1)                                        ESL_XFAIL(status,     cmfp->errbuf, "Invalid state type %s\n", tok1);
1963 
1964       /* <v> */
1965       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 1);
1966       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid state index on state line: should be integer >= 0, not %s", tok1);
1967       if (atoi(tok1) != v)                                                               ESL_XFAIL(status,     cmfp->errbuf, "Invalid state index on state line: should be %d, not %s", v, tok1);
1968 
1969       /* <plast> */
1970       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 2);
1971       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid plast value on state line: should be integer, not %s", tok1);
1972       cm->plast[v] = atoi(tok1);
1973 
1974       /* <pnum> */
1975       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 3);
1976       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid pnum value on state line: should be integer, not %s", tok1);
1977       cm->pnum[v] = atoi(tok1);
1978 
1979       /* <cfirst> */
1980       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 4);
1981       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid cfirst value on state line: should be integer, not %s", tok1);
1982       cm->cfirst[v] = atoi(tok1);
1983 
1984       /* <cnum> */
1985       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 5);
1986       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid cnum value on state line: should be integer, not %s", tok1);
1987       cm->cnum[v] = atoi(tok1);
1988 
1989       /* <dmin2> */
1990       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 6);
1991       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid dmin2 value on state line: should be integer, not %s", tok1);
1992       cm->qdbinfo->dmin2[v] = atoi(tok1);
1993 
1994       /* <dmin1> */
1995       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 6);
1996       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid dmin1 value on state line: should be integer, not %s", tok1);
1997       cm->qdbinfo->dmin1[v] = atoi(tok1);
1998 
1999       /* <dmax1> */
2000       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 7);
2001       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid dmax1 value on state line: should be integer, not %s", tok1);
2002       cm->qdbinfo->dmax1[v] = atoi(tok1);
2003 
2004       /* <dmax2> */
2005       if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8, 7);
2006       if (! is_integer(tok1))                                                            ESL_XFAIL(status,     cmfp->errbuf, "Invalid dmax2 value on state line: should be integer, not %s", tok1);
2007       cm->qdbinfo->dmax2[v] = atoi(tok1);
2008 
2009       /* Transition probabilities. */
2010       if (cm->sttype[v] != B_st) {
2011 	for (x = 0; x < cm->cnum[v]; x++) {
2012 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected at least %d, got %d", v, 8+cm->cnum[v], 8+x);
2013 	  if ((! is_real(tok1) && (*tok1 != '*')))                                           ESL_XFAIL(status,     cmfp->errbuf, "Invalid transition score %d on state line: should be real number or '*', not %s", x+1, tok1);
2014 	  cm->t[v][x] = ascii2prob(tok1, 1.);
2015 	}
2016       }
2017       /* Emission probabilities. */
2018       if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st || cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
2019 	for (x = 0; x < cm->abc->K; x++) {
2020 	  if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)     ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected %d, got %d", v, 8+cm->cnum[v]+cm->abc->K, 8+cm->cnum[v]+x);
2021 	  if ((! is_real(tok1) && (*tok1 != '*')))                                           ESL_XFAIL(status,     cmfp->errbuf, "Invalid emission score %d on state line: should be real number or '*', not %s", x+1, tok1);
2022 	  cm->e[v][x] = ascii2prob(tok1, cm->null[x]);
2023 	}
2024       }
2025       else if (cm->sttype[v] == MP_st) {
2026 	for (x = 0; x < cm->abc->K; x++) {
2027 	  for (y = 0; y < cm->abc->K; y++) {
2028 	    if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL)) != eslOK)   ESL_XFAIL(status,     cmfp->errbuf, "Too few fields on state %d line: expected %d, got %d", v, 8+cm->cnum[v]+(cm->abc->K*cm->abc->K), 8+cm->cnum[v]+(x*cm->abc->K)+y);
2029 	    if ((! is_real(tok1) && (*tok1 != '*')))                                         ESL_XFAIL(status,     cmfp->errbuf, "Invalid emission score %d on state line: should be real number or '*', not %s", (x*cm->abc->K)+y+1, tok1);
2030 	    cm->e[v][x*cm->abc->K+y] = ascii2prob(tok1, cm->null[x]*cm->null[y]);
2031 	  }
2032 	}
2033       }
2034       cm->ndidx[v] = nd;
2035       cm->stid[v]  = DeriveUniqueStateCode(cm->ndtype[nd], cm->sttype[v]);
2036       if ((status = esl_fileparser_NextLine(cmfp->efp))                            != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Premature end of data in main model: no state %d line", v+1);
2037     } /* end of loop over states */
2038   /* The closing // */
2039   if ((status = esl_fileparser_GetTokenOnLine(cmfp->efp, &tok1, NULL))        != eslOK)  ESL_XFAIL(status,     cmfp->errbuf, "Premature end of data: missing //?");
2040   if (strcmp(tok1, "//")                                                      != 0)      ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Expected closing //; found %s instead", tok1);
2041 
2042   /* Finally, read the filter HMM for this CM, unless we're explicitly told not to */
2043   if(read_fp7) {
2044     if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2045       if((fp7_offset = ftello(cmfp->f)) < 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "ftello() failed");
2046     }
2047     else {
2048       fp7_offset = -1; /* this is irrelevant if stdin or gzip mode, cm_p7_hmmfile_Read will ignore it */
2049     }
2050     if((status = cm_p7_hmmfile_Read(cmfp, abc, fp7_offset, &hmm)) != eslOK) goto ERROR;
2051     if((status = cm_SetFilterHMM(cm, hmm, tmp_fp7_gfmu, tmp_fp7_gflambda)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Unable to set filter HMM for CM");
2052     if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2053       /* now position the CM file to the end of the HMM we just read */
2054       if (fseeko(cmfp->f, ftello(cmfp->hfp->f), SEEK_SET) != 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "Failed to set position for CM file parser after reading filter HMMs");
2055     }
2056   }
2057 
2058   CMRenormalize(cm);
2059   cm->qdbinfo->setby = CM_QDBINFO_SETBY_CMFILE;
2060   cm->W_setby        = CM_W_SETBY_CMFILE;
2061 
2062   /* Create emit map now that we know the model architecture */
2063   cm->emap = CreateEmitMap(cm);
2064   if(cm->emap == NULL) ESL_XFAIL(eslEINVAL, cmfp->errbuf, "After reading complete model, failed to create an emit map");
2065 
2066   /* Use emit map to map the per-node annotation to per-consensus position */
2067   if (cm->flags & CMH_RF) {
2068     cm->rf[0] = ' ';
2069     for(nd = 0; nd < cm->nodes; nd++) {
2070       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cm->rf[cm->emap->lpos[nd]] = tmp_rf_left[nd];
2071       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cm->rf[cm->emap->rpos[nd]] = tmp_rf_right[nd];
2072     }
2073     cm->rf[cm->clen+1] = '\0';
2074   }
2075   if (cm->flags & CMH_CONS) {
2076     cm->consensus[0] = ' ';
2077     for(nd = 0; nd < cm->nodes; nd++) {
2078       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cm->consensus[cm->emap->lpos[nd]] = tmp_cons_left[nd];
2079       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cm->consensus[cm->emap->rpos[nd]] = tmp_cons_right[nd];
2080     }
2081     cm->consensus[cm->clen+1] = '\0';
2082   }
2083   if (cm->flags & CMH_MAP) {
2084     cm->map[0] = 0;
2085     for(nd = 0; nd < cm->nodes; nd++) {
2086       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) cm->map[cm->emap->lpos[nd]] = tmp_map_left[nd];
2087       if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) cm->map[cm->emap->rpos[nd]] = tmp_map_right[nd];
2088     }
2089   }
2090 
2091   if(tmp_null  != NULL) free(tmp_null);
2092 
2093   /* these get allocated regardless of flag status, free them */
2094   free(tmp_rf_left);
2095   free(tmp_rf_right);
2096   free(tmp_cons_left);
2097   free(tmp_cons_right);
2098   free(tmp_map_left);
2099   free(tmp_map_right);
2100 
2101   if (*ret_abc == NULL) *ret_abc = abc;
2102   if ( opt_cm != NULL)  *opt_cm = cm; else FreeCM(cm);
2103   return eslOK;
2104 
2105  ERROR:
2106   if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc);
2107   if (cm       != NULL) FreeCM(cm);
2108   if (opt_cm   != NULL) *opt_cm = NULL;
2109   if      (status == eslEMEM || status == eslESYS) return status;
2110   else if (status == eslEOF)                       return status;
2111   else if (status == eslEINCOMPAT)                 return status;
2112   else                                             return eslEFORMAT;	/* anything else is a format error: includes premature EOF, EOL, EOD  */
2113 }
2114 
2115 
2116 static int
read_bin_1p1_cm(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)2117 read_bin_1p1_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm)
2118 {
2119   ESL_ALPHABET *abc = NULL;
2120   CM_t         *cm = NULL;
2121   P7_HMM       *hmm = NULL;
2122   uint32_t      magic;
2123   int           alphabet_type;
2124   int           v, x;
2125   off_t         offset = 0;
2126   off_t         fp7_offset = 0;
2127   int           status;
2128   float         tmp_fp7_gfmu;
2129   float         tmp_fp7_gflambda;
2130 
2131   cmfp->errbuf[0] = '\0';
2132   if (feof(cmfp->f))  { status = eslEOF;       goto ERROR; }
2133 
2134   if (cmfp->newly_opened)
2135     {
2136       offset = 0;
2137       cmfp->newly_opened = FALSE;
2138     }
2139   else
2140     {  /* Check magic. */
2141       if ((!cmfp->do_stdin) && (! cmfp->do_gzip)) {
2142 	if ((offset = ftello(cmfp->f)) < 0)                          ESL_XEXCEPTION(eslESYS, "ftello() failed");
2143       }
2144       if (! fread((char *) &magic, sizeof(uint32_t), 1, cmfp->f))    { status = eslEOF;       goto ERROR; }
2145 
2146       if (cmfp->format == CM_FILE_1a) { if (magic != v1a_magic)  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "bad magic number at start of CM");  }
2147       else                                                       ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "no such CM file format code");
2148     }
2149 
2150   /* Allocate shell of the new CM.
2151    * Two-step allocation lets us read/set the flags first;
2152    * then the later CreateBody() call will allocate optional internal fields we need.
2153    */
2154   if ((cm = CreateCMShell()) == NULL)   ESL_XFAIL(eslEMEM,    cmfp->errbuf, "allocation failure, CM shell");
2155   cm->offset = offset;
2156 
2157   /* Get sizes of things */
2158   if (! fread((char *) &(cm->flags),     sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read flags");
2159 
2160   /* Important step that's easy to overlook: lower flags that may have
2161    * been up when CM was output to file, but that won't be true of the
2162    * CM we're about to read (since not all CM parameters go into the
2163    * file).
2164    */
2165   cm->flags &= ~CMH_BITS;
2166   cm->flags &= ~CMH_CP9;
2167   cm->flags &= ~CMH_CP9_TRUNC;
2168   cm->flags &= ~CMH_MLP7;
2169   cm->flags &= ~CM_IS_CONFIGURED;
2170 
2171   if (! fread((char *) &(cm->M),         sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read number of states");
2172   if (! fread((char *) &(cm->nodes),     sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read number of nodes");
2173   if (! fread((char *) &(cm->clen),      sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read consensus length");
2174   if (! fread((char *) &alphabet_type,   sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read alphabet_type");
2175 
2176   /* Set or verify alphabet. */
2177   if (*ret_abc == NULL)	{	/* still unknown: set it, pass control of it back to caller */
2178     if ((abc = esl_alphabet_Create(alphabet_type)) == NULL)     ESL_XFAIL(eslEMEM, cmfp->errbuf, "allocation failed, alphabet");
2179   } else {			/* already known: check it */
2180     abc = *ret_abc;
2181     if (abc->type != alphabet_type)                             ESL_XFAIL(eslEINCOMPAT, cmfp->errbuf, "Alphabet type mismatch: was %s, but current HMM says %s", esl_abc_DecodeType( abc->type), esl_abc_DecodeType(alphabet_type));
2182   }
2183 
2184   /* Finish the allocation of the CM
2185    */
2186   CreateCMBody(cm, cm->nodes, cm->M, cm->clen, abc);
2187 
2188   /* Core model probabilities. */
2189   if (! fread((char *) cm->sttype,         sizeof(char), cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read sttype array");
2190   if (! fread((char *) cm->ndidx,          sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read ndidx array");
2191   if (! fread((char *) cm->stid,           sizeof(char), cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read stid array");
2192   if (! fread((char *) cm->cfirst,         sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read cfirst array");
2193   if (! fread((char *) cm->cnum,           sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read cnum array");
2194   if (! fread((char *) cm->plast,          sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read plast array");
2195   if (! fread((char *) cm->pnum,           sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read pnum array");
2196   if (! fread((char *) cm->nodemap,        sizeof(int),  cm->nodes,  cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read nodemap array");
2197   if (! fread((char *) cm->ndtype,         sizeof(char), cm->nodes,  cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read ndtype array");
2198   if (! fread((char *) cm->qdbinfo->dmin1, sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmin1 array");
2199   if (! fread((char *) cm->qdbinfo->dmax1, sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmax1 array");
2200   if (! fread((char *) cm->qdbinfo->dmin2, sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmin2 array");
2201   if (! fread((char *) cm->qdbinfo->dmax2, sizeof(int),  cm->M,      cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read dmax2 array");
2202 
2203   cm->qdbinfo->setby = CM_QDBINFO_SETBY_CMFILE;
2204   cm->W_setby        = CM_W_SETBY_CMFILE;
2205 
2206   /* Create emit map now that we know the model architecture */
2207   cm->emap = CreateEmitMap(cm);
2208   if(cm->emap == NULL) ESL_XFAIL(eslEINVAL, cmfp->errbuf, "After reading complete model, failed to create an emit map");
2209 
2210   for (v = 0; v < cm->M; v++) {
2211     if (! fread((char *) cm->t[v], sizeof(float), MAXCONNECT,           cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read transitions for state %d", v);
2212     if (! fread((char *) cm->e[v], sizeof(float), cm->abc->K*cm->abc->K,cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read emissions for state %d", v);
2213   }
2214 
2215   /* Annotations. */
2216   if (read_bin_string(cmfp->f, &(cm->name)) != eslOK)                                                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read name");
2217   if ((cm->flags & CMH_ACC)  && read_bin_string(cmfp->f, &(cm->acc))  != eslOK)                      ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read acc");
2218   if ((cm->flags & CMH_DESC) && read_bin_string(cmfp->f, &(cm->desc)) != eslOK)                      ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read desc");
2219   if ((cm->flags & CMH_RF)   && ! fread((char *) cm->rf,        sizeof(char), cm->clen+2, cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read rf");        /* +2: 1..M and trailing \0 */
2220   if ((cm->flags & CMH_CONS) && ! fread((char *) cm->consensus, sizeof(char), cm->clen+2, cmfp->f))  ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read consensus"); /* don't need to test for >=3e format, because the flag is sufficient (didn't exist pre-3e) */
2221   if ((cm->flags & CMH_MAP)  && ! fread((char *) cm->map, sizeof(int), cm->clen+1, cmfp->f))         ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read map");
2222   if (! fread((char *) &(cm->W),       sizeof(int),   1, cmfp->f))                                   ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read nseq");
2223 
2224   if (read_bin_string(cmfp->f, &(cm->ctime))  != eslOK)                                              ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read ctime");
2225   if (read_bin_string(cmfp->f, &(cm->comlog)) != eslOK)                                              ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read comlog");
2226 
2227   if (! fread((char *) &(cm->pbegin),         sizeof(float),    1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read beta_W");
2228   if (! fread((char *) &(cm->pend),           sizeof(float),    1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read beta_W");
2229   if (! fread((char *) &(cm->beta_W),         sizeof(double),   1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read beta_W");
2230   if (! fread((char *) &(cm->qdbinfo->beta1), sizeof(double),   1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read betaqdb1");
2231   if (! fread((char *) &(cm->qdbinfo->beta2), sizeof(double),   1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read betaqdb2");
2232   if (! fread((char *) &(cm->null2_omega),    sizeof(float),    1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read null2_omega");
2233   if (! fread((char *) &(cm->null3_omega),    sizeof(float),    1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read null3_omega");
2234   if (! fread((char *) &(cm->el_selfsc),      sizeof(float),    1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read el_selfsc");
2235   if (! fread((char *) &(cm->nseq),           sizeof(int),      1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read nseq");
2236   if (! fread((char *) &(cm->eff_nseq),       sizeof(float),    1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read eff_nseq");
2237   if (! fread((char *) &(cm->checksum),       sizeof(uint32_t), 1,          cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read checksum");
2238   if (! fread((char *) cm->null,              sizeof(float),    cm->abc->K, cmfp->f))                ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read null vector");
2239 
2240   /* Rfam cutoffs */
2241   if ((cm->flags & CMH_GA) && (! fread((char *) &(cm->ga), sizeof(float), 1, cmfp->f)))              ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read GA cutoff");
2242   if ((cm->flags & CMH_TC) && (! fread((char *) &(cm->tc), sizeof(float), 1, cmfp->f)))              ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read TC cutoff");
2243   if ((cm->flags & CMH_NC) && (! fread((char *) &(cm->nc), sizeof(float), 1, cmfp->f)))              ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read NC cutoff");
2244 
2245   /* E-value parameters */
2246   if (cm->flags & CMH_FP7) { /* should always be true */
2247     if (! fread((char *) &tmp_fp7_gfmu,     sizeof(float), 1, cmfp->f))         ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read additional P7 E-value stats");
2248     if (! fread((char *) &tmp_fp7_gflambda, sizeof(float), 1, cmfp->f))         ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read additional P7 E-value stats");
2249   }
2250   if (cm->flags & CMH_EXPTAIL_STATS) {
2251     long dbsize_long;
2252     ESL_ALLOC(cm->expA, sizeof(ExpInfo_t *) * EXP_NMODES);
2253     for(x = 0; x < EXP_NMODES; x++) {
2254       cm->expA[x] = CreateExpInfo();
2255       if (! fread((char *) &(cm->expA[x]->lambda),    sizeof(double), 1, cmfp->f))        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2256       if (! fread((char *) &(cm->expA[x]->mu_extrap), sizeof(double), 1, cmfp->f))        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2257       if (! fread((char *) &(cm->expA[x]->mu_orig),   sizeof(double), 1, cmfp->f))        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2258       if (! fread((char *) &(dbsize_long),            sizeof(long),   1, cmfp->f))        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2259       cm->expA[x]->dbsize = (double) dbsize_long;
2260       if (! fread((char *) &(cm->expA[x]->nrandhits), sizeof(int),    1, cmfp->f))        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2261       if (! fread((char *) &(cm->expA[x]->tailp),     sizeof(double), 1, cmfp->f))        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
2262     }
2263   }
2264 
2265   /* Finally, read the filter HMM for this CM, unless we're explicitly asked not to. */
2266   if(read_fp7) {
2267     if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2268       if((fp7_offset = ftello(cmfp->f)) < 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "ftello() failed");
2269     }
2270     else {
2271       fp7_offset = -1; /* this is irrelevant if stdin or gzip mode, cm_p7_hmmfile_Read will ignore it */
2272     }
2273     if((status = cm_p7_hmmfile_Read(cmfp, abc, fp7_offset, &hmm)) != eslOK) goto ERROR;
2274     if((status = cm_SetFilterHMM(cm, hmm, tmp_fp7_gfmu, tmp_fp7_gflambda)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Unable to set filter HMM for CM");
2275     if(!cmfp->hfp->do_stdin && !cmfp->hfp->do_gzip) {
2276       /* now position the CM file to the end of the HMM we just read */
2277       if (fseeko(cmfp->f, ftello(cmfp->hfp->f), SEEK_SET) != 0) ESL_XFAIL(eslESYS, cmfp->errbuf, "Failed to set position for CM file parser after reading filter HMMs");
2278     }
2279   }
2280 
2281   if (*ret_abc == NULL) *ret_abc = abc;	/* pass our new alphabet back to caller, if caller didn't know it already */
2282   if ( opt_cm != NULL)  *opt_cm  = cm;  else FreeCM(cm);
2283   return eslOK;
2284 
2285  ERROR:
2286   if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc); /* the test is for an alphabet created here, not passed */
2287   if (cm     != NULL) FreeCM(cm);
2288   if (opt_cm != NULL) *opt_cm = NULL;
2289   return status;
2290 }
2291 
2292 /* read_asc_1p0_cm(): inputting version 1.0-->1.0.2 CM file format.
2293  *
2294  * This function was stolen from Infernal v1.0.2's
2295  * cm_io.c:read_ascii_cm() (which is identical to the function of the
2296  * same time in v1.0 and v1.0.1.), and minmally changed to work in the
2297  * context of new CM_FILE data structure. This version also includes
2298  * better error message handling.
2299  *
2300  * Note that the value of <read_fp7> is irrelevant. It is only
2301  * in the prototype so that read_asc_1p0_cm is consistent with
2302  * other parser prototypes.
2303  */
2304 static int
read_asc_1p0_cm(CM_FILE * cmfp,int read_fp7,ESL_ALPHABET ** ret_abc,CM_t ** opt_cm)2305 read_asc_1p0_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_cm)
2306 {
2307   int     status;
2308   CM_t   *cm;
2309   char   *buf;
2310   int     n;			/* length of buf */
2311   char   *s;
2312   int     M,N;			/* number of states, nodes in model */
2313   int     v,x,y,nd;		/* counters for states, events, nodes */
2314   char   *tok;
2315   int     toklen;
2316   int     exp_flags[EXP_NMODES]; /* keep track of which exp tails we've read */
2317   int     exp_mode;             /* index of exp tail info               */
2318   int     have_exps;            /* for checking we get 0 or all exp tails*/
2319   int     have_ga = FALSE;      /* we have GA cutoff, needed b/c we can't set cm->flags until after CreateCMBody() call */
2320   int     have_tc = FALSE;      /* we have TC cutoff, needed b/c we can't set cm->flags until after CreateCMBody() call */
2321   int     have_nc = FALSE;      /* we have NC cutoff, needed b/c we can't set cm->flags until after CreateCMBody() call */
2322   int     p;                    /* counter for partitions          */
2323   int     i;                    /* counter over exp_modes for exp tails */
2324   int     alphabet_type;        /* type of ESL_ALPHABET */
2325   ESL_ALPHABET *abc = NULL;
2326   int     read_nstates = FALSE; /* TRUE once we've read the number of states */
2327   int     read_nnodes  = FALSE; /* TRUE once we've read the number of nodes */
2328   int     read_atype   = FALSE; /* TRUE once we've read the alphabet type */
2329   int     read_clen = FALSE;
2330   int     evalues_are_invalid = FALSE;  /* TRUE if the PART line reports more than 1 partition */
2331   int     clen = 0;
2332   int     npartitions = 0;
2333   off_t   offset = 0;
2334 
2335   cm  = NULL;
2336   buf = NULL;
2337   n   = 0;
2338   for(i = 0; i < EXP_NMODES; i++)  exp_flags[i] = FALSE;
2339 
2340   cmfp->errbuf[0] = '\0';
2341 
2342   if (cmfp->newly_opened)
2343     {
2344       offset            = 0;
2345       cmfp->newly_opened = FALSE;
2346     }
2347   else
2348     {
2349       /* Record where this CM starts on disk */
2350       if ((! cmfp->do_stdin) && (! cmfp->do_gzip) && (offset = ftello(cmfp->f)) < 0)   ESL_XEXCEPTION(eslESYS, "ftello() failed");
2351 
2352       /* First line of file: "INFERNAL-1" */
2353       if (feof(cmfp->f) || esl_fgets(&buf, &n, cmfp->f) != eslOK) { /* end of file, free buf and return eslEOF */
2354 	if(buf != NULL) free(buf);
2355 	return eslEOF;
2356       }
2357       if      (cmfp->format == CM_FILE_1)  { if (strncmp(buf, "INFERNAL-1", 10) != 0)    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Didn't find INFERNAL-1 tag: bad format or not an INFERNAL save file?"); }
2358       else                                                                              ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No such CM file format code: this shouldn't happen");
2359     }
2360 
2361 
2362   /* Parse the header information
2363    * These are all tag/value.
2364    * Ignore unknown tags (forward compatibility).
2365    */
2366   cm = CreateCMShell();
2367   M  = N = -1;
2368   while (esl_fgets(&buf, &n, cmfp->f) != eslEOF)
2369     {
2370       s   = buf;
2371       if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end of data, prior to MODEL section");
2372       else if (strcmp(tok, "NAME") == 0)
2373 	{
2374 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No name found on NAME line");
2375 	  if ((status = esl_strdup(tok, toklen, &(cm->name)))             != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Problem setting name for CM");
2376 	}
2377       else if (strcmp(tok, "ACC") == 0)
2378 	{
2379 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No accession found on ACC line");
2380 	  if ((status = esl_strdup(tok, toklen, &(cm->acc)))              != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Problem setting accession for CM");
2381 	}
2382       else if (strcmp(tok, "DESC") == 0)
2383 	{
2384 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No description found on DESC line");
2385 	  if ((status = esl_strdup(tok, toklen, &(cm->desc)))             != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Problem setting description for CM");
2386 	}
2387       else if (strcmp(tok, "GA") == 0)
2388 	{
2389 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No GA threshold found on GA line");
2390 	  cm->ga = atof(tok);
2391 	  have_ga = TRUE;
2392 	}
2393       else if (strcmp(tok, "TC") == 0)
2394 	{
2395 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No TC threshold found on TC line");
2396 	  cm->tc = atof(tok);
2397 	  have_tc = TRUE;
2398 	}
2399       else if (strcmp(tok, "NC") == 0)
2400 	{
2401 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No NC threshold found on NC line");
2402 	  cm->nc = atof(tok);
2403 	  have_nc = TRUE;
2404 	}
2405       else if (strcmp(tok, "STATES") == 0)
2406 	{
2407 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No number of STATES found on STATES line");
2408 	  M = atoi(tok);
2409 	  read_nstates = TRUE;
2410 	}
2411       else if (strcmp(tok, "NODES") == 0)
2412 	{
2413 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No number of NODES found on NODES line");
2414 	  N = atoi(tok);
2415 	  read_nnodes = TRUE;
2416 	}
2417       else if (strcmp(tok, "ALPHABET") == 0)
2418 	{
2419 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No alphabet type found on ALPHABET line");
2420 	  alphabet_type = atoi(tok);
2421 	  /* Set or verify alphabet. */
2422 	  if (*ret_abc == NULL)	{	/* still unknown: set it, pass control of it back to caller */
2423 	    if ((abc = esl_alphabet_Create(alphabet_type)) == NULL)       ESL_XFAIL(eslEMEM, cmfp->errbuf, "Failed to create alphabet");
2424 	  } else {			/* already known: check it */
2425 	    abc = *ret_abc;
2426 	    if ((*ret_abc)->type != alphabet_type)                        ESL_XFAIL(eslEINCOMPAT,cmfp->errbuf,"Alphabet type mismatch");
2427 	  }
2428 	  read_atype = TRUE;
2429 	}
2430       else if (strcmp(tok, "ELSELF") == 0)
2431 	{
2432 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No ELSELF score found on ELSELF line");
2433 	  cm->el_selfsc = atof(tok);
2434 	}
2435       else if (strcmp(tok, "WBETA") == 0)
2436 	{
2437 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No WBETA found on WBETA line");
2438 	  cm->beta_W = (double) atof(tok);
2439 	}
2440       else if (strcmp(tok, "NSEQ") == 0)
2441 	{
2442 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No name found on NAME line");
2443 	  cm->nseq = atoi(tok);
2444 	}
2445       else if (strcmp(tok, "EFFNSEQ") == 0)
2446 	{
2447 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No effective seq number found on EFFNSEQ line");
2448 	  cm->eff_nseq = atof(tok);
2449 	}
2450       else if (strcmp(tok, "CLEN") == 0)
2451 	{
2452 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No consensus length found on CLEN line");
2453 	  clen = atoi(tok); /* we'll compare this to what we calculate at end of func */
2454 	  read_clen = TRUE;
2455 	  /* Now we have the clen and we should have the alphabet type and N and M, so we can build the
2456 	   * full model, and set the alphabet (which we need to do before alloc'ing/setting
2457 	   * the null model */
2458 	  if(! (read_nstates && read_nnodes && read_atype))
2459 	    {
2460 	      ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "ERROR, ALPHABET, STATES and NODES lines should precede CLEN line");
2461 	    }
2462 	  CreateCMBody(cm, N, M, clen, abc);
2463 	}
2464       /* comlog and ctime info. Careful, we want the full line, so a token becomes a full line
2465        * Also we stored this data differently in version 1.0, we have to throw out the CDATE
2466        * info, we don't store that anymore.
2467        */
2468       else if (strcmp(tok, "BCOM") == 0)
2469 	{
2470 	  while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2471 	  if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No build command found on BCOM line");
2472 	  if(cm->comlog == NULL) {
2473 	    if((status = esl_strdup(tok, toklen, &(cm->comlog))) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting build date");
2474 	  }
2475 	  else {
2476 	    if((status = esl_strcat(&(cm->comlog), -1,"\n",      1)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting build date");
2477 	    if((status = esl_strcat(&(cm->comlog), -1, tok, toklen)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting build date");
2478 	  }
2479 	}
2480       else if (strcmp(tok, "BDATE") == 0)
2481 	{
2482 	  while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2483 	  if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No date found on BDATE line");
2484 	  if (esl_strdup(tok, toklen, &(cm->ctime))                    != eslOK) ESL_XFAIL(eslEMEM,    cmfp->errbuf, "strdup() failed to set date");
2485 	}
2486       else if (strcmp(tok, "CCOM") == 0)
2487 	{
2488 	  while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2489 	  if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No calibrate command found on CCOM line");
2490 	  if(cm->comlog == NULL) {
2491 	    if((status = esl_strdup(tok, toklen, &(cm->comlog))) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting calibrate date");
2492 	  }
2493 	  else {
2494 	    if((status = esl_strcat(&(cm->comlog), -1,"\n",      1)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting calibrate date");
2495 	    if((status = esl_strcat(&(cm->comlog), -1, tok, toklen)) != eslOK) ESL_XFAIL(status, cmfp->errbuf, "Problem setting calibrate date");
2496 	  }
2497 	}
2498       else if (strcmp(tok, "CDATE") == 0)
2499 	{
2500 	  while(isspace((int) (*s))) s++; /* chew up leading whitespace */
2501 	  if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No date found on CDATE line");
2502 	  /* we don't store this anymore */
2503 	}
2504       else if (strcmp(tok, "NULL") == 0)
2505 	{
2506 	  if(cm->abc == NULL) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "NULL line must be preceded by ALPHABET line");
2507 	  /* cm-> null already allocated in CreateCMBody() */
2508 	  for (x = 0; x < abc->K; x++)
2509 	    {
2510 	      if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Problem reading NULL line");
2511 	      cm->null[x] = ascii2prob(tok, (1./(float) abc->K));
2512 	    }
2513 	}
2514       /* exp tail distribution information */
2515       else if (strcmp(tok, "PART") == 0)
2516 	{
2517 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No number of partitions on PART line");
2518 	  if (! is_integer(tok))                                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "number of partitions is not an integer on PART line");
2519 	  npartitions = atoi(tok);
2520 	  if(npartitions != 1) {
2521 	    evalues_are_invalid = TRUE;
2522 	    /* we can't deal with more than 1 partition in the current codebase, if there are more, throw away any E-value parameters we read */
2523 	  }
2524 	  /* else 1 partition, we can handle this (nearly all infernal
2525 	   * 1.0--1.0.2 files should have 1 partitions, you could
2526 	   * only create a multi-partition file with the undocument
2527 	   * --exp-pfile option to cmcalibrate). */
2528 
2529 	  /* Ignore the rest of this line, it includes partition start/stop info that's no longer parsed */
2530 	  if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid PART- line");
2531 	}
2532       /* exp tail info */
2533       else if ((strncmp(tok, "E-", 2) == 0) && (! evalues_are_invalid)) /* skip the E- lines if we've read more than one partition above */
2534 	{
2535 	  /* determine which exp tail we're reading */
2536 	  if      (strncmp(tok+2, "LC", 2) == 0) exp_mode = EXP_CM_LC;
2537 	  else if (strncmp(tok+2, "GC", 2) == 0) exp_mode = EXP_CM_GC;
2538 	  else if (strncmp(tok+2, "LI", 2) == 0) exp_mode = EXP_CM_LI;
2539 	  else if (strncmp(tok+2, "GI", 2) == 0) exp_mode = EXP_CM_GI;
2540 	  else if (strncmp(tok+2, "LV", 2) == 0) continue; /* cp9 HMM  local viterbi, irrelevant in current format */
2541 	  else if (strncmp(tok+2, "GV", 2) == 0) continue; /* cp9 HMM glocal viterbi, irrelevant in current format */
2542 	  else if (strncmp(tok+2, "LF", 2) == 0) continue; /* cp9 HMM  local forward, irrelevant in current format */
2543 	  else if (strncmp(tok+2, "GF", 2) == 0) continue; /* cp9 HMM glocal forward, irrelevant in current format */
2544 	  else ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid xx on E-xx line");
2545 
2546 	  /* create the expA array if this is the first E- line we've read */
2547 	  if (cm->expA == NULL) {
2548 	    ESL_ALLOC(cm->expA, sizeof(ExpInfo_t *) * EXP_NMODES);
2549 	    for(x = 0; x < EXP_NMODES; x++) { cm->expA[x] = CreateExpInfo(); }
2550 	  }
2551 
2552 	  /* now we know what exp tail we're reading, read it */
2553 	  /* chew up partition */
2554 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No partition read on E-xx line");
2555 	  p = atoi(tok);
2556 	  if (p != 0)                                            ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid partition on E-xx line");
2557 
2558 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No lambda read on E-xx line");
2559 	  if (! is_real(tok))                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "lambda not real number on E-xx line");
2560 	  cm->expA[exp_mode]->lambda = atof(tok);
2561 
2562 	  if ((esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No mu_extrap read on E-xx line");
2563 	  if (! is_real(tok))                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "mu_extrap not real number on E-xx line");
2564 	  cm->expA[exp_mode]->mu_extrap = atof(tok);
2565 
2566 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No mu_orig read on E-xx line");
2567 	  if (! is_real(tok))                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "mu_orign is not real number on E-xx line");
2568 	  cm->expA[exp_mode]->mu_orig = atof(tok);
2569 
2570 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No dbsize read on E-xx line");
2571 	  if (! is_integer(tok))                                 ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "dbsize is not integer on E-xx line");
2572 	  cm->expA[exp_mode]->dbsize = atof(tok); /* read it as a double, even though it's written as a long */
2573 
2574 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No nrandhits read on E-xx line");
2575 	  if (! is_integer(tok))                                 ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "nrandhits is not integer on E-xx line");
2576 	  cm->expA[exp_mode]->nrandhits = atoi(tok);
2577 
2578 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No tailp read on E-xx line");
2579 	  if (! is_real(tok))                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "tailp is not real number on E-xx line");
2580 	  cm->expA[exp_mode]->tailp = atof(tok);
2581 
2582 	  cm->expA[exp_mode]->cur_eff_dbsize = (double) (cm->expA[exp_mode]->nrandhits);
2583 	  /* Previous line is to set cur_eff_dbsize as if database was of size cm->stats->expAA[p]->dbsize, we
2584 	   * act as if the max hits we'll see is nrandhits, the number of hits we saw in cmcalibrate,
2585 	   * so this is the highest possible E-value we can get.
2586 	   * cur_eff_dbsize will be updated in cmsearch for whatever the target database size is. */
2587 	  cm->expA[exp_mode]->is_valid = TRUE; /* set valid flag */
2588 	  exp_flags[exp_mode] = TRUE;
2589 	}
2590       else if (strncmp(tok, "FT-", 3) == 0)
2591 	{
2592 	  /* filter thresholds statistics are deprecated, chew up the rest ot this line, and following two lines */
2593 	  if ((status = esl_strtok_adv(&s, "\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid FT- line");
2594 	  if (esl_fgets(&buf, &n, cmfp->f) == eslEOF) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end to file, while reading FT- line 2");
2595 	  if (esl_fgets(&buf, &n, cmfp->f) == eslEOF) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end to file, while reading FT- line 3");
2596 	}
2597       else if (strcmp(tok, "MODEL:") == 0)
2598 	break;
2599     }
2600 
2601   /* Done reading the header information.
2602    * Check that everything is ok and mandatory info is present before moving on.
2603    */
2604   if (feof(cmfp->f))      ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Premature end to CM file, file truncated?");
2605   if (! read_nstates)     ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes STATES line");
2606   if (! read_nnodes)      ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes NODES line");
2607   if (! read_clen)        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes CLEN line");
2608   if (! read_atype)       ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes ALPHABET line");
2609   if (cm->name == NULL)   ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "MODEL: line precedes NAME line");
2610 
2611   /* if we have any exp tail stats, we (currently) require all of them */
2612   have_exps = exp_flags[0];
2613   for(exp_mode = 1; exp_mode < EXP_NMODES; exp_mode++)
2614     if(((have_exps && (!exp_flags[exp_mode]))) ||
2615        ((!have_exps) && (exp_flags[exp_mode])))
2616       ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Some but not all E-xx lines read, expected 8");
2617 
2618   /* Main model section.
2619    */
2620   CMZero(cm);
2621   if(have_exps)  cm->flags |= CMH_EXPTAIL_STATS;
2622   if(have_ga)    cm->flags |= CMH_GA;
2623   if(have_tc)    cm->flags |= CMH_TC;
2624   if(have_nc)    cm->flags |= CMH_NC;
2625   nd = -1;
2626   clen = 0;
2627   for (v = 0; v < cm->M; v++)
2628     {
2629       if ((status = esl_fgets(&buf, &n, cmfp->f)) != eslOK)                     ESL_XFAIL(status,     cmfp->errbuf, "Premature end of data before main model section");
2630       s = buf;
2631       if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(status,     cmfp->errbuf, "Premature end of data before main model section");
2632 
2633       /* Ah, a node line. Process it and get the following line.
2634        */
2635       if (*tok == '[')
2636 	{
2637 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on node line");
2638 	  if ((x = NodeCode(tok)) == -1)                                            ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid node type %s", tok);
2639 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on node line");
2640 	  if (!is_integer(tok))                                                     ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid node index on node line");
2641 	  nd = atoi(tok);
2642 	  cm->ndtype[nd]  = x;
2643 	  if(cm->ndtype[nd] == MATP_nd) clen+=2;
2644 	  else if(cm->ndtype[nd] == MATL_nd) clen++;
2645 	  else if(cm->ndtype[nd] == MATR_nd) clen++;
2646 	  cm->nodemap[nd] = v;
2647 
2648 	  if ((status = esl_fgets(&buf, &n, cmfp->f)) != eslOK)                     ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on NODE line");
2649 	  s = buf;
2650 	  if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Too few fields on NODE line");
2651 	}
2652 
2653       /* Process state line.
2654        */
2655       cm->sttype[v] = StateCode(tok);
2656       if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2657       if (! is_integer(tok))                                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2658       if (atoi(tok) != v)                                                       ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2659       if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2660       if (! is_integer(tok))                                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2661       cm->plast[v] = atoi(tok);
2662       if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2663       if (! is_integer(tok))                                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2664       cm->pnum[v] = atoi(tok);
2665       if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2666       if (! is_integer(tok))                                                    ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2667       cm->cfirst[v] = atoi(tok);
2668       if ((status= esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2669       if (! is_integer(tok))                                                   ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2670       cm->cnum[v] = atoi(tok);
2671 				/* Transition probabilities. */
2672       if (cm->sttype[v] != B_st)
2673 	{
2674 	  for (x = 0; x < cm->cnum[v]; x++)
2675 	    {
2676 	      if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2677 	      if (! is_real(tok) && *tok != '*')                                        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2678 	      cm->t[v][x] = ascii2prob(tok, 1.);
2679 	    }
2680 	}
2681 				/* Emission probabilities. */
2682       if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st ||
2683 	  cm->sttype[v] == IL_st || cm->sttype[v] == IR_st)
2684 	{
2685 	  for (x = 0; x < cm->abc->K; x++)
2686 	    {
2687 	      if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2688 	      if (! is_real(tok) && *tok != '*')                                        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2689 	      cm->e[v][x] = ascii2prob(tok, cm->null[x]);
2690 	    }
2691 	}
2692       else if (cm->sttype[v] == MP_st)
2693 	{
2694 	  for (x = 0; x < cm->abc->K; x++)
2695 	    for (y = 0; y < cm->abc->K; y++)
2696 	      {
2697 		if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2698 		if (! is_real(tok) && *tok != '*')                                        ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "Invalid state line for cm: %s state: %d", cm->name, v);
2699 		cm->e[v][x*cm->abc->K+y] = ascii2prob(tok, cm->null[x]*cm->null[y]);
2700 	      }
2701 	}
2702 
2703       cm->ndidx[v] = nd;
2704       cm->stid[v]  = DeriveUniqueStateCode(cm->ndtype[nd], cm->sttype[v]);
2705     } /* end of loop over states */
2706 
2707   /* Advance to record separator
2708    */
2709   while (esl_fgets(&buf, &n, cmfp->f) != eslEOF)
2710     if (strncmp(buf, "//", 2) == 0)
2711       break;
2712 
2713   /* EPN 10.29.06 Remove the sole source of CM ambiguities. Find and detach insert states
2714    *              that are 1 state before an END_E.  */
2715   cm_find_and_detach_dual_inserts(cm,
2716 				  FALSE, /* Don't check END_E-1 states have 0 counts, they may not if
2717 					  * an old version (0.7 or earlier) of cmbuild was used, or
2718 					  * cmbuild --nodetach  was used to build the CM  */
2719 				  TRUE); /* Detach the states by setting trans probs into them as 0.0   */
2720 
2721   /* check that the clen we calc'ed is the same as the CLEN line said */
2722   if (read_clen && clen != cm->clen)
2723     {
2724       ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "calculated consensus length %d does not equal read CLEN: %d.\n", clen, cm->clen);
2725     }
2726 
2727   /* Success.
2728    * Renormalize the CM, and return.
2729    */
2730   CMRenormalize(cm);
2731 
2732   if (buf != NULL) free(buf);
2733   if (*ret_abc == NULL) *ret_abc = abc;
2734   if ( opt_cm != NULL)  *opt_cm = cm; else FreeCM(cm);
2735   return eslOK;
2736 
2737  ERROR:
2738   if (buf != NULL) free(buf);
2739   if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc);
2740   if (cm      != NULL) FreeCM(cm);
2741   if (opt_cm != NULL) *opt_cm = NULL;
2742   if      (status == eslEMEM || status == eslESYS) return status;
2743   else if (status == eslEOF)                       return status;
2744   else if (status == eslEINCOMPAT)                 return status;
2745   else                                             return eslEFORMAT;	/* anything else is a format error: includes premature EOF, EOL, EOD  */
2746 }
2747 
2748 
2749 /* Below, read_asc30hmm(), etc. are EXACT copies from H3 p7_hmmfile.c
2750  * This isn't the right way to do it, but it's what we did.  Do NOT
2751  * edit these here. If you edit them in H3, you MUST bring those edits
2752  * here.  [SRE:24-Jan-17]
2753  */
2754 
2755 /* Parsing save files from HMMER 3.x
2756  * All parsers follow the same API.
2757  *
2758  * Returns <eslOK> on success, and if <opt_hmm> is non-NULL,
2759  * <*opt_hmm> points at a newly allocated HMM.
2760  *
2761  * Additionally, if <*ret_abc> was NULL, then a new alphabet is
2762  * allocated according to the alphabet type of this HMM, and returned
2763  * thru <ret_abc>.  This allocation mechanism allows a main()
2764  * application that doesn't yet know its alphabet to determine the
2765  * alphabet when the first HMM is read, while also allowing an
2766  * application to allocate its own alphabet and assure that the
2767  * input HMMs are appropriate for that alphabet.
2768  *
2769  * Returns <eslEOF> when no HMM remains in the file, indicating a
2770  * normal end-of-file.
2771  *
2772  * Two types of "normal error" may happen, which the caller must check
2773  * for. Returns <eslEFORMAT> on any save file format error, including
2774  * bad magic (i.e. this is not a HMMER file at all). Returns
2775  * <eslEINCOMPAT> if the expected alphabet (a non-<NULL> alphabet
2776  * specified by <*ret_abc>) does not match the alphabet type of the
2777  * HMM.
2778  *
2779  * When these normal errors occur, the caller can construct its error
2780  * message from:
2781  *    <hfp->errbuf>:    contains an informative error message
2782  *    <hfp->fname>:     name of the HMM file (or '-' if STDIN)
2783  * and if <hfp->efp> is non-<NULL>, the HMM file is in ASCII text,
2784  * and the caller may also use:
2785  *    <hfp->efp->linenumber>: line on which the parse error occurred.
2786  *
2787  * Throws:     <eslEMEM> on allocation error.
2788  *             <eslESYS> if a system i/o call fails.
2789  *             In cases of error (including both thrown error and normal error), <*ret_abc>
2790  *             is left in its original state as passed by the caller, and <*ret_hmm> is
2791  *             returned <NULL>.
2792  */
2793 static int
read_asc30hmm(P7_HMMFILE * hfp,ESL_ALPHABET ** ret_abc,P7_HMM ** opt_hmm)2794 read_asc30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm)
2795 {
2796   ESL_ALPHABET *abc  = NULL;
2797   P7_HMM       *hmm  = NULL;
2798   char         *tag  = NULL;
2799   char         *tok1 = NULL;
2800   char         *tok2 = NULL;
2801   char         *tok3 = NULL;
2802   char         *tok4 = NULL;
2803   int           alphatype;
2804   int           k,x;
2805   off_t         offset = 0;
2806   int           status;
2807   uint32_t      statstracker = 0;
2808 
2809   hfp->errbuf[0] = '\0';
2810 
2811   if (hfp->newly_opened)
2812     {
2813       offset            = 0;
2814       hfp->newly_opened = FALSE;
2815     }
2816   else
2817     {
2818       /* Record where this HMM starts on disk */
2819       if ((! hfp->do_stdin) && (! hfp->do_gzip) && (offset = ftello(hfp->f)) < 0)   ESL_XEXCEPTION(eslESYS, "ftello() failed");
2820 
2821       /* First line of file: "HMMER3/f". Allocate shell for HMM annotation information (we don't know K,M yet) */
2822       if ((status = esl_fileparser_NextLine(hfp->efp))                   != eslOK)  goto ERROR;  /* EOF here is normal; could also be a thrown EMEM */
2823       if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tag, NULL)) != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "unexpected absence of tokens on data line");
2824 
2825       if      (hfp->format == p7_HMMFILE_3f) { if (strcmp(tag, "HMMER3/f") != 0)     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/f tag: bad format or not a HMMER save file?"); }
2826       else if (hfp->format == p7_HMMFILE_3e) { if (strcmp(tag, "HMMER3/e") != 0)     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/e tag: bad format or not a HMMER save file?"); }
2827       else if (hfp->format == p7_HMMFILE_3d) { if (strcmp(tag, "HMMER3/d") != 0)     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/d tag: bad format or not a HMMER save file?"); }
2828       else if (hfp->format == p7_HMMFILE_3c) { if (strcmp(tag, "HMMER3/c") != 0)     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/c tag: bad format or not a HMMER save file?"); }
2829       else if (hfp->format == p7_HMMFILE_3b) { if (strcmp(tag, "HMMER3/b") != 0)     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/b tag: bad format or not a HMMER save file?"); }
2830       else if (hfp->format == p7_HMMFILE_3a) { if (strcmp(tag, "HMMER3/a") != 0)     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Didn't find HMMER3/a tag: bad format or not a HMMER save file?"); }
2831       else                                                                           ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No such HMM file format code: this shouldn't happen");
2832     }
2833 
2834   if ((hmm = p7_hmm_CreateShell())                                   == NULL)   ESL_XFAIL(eslEMEM,    hfp->errbuf, "allocation failure, HMM shell");
2835   hmm->offset = offset;
2836 
2837   /* Header section */
2838   while ((status = esl_fileparser_NextLine(hfp->efp)) == eslOK)
2839     {
2840       if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tag, NULL))     != eslOK)   ESL_XFAIL(status,    hfp->errbuf, "Premature end of line");
2841 
2842       if (strcmp(tag, "NAME") == 0) {
2843 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    hfp->errbuf, "No name found on NAME line");
2844 	p7_hmm_SetName(hmm, tok1);
2845       }
2846 
2847       else if (strcmp(tag, "ACC") == 0)  {
2848 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    hfp->errbuf, "No accession found on ACC line");
2849 	p7_hmm_SetAccession(hmm, tok1);
2850       }
2851 
2852       else if (strcmp(tag, "DESC") == 0) {
2853 	if ((status = esl_fileparser_GetRemainingLine(hfp->efp, &tok1))      != eslOK)   ESL_XFAIL(status,    hfp->errbuf, "No description found on DESC line");
2854 	p7_hmm_SetDescription(hmm, tok1);
2855       }
2856 
2857       else if (strcmp(tag, "LENG") == 0) {
2858 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    hfp->errbuf, "No model length found on LENG line");
2859 	if ((hmm->M = atoi(tok1))                                            == 0)     ESL_XFAIL(status,    hfp->errbuf, "Invalid model length %s on LENG line", tok1);
2860       }
2861 
2862       else if (hfp->format >= p7_HMMFILE_3c && strcmp(tag, "MAXL") == 0) {
2863 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    hfp->errbuf, "No max length found on MAXL line");
2864 	if ((hmm->max_length = atoi(tok1))                                   == 0)     ESL_XFAIL(status,    hfp->errbuf, "Invalid max length %s on MAXL line", tok1);
2865       }
2866 
2867       else if (strcmp(tag, "ALPH") == 0) {
2868 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))  != eslOK)   ESL_XFAIL(status,    hfp->errbuf, "No alphabet type found on ALPH");
2869 	if ((alphatype = esl_abc_EncodeType(tok1))                        == eslUNKNOWN) ESL_XFAIL(status,    hfp->errbuf, "Unrecognized alphabet type %s", tok1);
2870 	if (*ret_abc == NULL) {
2871 	  if ((abc = esl_alphabet_Create(alphatype))                        == NULL)    ESL_XFAIL(eslEMEM,   hfp->errbuf, "Failed to create alphabet");
2872 	} else {
2873 	  if ((*ret_abc)->type != alphatype)                                           ESL_XFAIL(eslEINCOMPAT,hfp->errbuf,"Alphabet type mismatch: was %s, but current HMM says %s", esl_abc_DecodeType( (*ret_abc)->type), tok1);
2874 	  abc = *ret_abc;
2875 	}
2876       }
2877 
2878       else if (strcmp(tag, "RF") == 0) {
2879 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,    hfp->errbuf, "No yes/no found for RF line");
2880 	if      (strcasecmp(tok1, "yes") == 0)
2881 	  hmm->flags |= p7H_RF;
2882 	else if (strcasecmp(tok1, "no")  != 0)                                           ESL_XFAIL(eslEFORMAT, hfp->errbuf, "RF header line must say yes/no, not %s", tok1);
2883       }
2884 
2885       else if (strcmp(tag, "MM") == 0) {
2886 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,    hfp->errbuf, "No yes/no found for MM line");
2887 	if      (strcasecmp(tok1, "yes") == 0)
2888 	  hmm->flags |= p7H_MMASK;
2889 	else if (strcasecmp(tok1, "no")  != 0)                                           ESL_XFAIL(eslEFORMAT, hfp->errbuf, "MM header line must say yes/no, not %s", tok1);
2890       }
2891 
2892 
2893       else if (strcmp(tag, "CONS") == 0) {
2894 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "No yes/no found for CONS line");
2895 	if (strcasecmp(tok1, "yes") == 0)
2896 	  hmm->flags |= p7H_CONS;
2897 	else if (strcasecmp(tok1, "no")  != 0)                                           ESL_XFAIL(eslEFORMAT, hfp->errbuf, "CONS header line must say yes/no, not %s", tok1);
2898       }
2899 
2900       else if (strcmp(tag, "CS") == 0) {
2901 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "No yes/no found for CS line");
2902 	if (strcasecmp(tok1, "yes") == 0)
2903 	  hmm->flags |= p7H_CS;
2904 	else if (strcasecmp(tok1, "no")  != 0)                                           ESL_XFAIL(eslEFORMAT, hfp->errbuf, "CS header line must say yes/no, not %s", tok1);
2905       }
2906 
2907       else if (strcmp(tag, "MAP") == 0) {
2908 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "No yes/no found for MAP line");
2909 	if      (strcasecmp(tok1, "yes") == 0)
2910 	  hmm->flags |= p7H_MAP;
2911 	else if (strcasecmp(tok1, "no")  != 0)                                           ESL_XFAIL(eslEFORMAT, hfp->errbuf, "MAP header line must say yes/no, not %s", tok1);
2912       }
2913 
2914       else if (strcmp(tag, "DATE") == 0) {
2915 	if ((status = esl_fileparser_GetRemainingLine(hfp->efp, &tok1))       != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "No date found on DATE line");
2916 	if (esl_strdup(tok1, -1, &(hmm->ctime))                               != eslOK)  ESL_XFAIL(eslEMEM,    hfp->errbuf, "strdup() failed to set date");
2917       }
2918 
2919       else if (strcmp(tag, "COM") == 0) {
2920 	/* just skip the first token; it's something like [1], numbering the command lines */
2921 	if ((status = esl_fileparser_GetTokenOnLine  (hfp->efp, &tok1, NULL)) != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "No command number on COM line");
2922 	if ((status = esl_fileparser_GetRemainingLine(hfp->efp, &tok1))       != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "No command on COM line");
2923 	if (hmm->comlog == NULL) {
2924 	  if (esl_strdup(tok1, -1, &(hmm->comlog))                            != eslOK)  ESL_XFAIL(eslEMEM,    hfp->errbuf, "esl_strdup() failed");
2925 	} else {
2926 	  if (esl_strcat(&(hmm->comlog), -1, "\n", -1)                        != eslOK)  ESL_XFAIL(eslEMEM,    hfp->errbuf, "esl_strcat() failed");
2927 	  if (esl_strcat(&(hmm->comlog), -1, tok1,  -1)                       != eslOK)  ESL_XFAIL(eslEMEM,    hfp->errbuf, "esl_strcat() failed");
2928 	}
2929       }
2930 
2931       else if (strcmp(tag, "NSEQ") == 0) {
2932 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Nothing follows NSEQ tag");
2933 	if ((hmm->nseq = atoi(tok1)) == 0)                                               ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Invalid nseq on NSEQ line: should be integer, not %s", tok1);
2934       }
2935 
2936       else if (strcmp(tag, "EFFN") == 0) {
2937 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Nothing follows EFFN tag");
2938 	if ((hmm->eff_nseq = atof(tok1)) <= 0.0f)                                        ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Invalid eff_nseq on EFFN line: should be a real number, not %s", tok1);
2939       }
2940 
2941       else if (strcmp(tag, "CKSUM") == 0) {
2942 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Nothing follows CKSUM tag");
2943 	hmm->checksum = atoll(tok1); /* if atoi(), then you may truncate uint32_t checksums > 2^31-1 */
2944 	hmm->flags |= p7H_CHKSUM;
2945       }
2946 
2947       else if (strcmp(tag, "STATS") == 0) {
2948 	if (hfp->format >= p7_HMMFILE_3b)
2949 	  {
2950 	    if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on STATS line"); /* LOCAL */
2951 	    if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on STATS line"); /* MSV | VITERBI | FORWARD */
2952 	    if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok3, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on STATS line"); /* mu | tau */
2953 	    if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok4, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on STATS line"); /* lambda */
2954 	    if (strcasecmp(tok1, "LOCAL") == 0)
2955 	      {
2956 		if      (strcasecmp(tok2, "MSV")     == 0)  { hmm->evparam[p7_MMU]  = atof(tok3); hmm->evparam[p7_MLAMBDA] = atof(tok4); statstracker |= 0x1; }
2957 		else if (strcasecmp(tok2, "VITERBI") == 0)  { hmm->evparam[p7_VMU]  = atof(tok3); hmm->evparam[p7_VLAMBDA] = atof(tok4); statstracker |= 0x2; }
2958 		else if (strcasecmp(tok2, "FORWARD") == 0)  { hmm->evparam[p7_FTAU] = atof(tok3); hmm->evparam[p7_FLAMBDA] = atof(tok4); statstracker |= 0x4; }
2959 		else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 3", tok2);
2960 	      } else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 2", tok1);
2961 	  }
2962 	else if (hfp->format == p7_HMMFILE_3a) /* reverse compatibility with 30a */
2963 	  {
2964 	    if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on STATS line"); /* LOCAL */
2965 	    if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on STATS line"); /* VLAMBDA | VMU | FTAU */
2966 	    if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok3, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on STATS line"); /* value */
2967 	    if (strcasecmp(tok1, "LOCAL") == 0)
2968 	      {
2969 		if      (strcasecmp(tok2, "VLAMBDA") == 0)  { hmm->evparam[p7_MLAMBDA] = hmm->evparam[p7_VLAMBDA] = hmm->evparam[p7_FLAMBDA] = atof(tok3);  statstracker |= 0x1; }
2970 		else if (strcasecmp(tok2, "VMU")     == 0)  {                            hmm->evparam[p7_MMU]     = hmm->evparam[p7_VMU]     = atof(tok3);  statstracker |= 0x2; }
2971 		else if (strcasecmp(tok2, "FTAU")    == 0)  {                                                       hmm->evparam[p7_FTAU]    = atof(tok3);  statstracker |= 0x4; }
2972 		else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 3", tok2);
2973 	      } else ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Failed to parse STATS, %s unrecognized as field 2", tok1);
2974 	  }
2975       }
2976 
2977       else if (strcmp(tag, "GA") == 0) {
2978         if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on GA line");
2979         hmm->cutoff[p7_GA1] = atof(tok1);
2980         if ( (abc->type == eslDNA || abc->type == eslRNA) ) { //if DNA, there's no need for a 2nd value (domain GA)
2981           hmm->cutoff[p7_GA2] = hmm->cutoff[p7_GA1];
2982         } else {
2983           if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on GA line");
2984           hmm->cutoff[p7_GA2] = atof(tok2);
2985         }
2986         hmm->flags         |= p7H_GA;
2987       }
2988 
2989       else if (strcmp(tag, "TC") == 0) {
2990         if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on TC line");
2991         hmm->cutoff[p7_TC1] = atof(tok1);
2992         if ( (abc->type == eslDNA || abc->type == eslRNA) ) { //if DNA, there's no need for a 2nd value (domain GA)
2993           hmm->cutoff[p7_TC2] = hmm->cutoff[p7_TC1];
2994         } else {
2995           if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on TC line");
2996           hmm->cutoff[p7_TC2] = atof(tok2);
2997         }
2998         hmm->flags         |= p7H_TC;
2999       }
3000 
3001       else if (strcmp(tag, "NC") == 0) {
3002         if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on NC line");
3003         hmm->cutoff[p7_NC1] = atof(tok1);
3004         if ( (abc->type == eslDNA || abc->type == eslRNA) ) { //if DNA, there's no need for a 2nd value (domain GA)
3005           hmm->cutoff[p7_NC2] = hmm->cutoff[p7_NC1];
3006         } else {
3007           if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok2, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on NC line");
3008           hmm->cutoff[p7_NC2] = atof(tok2);
3009         }
3010         hmm->flags         |= p7H_NC;
3011       }
3012 
3013       else if (strcmp(tag, "HMM") == 0)
3014   break;
3015     } /* end, loop over possible header tags */
3016 
3017   if (status != eslOK) goto ERROR;
3018 
3019   /* If we saw one STATS line, we need all 3. (True for both 3/a and 3/b formats) */
3020   if      (statstracker == 0x7) hmm->flags |= p7H_STATS;
3021   else if (statstracker != 0x0) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Missing one or more STATS parameter lines");
3022 
3023 
3024   /* Skip main model header lines; allocate body of HMM now that K,M are known */
3025   if ((status = esl_fileparser_NextLine(hfp->efp))                            != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data before main model section");
3026   if ((status = esl_fileparser_NextLine(hfp->efp))                            != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data before main model section");
3027   if ((status = p7_hmm_CreateBody(hmm, hmm->M, abc))                          != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Failed to allocate body of the new HMM");
3028   if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))         != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data before main model section");
3029 
3030   /* Optional model composition (filter null model) may immediately follow headers */
3031   if (strcmp(tok1, "COMPO") == 0) {
3032     for (x = 0; x < abc->K; x++)  {
3033       if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))     != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on COMPO line");
3034       hmm->compo[x] = (*tok1 == '*' ? 0.0 : expf(-1.0 * atof(tok1)));
3035     }
3036     hmm->flags |= p7H_COMPO;
3037     if ((status = esl_fileparser_NextLine(hfp->efp))                          != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data after COMPO line");
3038     if ((esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))                != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data after COMPO line");
3039   }
3040 
3041   /* First two lines are node 0: insert emissions, then transitions from node 0 (begin) */
3042 
3043   hmm->ins[0][0] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3044   for (x = 1; x < abc->K; x++) {
3045     if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))       != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on insert line, node 0: expected %d, got %d\n", abc->K, x);
3046     hmm->ins[0][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3047   }
3048   if ((status = esl_fileparser_NextLine(hfp->efp))                            != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data in main model: no node 0 transition line");
3049   for (x = 0; x < p7H_NTRANSITIONS; x++) {
3050     if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))       != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few fields on begin (0) transition line");
3051     hmm->t[0][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3052   }
3053 
3054   /* The main model section. */
3055   for (k = 1; k <= hmm->M; k++)
3056     {
3057 
3058       if ((status = esl_fileparser_NextLine(hfp->efp))                        != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data in main model section, at node %d (expected %d)", k, hmm->M);
3059       if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))     != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data in main model section, at node %d (expected %d)", k, hmm->M);
3060       if (atoi(tok1) != k)                                                               ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Expected match line to start with %d (of %d); saw %s", k, hmm->M, tok1);
3061 
3062       for (x = 0; x < abc->K; x++) {
3063 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few probability fields on match line, node %d: expected %d, got %d\n", k, abc->K, x);
3064 	hmm->mat[k][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3065       }
3066 
3067       if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))     != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Missing MAP field on match line for node %d: should at least be -", k);
3068       if (hmm->flags & p7H_MAP) hmm->map[k] = atoi(tok1);
3069 
3070       if (hfp->format >= p7_HMMFILE_3e) {
3071 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Missing CONS field on match line for node %d: should at least be -", k);
3072 	if (hmm->flags & p7H_CONS) hmm->consensus[k] = *tok1;
3073       }
3074       if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))     != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Missing RF field on match line for node %d: should at least be -",  k);
3075       if (hmm->flags & p7H_RF) hmm->rf[k]   = *tok1;
3076 
3077       if (hfp->format >= p7_HMMFILE_3f) {
3078         if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Missing MM field on match line for node %d: should at least be -", k);
3079         if (hmm->flags & p7H_MMASK) hmm->mm[k] = *tok1;
3080       }
3081 
3082 
3083       if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))     != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Missing CS field on match line for node %d: should at least be -",  k);
3084       if (hmm->flags & p7H_CS) hmm->cs[k]   = *tok1;
3085 
3086 
3087       if ((status = esl_fileparser_NextLine(hfp->efp))                        != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data in main model: no insert emission line, node %d", k);
3088       for (x = 0; x < abc->K; x++) {
3089 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few probability fields on insert line, node %d: expected %d, got %d\n", k, abc->K, x);
3090 	hmm->ins[k][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3091       }
3092       if ((status = esl_fileparser_NextLine(hfp->efp))                        != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data in main model: no transition line, node %d", k);
3093       for (x = 0; x < p7H_NTRANSITIONS; x++) {
3094 	if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))   != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Too few probability fields on transition line, node %d: expected %d, got %d\n", k, abc->K, x);
3095 	hmm->t[k][x] = (*tok1 == '*' ? 0.0 : expf(-1.0 *atof(tok1)));
3096       }
3097     }
3098 
3099   /* The closing // */
3100   if ((status = esl_fileparser_NextLine(hfp->efp))                            != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data: missing //?");
3101   if ((status = esl_fileparser_GetTokenOnLine(hfp->efp, &tok1, NULL))         != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Premature end of data: missing //?");
3102   if (strcmp(tok1, "//")                                                      != 0)      ESL_XFAIL(eslEFORMAT, hfp->errbuf, "Expected closing //; found %s instead", tok1);
3103 
3104   /* legacy issues */
3105   if (hfp->format < p7_HMMFILE_3e && (status = p7_hmm_SetConsensus(hmm, NULL)) != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Failed to set consensus on legacy HMM format");
3106 
3107   /* Finish up. */
3108   if (hmm->flags & p7H_RF)   { hmm->rf[0]        = ' '; hmm->rf[hmm->M+1]        = '\0'; }
3109   if (hmm->flags & p7H_MMASK){ hmm->mm[0]        = ' '; hmm->mm[hmm->M+1]        = '\0'; }
3110   if (hmm->flags & p7H_CONS) { hmm->consensus[0] = ' '; hmm->consensus[hmm->M+1] = '\0'; }
3111   if (hmm->flags & p7H_CS)   { hmm->cs[0]        = ' '; hmm->cs[hmm->M+1]        = '\0'; }
3112   if (hmm->flags & p7H_MAP)  { hmm->map[0]       = 0; }
3113   if (hmm->name == NULL)    ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No NAME found for HMM");
3114   if (hmm->M    <= 0)       ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No LENG found for HMM (or LENG <= 0)");
3115   if (abc       == NULL)    ESL_XFAIL(eslEFORMAT, hfp->errbuf, "No ALPH found for HMM");
3116 
3117   if (*ret_abc == NULL) *ret_abc = abc;
3118   if ( opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm);
3119   return eslOK;
3120 
3121  ERROR:
3122   if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc);
3123   if (hmm     != NULL) p7_hmm_Destroy(hmm);
3124   if (opt_hmm != NULL) *opt_hmm = NULL;
3125   if      (status == eslEMEM || status == eslESYS) return status;
3126   else if (status == eslEOF)                       return status;
3127   else if (status == eslEINCOMPAT)                 return status;
3128   else                                             return eslEFORMAT;  /* anything else is a format error: includes premature EOF, EOL, EOD  */
3129 }
3130 
3131 
3132 static int
read_bin30hmm(P7_HMMFILE * hfp,ESL_ALPHABET ** ret_abc,P7_HMM ** opt_hmm)3133 read_bin30hmm(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm)
3134 {
3135   ESL_ALPHABET *abc = NULL;
3136   P7_HMM       *hmm = NULL;
3137   uint32_t      magic;
3138   int           alphabet_type;
3139   int           k;
3140   off_t         offset = 0;
3141   int           status;
3142 
3143   hfp->errbuf[0] = '\0';
3144   if (feof(hfp->f))                                             { status = eslEOF;       goto ERROR; }
3145 
3146   if (hfp->newly_opened)
3147     {
3148       offset = 0;
3149       hfp->newly_opened = FALSE;
3150     }
3151   else
3152     {  /* Check magic. */
3153       if ((!hfp->do_stdin) && (! hfp->do_gzip)) {
3154 	if ((offset = ftello(hfp->f)) < 0)                          ESL_XEXCEPTION(eslESYS, "ftello() failed");
3155       }
3156       if (! fread((char *) &magic, sizeof(uint32_t), 1, hfp->f))    { status = eslEOF;       goto ERROR; }
3157 
3158       if      (hfp->format == p7_HMMFILE_3f) { if (magic != v3f_magic)  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM");  }
3159       else if (hfp->format == p7_HMMFILE_3e) { if (magic != v3e_magic)  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM");  }
3160       else if (hfp->format == p7_HMMFILE_3d) { if (magic != v3d_magic)  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM");  }
3161       else if (hfp->format == p7_HMMFILE_3c) { if (magic != v3c_magic)  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM");  }
3162       else if (hfp->format == p7_HMMFILE_3b) { if (magic != v3b_magic)  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM");  }
3163       else if (hfp->format == p7_HMMFILE_3a) { if (magic != v3a_magic)  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "bad magic number at start of HMM");  }
3164       else                                                              ESL_XFAIL(eslEFORMAT, hfp->errbuf, "no such HMM file format code");
3165     }
3166 
3167   /* Allocate shell of the new HMM.
3168    * Two-step allocation lets us read/set the flags first;
3169    * then the later CreateBody() call will allocate optional internal fields we need.
3170    */
3171   if ((hmm = p7_hmm_CreateShell()) == NULL)                     ESL_XFAIL(eslEMEM, hfp->errbuf, "allocation failed, HMM shell");
3172   hmm->offset = offset;
3173 
3174   /* Get sizes of things */
3175   /* xref J5/114 for a legacy use of <flags> for optional acc, desc annotation */
3176   if (! fread((char *) &(hmm->flags),  sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read flags");
3177   if (! fread((char *) &(hmm->M),      sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read model size M");
3178   if (! fread((char *) &alphabet_type, sizeof(int), 1, hfp->f)) ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read alphabet_type");
3179 
3180   /* Set or verify alphabet. */
3181   if (*ret_abc == NULL)	{	/* still unknown: set it, pass control of it back to caller */
3182     if ((abc = esl_alphabet_Create(alphabet_type)) == NULL)     ESL_XFAIL(eslEMEM, hfp->errbuf, "allocation failed, alphabet");
3183   } else {			/* already known: check it */
3184     abc = *ret_abc;
3185     if (abc->type != alphabet_type)                             ESL_XFAIL(eslEINCOMPAT, hfp->errbuf, "Alphabet type mismatch: was %s, but current HMM says %s", esl_abc_DecodeType( abc->type), esl_abc_DecodeType(alphabet_type));
3186   }
3187 
3188   /* Finish the allocation of the HMM
3189    */
3190   if ((status = p7_hmm_CreateBody(hmm, hmm->M, abc)) != eslOK)  ESL_XFAIL(eslEMEM, hfp->errbuf, "allocation failed, HMM body");
3191 
3192   /* Core model probabilities. */
3193   for (k = 1; k <= hmm->M; k++)
3194     if (! fread((char *) hmm->mat[k], sizeof(float), hmm->abc->K,      hfp->f))  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read mat[%d]", k);
3195   for (k = 0; k <= hmm->M; k++)
3196     if (! fread((char *) hmm->ins[k], sizeof(float), hmm->abc->K,      hfp->f))  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read ins[%d]", k);
3197   for (k = 0; k <= hmm->M; k++)
3198     if (! fread((char *) hmm->t[k],   sizeof(float), p7H_NTRANSITIONS, hfp->f))  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read t[%d]", k);
3199 
3200   /* Annotations. */
3201   if (read_bin_string(hfp->f, &(hmm->name)) != eslOK)                                                ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read name");
3202   if ((hmm->flags & p7H_ACC)  && read_bin_string(hfp->f, &(hmm->acc))  != eslOK)                     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read acc");
3203   if ((hmm->flags & p7H_DESC) && read_bin_string(hfp->f, &(hmm->desc)) != eslOK)                     ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read desc");
3204   if ((hmm->flags & p7H_RF)   && ! fread((char *) hmm->rf,        sizeof(char), hmm->M+2, hfp->f))   ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read rf");   /* +2: 1..M and trailing \0 */
3205   if ((hmm->flags & p7H_MMASK)&& ! fread((char *) hmm->mm,        sizeof(char), hmm->M+2, hfp->f))   ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read mm");   /* +2: 1..M and trailing \0 */
3206   if ((hmm->flags & p7H_CONS) && ! fread((char *) hmm->consensus, sizeof(char), hmm->M+2, hfp->f))   ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read consensus"); /* don't need to test for >=3e format, because the flag is sufficient (didn't exist pre-3e) */
3207   if ((hmm->flags & p7H_CS)   && ! fread((char *) hmm->cs,        sizeof(char), hmm->M+2, hfp->f))   ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read cs");
3208   if ((hmm->flags & p7H_CA)   && ! fread((char *) hmm->ca,        sizeof(char), hmm->M+2, hfp->f))   ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read ca");
3209   if (read_bin_string(hfp->f, &(hmm->comlog)) != eslOK)                                              ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read comlog");
3210   if (! fread((char *) &(hmm->nseq),       sizeof(int),   1, hfp->f))                                ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read nseq");
3211   if (! fread((char *) &(hmm->eff_nseq),   sizeof(float), 1, hfp->f))                                ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read eff_nseq");
3212   if (hfp->format >= p7_HMMFILE_3c) {
3213     if (! fread((char *) &(hmm->max_length), sizeof(int),   1, hfp->f))                         ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read max_length");
3214   }
3215   if (read_bin_string(hfp->f, &(hmm->ctime))  != eslOK)                                       ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read ctime");
3216   if ((hmm->flags & p7H_MAP)  && ! fread((char *) hmm->map, sizeof(int), hmm->M+1, hfp->f))   ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read map");
3217   if (! fread((char *) &(hmm->checksum), sizeof(uint32_t),1,hfp->f))                          ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read checksum");
3218 
3219   /* E-value parameters and Pfam cutoffs */
3220   if (hfp->format >= p7_HMMFILE_3b) {
3221     if (! fread((char *) hmm->evparam, sizeof(float), p7_NEVPARAM, hfp->f))                            ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read statistical params");
3222   } else if (hfp->format == p7_HMMFILE_3a) {
3223     /* a backward compatibility mode. 3/a files stored 3 floats: LAMBDA, MU, TAU. Read 3 #'s and carefully copy/rearrange them into new 6 format */
3224     if (! fread((char *) hmm->evparam, sizeof(float), 3,           hfp->f))                            ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read statistical params");
3225     hmm->evparam[p7_FLAMBDA] = hmm->evparam[0];
3226     hmm->evparam[p7_FTAU]    = hmm->evparam[2];
3227     hmm->evparam[p7_VLAMBDA] = hmm->evparam[0];
3228     hmm->evparam[p7_VMU]     = hmm->evparam[1];
3229     hmm->evparam[p7_MLAMBDA] = hmm->evparam[p7_VLAMBDA];
3230     hmm->evparam[p7_MMU]     = hmm->evparam[p7_VMU];
3231   }
3232   if (! fread((char *) hmm->cutoff,  sizeof(float), p7_NCUTOFFS, hfp->f))                            ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read Pfam score cutoffs");
3233   if ((hmm->flags & p7H_COMPO) && ! fread((char *) hmm->compo, sizeof(float), hmm->abc->K, hfp->f))  ESL_XFAIL(eslEFORMAT, hfp->errbuf, "failed to read model composition");
3234 
3235   /* other legacy issues */
3236   if (hfp->format < p7_HMMFILE_3e && (status = p7_hmm_SetConsensus(hmm, NULL)) != eslOK)  ESL_XFAIL(status,     hfp->errbuf, "Failed to set consensus on legacy HMM format");
3237 
3238   if (*ret_abc == NULL) *ret_abc = abc;	/* pass our new alphabet back to caller, if caller didn't know it already */
3239   if ( opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm);
3240   return eslOK;
3241 
3242  ERROR:
3243   if (*ret_abc == NULL && abc != NULL) esl_alphabet_Destroy(abc); /* the test is for an alphabet created here, not passed */
3244   if (hmm     != NULL) p7_hmm_Destroy(hmm);
3245   if (opt_hmm != NULL) *opt_hmm = NULL;
3246   return status;
3247 }
3248 /*--------------- end, private format parsers -------------------*/
3249 
3250 
3251 
3252 
3253 
3254 /*****************************************************************
3255  * 6. Other private functions involved in i/o
3256  *****************************************************************/
3257 
3258 /*****************************************************************
3259  * Some miscellaneous utility functions
3260  *****************************************************************/
3261 
3262 /* Function: read_bin_string()
3263  * Date:     SRE, Wed Oct 29 14:03:23 1997 [TWA 721]
3264  *
3265  * Purpose:  Read in a string from a binary file, where
3266  *           the first integer is the length (including '\0').
3267  *           If the length is 0, <*ret_s> is set to <NULL>.
3268  *
3269  *           This is a reasonable convention for storing/ reading
3270  *           strings in binary files. Note that because the length is
3271  *           inclusive of '\0', there's a difference between a NULL
3272  *           string and an empty string.
3273  *
3274  * Args:     fp       - FILE to read from
3275  *           ret_s    - string to read into
3276  *
3277  * Return:   <eslOK> on success. ret_s is malloc'ed here.
3278  *           <eslEOD> if a read fails - likely because no more
3279  *             data in file.
3280  *
3281  * Throws    <eslEMEM> on allocation error.
3282  */
3283 static int
read_bin_string(FILE * fp,char ** ret_s)3284 read_bin_string(FILE *fp, char **ret_s)
3285 {
3286   int   status;
3287   char *s = NULL;
3288   int   len;
3289 
3290   if (! fread((char *) &len, sizeof(int), 1, fp)) { status = eslEOD; goto ERROR; }
3291   if (len > 0) {
3292     ESL_ALLOC(s,  (sizeof(char) * len));
3293     if (! fread((char *) s, sizeof(char), len, fp)) { status = eslEOD; goto ERROR; }
3294   }
3295   *ret_s = s;
3296   return eslOK;
3297 
3298  ERROR:
3299   if (s != NULL) free(s);
3300   *ret_s = NULL;
3301   return status;
3302 }
3303 
3304 /* Function: prob2ascii()
3305  *
3306  * Purpose:  Format a probability for output to an ASCII save
3307  *           file. Returns a ptr to a static internal buffer.
3308  *
3309  */
3310 static char *
prob2ascii(float p,float null)3311 prob2ascii(float p, float null)
3312 {
3313   static char buffer[32];
3314 
3315   if (p == 0.0) return "*";
3316   sprintf(buffer, "%.3f", sreLOG2(p/null));
3317   return buffer;
3318 }
3319 
3320 
3321 /* Function: ascii2prob()
3322  *
3323  * Purpose:  Convert a saved string back to a probability.
3324  */
3325 static float
ascii2prob(char * s,float null)3326 ascii2prob(char *s, float null)
3327 {
3328   return (*s == '*') ? 0. : exp(atof(s)/1.44269504)*null;
3329 }
3330 
3331 /* EPN, Tue Aug  7 15:54:15 2007
3332  * is_integer() and is_real(), savagely ripped verbatim out
3333  * of Easel's esl_getopts.c, where they were private.
3334  */
3335 /* Function: is_integer()
3336  *
3337  * Returns TRUE if <s> points to something that atoi() will parse
3338  * completely and convert to an integer.
3339  */
3340 static int
is_integer(char * s)3341 is_integer(char *s)
3342 {
3343   int hex = 0;
3344 
3345   if (s == NULL) return 0;
3346   while (isspace((int) (*s))) s++;      /* skip whitespace */
3347   if (*s == '-' || *s == '+') s++;      /* skip leading sign */
3348 				        /* skip leading conversion signals */
3349   if ((strncmp(s, "0x", 2) == 0 && (int) strlen(s) > 2) ||
3350       (strncmp(s, "0X", 2) == 0 && (int) strlen(s) > 2))
3351     {
3352       s += 2;
3353       hex = 1;
3354     }
3355   else if (*s == '0' && (int) strlen(s) > 1)
3356     s++;
3357 				/* examine remainder for garbage chars */
3358   if (!hex)
3359     while (*s != '\0')
3360       {
3361 	if (!isdigit((int) (*s))) return 0;
3362 	s++;
3363       }
3364   else
3365     while (*s != '\0')
3366       {
3367 	if (!isxdigit((int) (*s))) return 0;
3368 	s++;
3369       }
3370   return 1;
3371 }
3372 
3373 
3374 /* is_real()
3375  *
3376  * Returns TRUE if <s> is a string representation
3377  * of a valid floating point number, convertable
3378  * by atof().
3379  */
3380 static int
is_real(char * s)3381 is_real(char *s)
3382 {
3383   int gotdecimal = 0;
3384   int gotexp     = 0;
3385   int gotreal    = 0;
3386 
3387   if (s == NULL) return 0;
3388 
3389   while (isspace((int) (*s))) s++; /* skip leading whitespace */
3390   if (*s == '-' || *s == '+') s++; /* skip leading sign */
3391 
3392   /* Examine remainder for garbage. Allowed one '.' and
3393    * one 'e' or 'E'; if both '.' and e/E occur, '.'
3394    * must be first.
3395    */
3396   while (*s != '\0')
3397     {
3398       if (isdigit((int) (*s))) 	gotreal++;
3399       else if (*s == '.')
3400 	{
3401 	  if (gotdecimal) return 0; /* can't have two */
3402 	  if (gotexp) return 0;     /* e/E preceded . */
3403 	  else gotdecimal++;
3404 	}
3405       else if (*s == 'e' || *s == 'E')
3406 	{
3407 	  if (gotexp) return 0;	/* can't have two */
3408 	  else gotexp++;
3409 	}
3410       else if (isspace((int) (*s)))
3411 	break;
3412       s++;
3413     }
3414 
3415   while (isspace((int) (*s))) s++;         /* skip trailing whitespace */
3416   if (*s == '\0' && gotreal) return 1;
3417   else return 0;
3418 }
3419 
3420 /*---------------- end, private utilities -----------------------*/
3421 
3422 /*****************************************************************
3423  * 7. Legacy v1.0 ascii file format output.
3424  *****************************************************************/
3425 
3426 /* Function:  cm_file_Write1p0Ascii()
3427  * Incept:    EPN, Tue Feb 28 14:35:47 2012
3428  *
3429  * Purpose:   Write a CM in version 1.0 format.
3430  *            cm_io.c:write_ascii_cm() from Infernal 1.0.2
3431  *            was used as the starting point for this function.
3432  *
3433  *            Calibration parameters: E-values and HMM filter
3434  *            thresholds are not written here. This is because 1.0
3435  *            expects either both E-values and HMM filter thresholds
3436  *            or neither and HMM filter thresholds don't exist
3437  *            in the new format.
3438  *
3439  * Returns: eslOK on success;
3440  */
3441 int
cm_file_Write1p0ASCII(FILE * fp,CM_t * cm)3442 cm_file_Write1p0ASCII(FILE *fp, CM_t *cm)
3443 {
3444   int status;
3445   int v,x,y,nd;
3446   char *comlog2print  = NULL;
3447 
3448   fprintf(fp, "INFERNAL-1 [converted from %s]\n", INFERNAL_VERSION);
3449 
3450   fprintf(fp,                          "NAME     %s\n", cm->name);
3451   if (cm->acc  != NULL)    fprintf(fp, "ACC      %s\n", cm->acc);
3452   if (cm->desc != NULL)    fprintf(fp, "DESC     %s\n", cm->desc);
3453   /* Rfam cutoffs */
3454   if (cm->flags & CMH_GA)  fprintf(fp, "GA       %.2f\n", cm->ga);
3455   if (cm->flags & CMH_TC)  fprintf(fp, "TC       %.2f\n", cm->tc);
3456   if (cm->flags & CMH_NC)  fprintf(fp, "NC       %.2f\n", cm->nc);
3457   fprintf(fp, "STATES   %d\n",   cm->M);
3458   fprintf(fp, "NODES    %d\n",   cm->nodes);
3459   fprintf(fp, "ALPHABET %d\n",   cm->abc->type);
3460   fprintf(fp, "ELSELF   %.8f\n", cm->el_selfsc);
3461   fprintf(fp, "WBETA    %g\n",   cm->beta_W);
3462   fprintf(fp, "NSEQ     %d\n",   cm->nseq);
3463   fprintf(fp, "EFFNSEQ  %.3f\n", cm->eff_nseq);
3464   fprintf(fp, "CLEN     %d\n",   cm->clen);
3465 
3466   /* Print out the BCOM line as the cm->comlog string up to the first
3467    * '\n' or the end of the string. Print cm->ctime as BDATE. We
3468    * don't print CCOM and CDATE. This should be okay because we won't
3469    * likely have E-value and HMM filter stats since we're probably
3470    * converting from a 1.1 or later file. But it is possible we're
3471    * converting from a 1.0 file to a 1.0 file in which case we output
3472    * E-value stats and filter threshold stats but no CCOM and CDATE
3473    * lines.  This won't affect the parsing of the file by 1.0 though.
3474    */
3475   if(cm->comlog != NULL) {
3476     ESL_ALLOC(comlog2print, sizeof(char) * (strlen(cm->comlog)+1));
3477     x = 0;
3478     while(x < strlen(cm->comlog) && cm->comlog[x] != '\n') { comlog2print[x] = cm->comlog[x]; x++; }
3479     comlog2print[x] = '\0';
3480     fprintf(fp, "BCOM     %s\n", comlog2print);
3481     free(comlog2print);
3482   }
3483   if(cm->ctime != NULL) fprintf(fp, "BDATE    %s\n", cm->ctime);
3484 
3485   fputs(      "NULL    ", fp);
3486   for (x = 0; x < cm->abc->K; x++)
3487     fprintf(fp, "%6s ", prob2ascii(cm->null[x], 1/(float)(cm->abc->K)));
3488   fputs("\n", fp);
3489 
3490   /* E-value statistics skipped in converted output
3491    * mainly because HMM filter thresholds no longer exist
3492    * in current version so we can't output them here.
3493    */
3494   /* main model section */
3495   fputs("MODEL:\n", fp);
3496   for (v = 0; v < cm->M; v++)
3497     {
3498       nd = cm->ndidx[v];
3499 
3500       /* Node line.
3501        */
3502       if (cm->nodemap[nd] == v)
3503 	fprintf(fp, "\t\t\t\t[ %-4s %4d ]\n", Nodetype(cm->ndtype[nd]), nd);
3504 
3505       /* State line, w/ parents, children, and transitions
3506        */
3507       fprintf(fp, "    %2s %5d %5d %1d %5d %5d ",
3508 	      Statetype(cm->sttype[v]), v,
3509 	      cm->plast[v], cm->pnum[v],
3510 	      cm->cfirst[v], cm->cnum[v]);
3511       if (cm->sttype[v] != B_st)
3512 	for (x = 0; x < cm->cnum[v]; x++)
3513 	  fprintf(fp, "%7s ", prob2ascii(cm->t[v][x], 1.));
3514       else x = 0;
3515       for (; x < 6; x++)
3516 	fprintf(fp, "%7s ", "");
3517 
3518       /* Emission line
3519        */
3520       if (cm->sttype[v] == MP_st)
3521 	{
3522 	  for (x = 0; x < cm->abc->K; x++)
3523 	    for (y = 0; y < cm->abc->K; y++)
3524 	      fprintf(fp, "%6s ", prob2ascii(cm->e[v][x*cm->abc->K+y], cm->null[x]*cm->null[y]));
3525 	}
3526       else if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st || cm->sttype[v] == IL_st || cm->sttype[v] == IR_st)
3527 	{
3528 	  for (x = 0; x < cm->abc->K; x++)
3529 	    fprintf(fp, "%6s ", prob2ascii(cm->e[v][x], cm->null[x]));
3530 	}
3531       fputs("\n", fp);
3532     }
3533   fputs("//\n", fp);
3534   return eslOK;
3535 
3536  ERROR:
3537   ESL_EXCEPTION(eslEMEM, "out of memory");
3538   return status;
3539 }
3540 
3541 /* multiline()
3542  *
3543  * Stolen from HMMER3, verbatim.
3544  *
3545  * Used to print the command log to ASCII save files.
3546  *
3547  * Given a record (like the comlog) that contains
3548  * multiple lines, print it as multiple lines with
3549  * a given prefix. e.g.:
3550  *
3551  * given:   "COM   ", "foo\nbar\nbaz"
3552  * print:   COM   1 foo
3553  *          COM   2 bar
3554  *          COM   3 baz
3555  *
3556  * If <s> is NULL, no-op. Otherwise <s> must be a <NUL>-terminated
3557  * string.  It does not matter if it ends in <\n> or not. <pfx>
3558  * must be a valid <NUL>-terminated string; it may be empty.
3559  *
3560  * Args:     fp:   FILE to print to
3561  *           pfx:  prefix for each line
3562  *           s:    line to break up and print; tolerates a NULL
3563  *
3564  * Returns: <eslOK> on success.
3565  *
3566  * Throws:  <eslEWRITE> on write error.
3567  */
3568 static int
multiline(FILE * fp,const char * pfx,char * s)3569 multiline(FILE *fp, const char *pfx, char *s)
3570 {
3571   char *sptr  = s;
3572   char *end   = NULL;
3573   int   n     = 0;
3574   int   nline = 1;
3575 
3576   do {
3577     end = strchr(sptr, '\n');
3578 
3579     if (end != NULL)                  /* if there's no \n left, end == NULL */
3580       {
3581   n = end - sptr;                       /* n chars exclusive of \n */
3582   if (fprintf(fp, "%s [%d] ", pfx, nline++) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed");
3583   if (fwrite(sptr, sizeof(char), n, fp)    != n) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed");  /* using fwrite lets us write fixed # of chars   */
3584   if (fprintf(fp, "\n")                     < 0) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed");  /* while writing \n w/ printf allows newline conversion */
3585   sptr += n + 1;                       /* +1 to get past \n */
3586       }
3587     else
3588       {
3589   if (fprintf(fp, "%s [%d] %s\n", pfx, nline++, sptr) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "hmm write failed"); /* last line */
3590       }
3591   } while (end != NULL  && *sptr != '\0');   /* *sptr == 0 if <s> terminates with a \n */
3592   return eslOK;
3593 }
3594 
3595 
3596 /*****************************************************************
3597  * 8. Benchmark driver.
3598  *****************************************************************/
3599 #ifdef CM_FILE_BENCHMARK
3600 /*
3601   gcc -pthread -std=gnu99 -g -Wall -static -o cm_file_benchmark -I. -L. -I../hmmer/src -L../hmmer/src -I../easel -L../easel -DCM_FILE_BENCHMARK cm_file.c -linfernal -lhmmer -leasel -lm
3602   icc -pthread                 -O3 -static -o cm_file_benchmark -I. -L. -I../hmmer/src -L../hmmer/src -I../easel -L../easel -DCM_FILE_BENCHMARK cm_file.c -linfernal -lhmmer -leasel -lm
3603   ./cm_file_benchmark Rfam.cm
3604  */
3605 #include "esl_config.h"
3606 #include "p7_config.h"
3607 #include "config.h"
3608 
3609 #include <stdlib.h>
3610 #include <stdio.h>
3611 
3612 #include "easel.h"
3613 #include "esl_getopts.h"
3614 #include "esl_stopwatch.h"
3615 
3616 #include "hmmer.h"
3617 
3618 #include "infernal.h"
3619 
3620 static ESL_OPTIONS options[] = {
3621   /* name           type      default  env  range toggles reqs incomp  help                                  docgroup*/
3622   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",  0 },
3623   { "-a",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "include time of CM configuration", 0 },
3624   { "-v",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "verbose: print model info as they're read", 0 },
3625   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3626 };
3627 static char usage[]  = "[-options] <CM file>";
3628 static char banner[] = "benchmark driver for CM input";
3629 
3630 int
main(int argc,char ** argv)3631 main(int argc, char **argv)
3632 {
3633   ESL_GETOPTS   *go       = cm_CreateDefaultApp(options, 1, argc, argv, banner, usage);
3634   ESL_STOPWATCH *w        = esl_stopwatch_Create();
3635   ESL_STOPWATCH *w2       = esl_stopwatch_Create();
3636   ESL_ALPHABET  *abc      = NULL;
3637   char          *cmfile   = esl_opt_GetArg(go, 1);
3638   CM_FILE       *cmfp     = NULL;
3639   CM_t          *cm       = NULL;
3640   int            nmodel   = 0;
3641   uint64_t       tot_clen = 0;
3642   int            status;
3643   int            be_verbose = esl_opt_GetBoolean(go, "-v");
3644   char           errbuf[eslERRBUFSIZE];
3645 
3646   esl_stopwatch_Start(w);
3647 
3648   status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf);
3649   if      (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
3650   else if (status == eslEFORMAT)   cm_Fail("File format problem in trying to open CM file %s.\n%s\n",                cmfile, errbuf);
3651   else if (status != eslOK)        cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n",               status, cmfile, errbuf);
3652   if      (cmfp->do_gzip)          cm_Fail("Reading gzipped CM files is not supported");
3653   if      (cmfp->do_stdin)         cm_Fail("Reading CM files from stdin is not supported");
3654 
3655   if(be_verbose) esl_stopwatch_Start(w2);
3656   while ((status = cm_file_Read(cmfp, &abc, &cm)) == eslOK)
3657     {
3658       nmodel++;
3659       tot_clen += cm->clen;
3660 
3661       if (esl_opt_GetBoolean(go, "-a")) {
3662 	if((status = cm_Configure(cm, errbuf, -1)) != eslOK) cm_Fail(errbuf);
3663       }
3664 
3665       if(be_verbose) {
3666 	esl_stopwatch_Stop(w2);
3667 	printf("%-30s  ", cm->name);
3668 	esl_stopwatch_Display(stdout, w2, "CPU time: ");
3669 	esl_stopwatch_Start(w2);
3670       }
3671       FreeCM(cm);
3672     }
3673   if      (status == eslEFORMAT)   cm_Fail("bad file format in CM file %s\n%s",             cmfile, cmfp->errbuf);
3674   else if (status == eslEINCOMPAT) cm_Fail("CM file %s contains different alphabets\n%s",   cmfile, cmfp->errbuf);
3675   else if (status != eslEOF)       cm_Fail("Unexpected error in reading CMs from %s\n%s",   cmfile, cmfp->errbuf);
3676 
3677   esl_stopwatch_Stop(w);
3678   esl_stopwatch_Display(stdout, w, "# CPU time: ");
3679   printf("# number of models: %d\n", nmodel);
3680   printf("# total clen:       %" PRId64 "\n", tot_clen);
3681 
3682   cm_file_Close(cmfp);
3683   esl_alphabet_Destroy(abc);
3684   esl_stopwatch_Destroy(w);
3685   esl_stopwatch_Destroy(w2);
3686   esl_getopts_Destroy(go);
3687   return 0;
3688 }
3689 #endif /*cm_FILE_BENCHMARK*/
3690 /*---------------- end, benchmark driver ------------------------*/
3691 
3692 
3693 /*****************************************************************
3694  * 9. Example.
3695  *****************************************************************/
3696 /* On using the example to test error messages from cm_file_Open():
3697  *    Message
3698  *  --------------
3699  *  .gz file missing/not readable     \rm test.cm.gz; touch test.cm.gz; src/cmfile_example test.cm.gz
3700  *  gzip -dc doesn't exist            \cp testsuite/20aa.cm test.cm; gzip test.cm; sudo mv /usr/bin/gzip /usr/bin/gzip.old; src/cmfile_example test.cm.gz
3701  *  cm file not found                 \rm test.cm; src/cmfile_example test.cm
3702  *  bad SSI file format               \cp testsuite/20aa.cm test.cm; \rm test.cm.ssi; touch test.cm.ssi; src/cmfile_example test.cm
3703  *  64-bit SSI on 32-bit sys
3704  *  empty file                        \rm test.cm; touch test.cm
3705  *  unrecognized format (binary)      cat testsuite/20aa.cm > test.cm; src/cmpress test.cm; \rm test.cm; [edit test.cm.h3m, delete first byte]
3706  *  unrecognized format (ascii)       cat testsuite/20aa.cm | sed -e 's/^HMMER3\/b/HMMER3\/x/' > test.hmm
3707  *
3708  */
3709 
3710 #ifdef CMFILE_EXAMPLE
3711 /* gcc -g -Wall -DCMFILE_EXAMPLE -I. -I../easel -L. -L../easel -o cmfile_example cm_file.c -linfernal -lhmmer -leasel -lm
3712  */
3713 #include "esl_config.h"
3714 #include "p7_config.h"
3715 #include "config.h"
3716 
3717 #include "easel.h"
3718 #include "esl_alphabet.h"
3719 
3720 #include "hmmer.h"
3721 
3722 #include "infernal.h"
3723 
3724 int
main(int argc,char ** argv)3725 main(int argc, char **argv)
3726 {
3727   char         *cmfile = argv[1];
3728   CM_FILE      *cmfp   = NULL;
3729   CM_t         *cm     = NULL;
3730   ESL_ALPHABET *abc    = NULL;
3731   char          errbuf[eslERRBUFSIZE];
3732   int           status;
3733 
3734   /* An example of reading a single CM from a file, and checking that it is the only one. */
3735   status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf);
3736   if      (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
3737   else if (status == eslEFORMAT)   cm_Fail("File format problem in trying to open CM file %s.\n%s\n",                cmfile, errbuf);
3738   else if (status != eslOK)        cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n",               status, cmfile, errbuf);
3739 
3740   status = cm_file_Read(cmfp, TRUE, &abc, &cm);
3741   if      (status == eslEFORMAT)   cm_Fail("Bad file format in CM file %s:\n%s\n",          cmfp->fname, cmfp->errbuf);
3742   else if (status == eslEINCOMPAT) cm_Fail("CM in %s is not in the expected %s alphabet\n", cmfp->fname, esl_abc_DecodeType(abc->type));
3743   else if (status == eslEOF)       cm_Fail("Empty CM file %s? No CM data found.\n",         cmfp->fname);
3744   else if (status != eslOK)        cm_Fail("Unexpected error in reading CMs from %s\n",     cmfp->fname);
3745 
3746   status = cm_file_Read(cmfp, TRUE, &abc, NULL);
3747   if (status != eslEOF)            cm_Fail("CM file %s does not contain just one CM\n", cmfp->fname);
3748 
3749   cm_file_Close(cmfp);
3750 
3751   cm_file_WriteASCII(stdout, -1, cm);
3752 
3753   esl_alphabet_Destroy(abc);
3754   FreeCM(cm);
3755   return 0;
3756 }
3757 #endif /*CMFILE_EXAMPLE*/
3758 /*----------------------- end, example --------------------------*/
3759 
3760