1 /************************************************************
2 * HMMER - Biological sequence analysis with profile-HMMs
3 * Copyright (C) 1992-1998 Washington University School of Medicine
4 *
5 * This source code is distributed under the terms of the
6 * GNU General Public License. See the files COPYING and
7 * GNULICENSE for details.
8 *
9 ************************************************************/
10
11 /* hmmio.c
12 *
13 * Input/output of HMMs.
14 *
15 * As of HMMER 2.0, HMMs are saved by default in a tabular ASCII format
16 * as log-odds or log probabilities scaled to an integer. A binary save
17 * file format is also available which is faster to access (a
18 * consideration which might be important for HMM library applications).
19 * HMMs can be concatenated into HMM libraries.
20 *
21 * A comment on loss of accuracy. Storing a number as a scaled log
22 * probability guarantees us an error of about 0.035% or
23 * less in the retrieved probability. We also are relatively invulnerable
24 * to the truncation errors which HMMER 1.8 was vulnerable to.
25 *
26 * Magic numbers (both for the ASCII and binary save formats) are used
27 * to label save files with a major version number. This simplifies the task of
28 * backwards compatibility as new versions of the program are created.
29 * Reverse but not forward compatibility is guaranteed. I.e. HMMER 2.0
30 * can read `1.7' save files, but not vice versa. Note that the major
31 * version number in the save files is NOT the version of the software
32 * that generated it; rather, the number of the last major version in which
33 * save format changed.
34 *
35 ******************************************************************
36 *
37 * The HMM input API:
38 *
39 * HMMFILE *hmmfp;
40 * char *hmmfile;
41 * struct plan7_s *hmm;
42 * char env[] = "HMMERDB"; (a la BLASTDB)
43 *
44 * hmmfp = HMMFileOpen(hmmfile, env) NULL on failure
45 * while (HMMFileRead(hmmfp, &hmm)) 0 if no more HMMs
46 * if (hmm == NULL) Die(); NULL on file parse failure
47 * whatever;
48 * FreeHMM(hmm);
49 * }
50 * HMMFileClose(hmmfp);
51 *
52 *****************************************************************
53 *
54 * The HMM output API:
55 *
56 * FILE *ofp;
57 * struct plan7_s *hmm;
58 *
59 * WriteAscHMM(ofp, hmm); to write/append an HMM to open file
60 * or WriteBinHMM(ofp, hmm); to write/append binary format HMM to open file
61 *
62 *****************************************************************
63 *
64 * V1.0: original implementation
65 * V1.1: regularizers removed from model structure
66 * V1.7: ref and cs annotation lines added from alignment, one
67 * char per match state 1..M
68 * V1.9: null model and name added to HMM structure. ASCII format changed
69 * to compact tabular one.
70 * V2.0: Plan7. Essentially complete rewrite.
71 */
72
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 #include <ctype.h>
77 #include <time.h>
78 #include <unistd.h> /* to get SEEK_CUR definition on silly Suns */
79
80 #include "squid.h"
81 #include "config.h"
82 #include "structs.h"
83 #include "funcs.h"
84
85 #ifdef MEMDEBUG
86 #include "dbmalloc.h"
87 #endif
88
89 /* Magic numbers identifying binary formats.
90 * Do not change the old magics! Necessary for backwards compatibility.
91 */
92 static unsigned int v10magic = 0xe8ededb1; /* v1.0 binary: "hmm1" + 0x80808080 */
93 static unsigned int v10swap = 0xb1edede8; /* byteswapped v1.0 */
94 static unsigned int v11magic = 0xe8ededb2; /* v1.1 binary: "hmm2" + 0x80808080 */
95 static unsigned int v11swap = 0xb2edede8; /* byteswapped v1.1 */
96 static unsigned int v17magic = 0xe8ededb3; /* v1.7 binary: "hmm3" + 0x80808080 */
97 static unsigned int v17swap = 0xb3edede8; /* byteswapped v1.7 */
98 static unsigned int v19magic = 0xe8ededb4; /* V1.9 binary: "hmm4" + 0x80808080 */
99 static unsigned int v19swap = 0xb4edede8; /* V1.9 binary, byteswapped */
100 static unsigned int v20magic = 0xe8ededb5; /* V2.0 binary: "hmm5" + 0x80808080 */
101 static unsigned int v20swap = 0xb5edede8; /* V2.0 binary, byteswapped */
102
103 /* Old HMMER 1.x file formats.
104 */
105 #define HMMER1_0B 1 /* binary HMMER 1.0 */
106 #define HMMER1_0F 2 /* flat ascii HMMER 1.0 */
107 #define HMMER1_1B 3 /* binary HMMER 1.1 */
108 #define HMMER1_1F 4 /* flat ascii HMMER 1.1 */
109 #define HMMER1_7B 5 /* binary HMMER 1.7 */
110 #define HMMER1_7F 6 /* flat ascii HMMER 1.7 */
111 #define HMMER1_9B 7 /* HMMER 1.9 binary */
112 #define HMMER1_9F 8 /* HMMER 1.9 flat ascii */
113
114 static int read_asc20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
115 static int read_bin20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
116 static int read_asc19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
117 static int read_bin19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
118 static int read_asc17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
119 static int read_bin17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
120 static int read_asc11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
121 static int read_bin11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
122 static int read_asc10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
123 static int read_bin10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
124
125 static void byteswap(char *swap, int nbytes);
126 static char *prob2ascii(float p, float null);
127 static float ascii2prob(char *s, float null);
128 static void write_bin_string(FILE *fp, char *s);
129 static int read_bin_string(FILE *fp, int doswap, char **ret_s);
130 static void multiline(FILE *fp, char *pfx, char *s);
131
132 static struct plan9_s *read_plan9_binhmm(FILE *fp, int version, int swapped);
133 static struct plan9_s *read_plan9_aschmm(FILE *fp, int version);
134
135 /*****************************************************************
136 * HMM input API functions:
137 * HMMFileOpen()
138 * HMMFileRead()
139 * HMMFileClose()
140 * HMMFileRewind()
141 *****************************************************************/
142 /* Function: HMMFileOpen()
143 *
144 * Purpose: Open an HMM file for reading. The file may be either
145 * an index for a library of HMMs, or an HMM. If it's
146 * a library, sets is_library flag to TRUE in the HMMFILE
147 * structure.
148 *
149 * Args: hmmfile - name of file
150 * env - NULL, or environment variable for HMM database.
151 *
152 * Return: Valid HMMFILE *, or NULL on failure.
153 * hmmfp->f has been advanced beyond the first
154 * line (for text files) or the magic number (for binaries).
155 */
156 HMMFILE *
HMMFileOpen(char * hmmfile,char * env)157 HMMFileOpen(char *hmmfile, char *env)
158 {
159 return HMMFileOpenFseek(hmmfile,env,0);
160 }
161
162
163 /* Function: HMMFileOpenFseek()
164 *
165 * Purpose: Open an HMM file for reading at a byte offset
166 * The file may be either
167 * an index for a library of HMMs, or an HMM. If it's
168 * a library, sets is_library flag to TRUE in the HMMFILE
169 * structure.
170 *
171 * Adapted by Ewan
172 *
173 * Args: hmmfile - name of file
174 * env - NULL, or environment variable for HMM database.
175 *
176 * Return: Valid HMMFILE *, or NULL on failure.
177 * hmmfp->f has been advanced beyond the first
178 * line (for text files) or the magic number (for binaries).
179 */
180 HMMFILE *
HMMFileOpenFseek(char * hmmfile,char * env,int byte_pos)181 HMMFileOpenFseek(char *hmmfile, char *env,int byte_pos)
182 {
183 HMMFILE *hmmfp;
184 unsigned int magic;
185 char buf[512];
186
187 hmmfp = (HMMFILE *) MallocOrDie (sizeof(HMMFILE));
188 hmmfp->f = NULL;
189 hmmfp->parser = NULL;
190 hmmfp->is_binary = FALSE;
191 hmmfp->byteswap = FALSE;
192
193 /* Open the file. Look in current directory.
194 * If that doesn't work, check environment var for
195 * a second possible directory (usually the location
196 * of a system-wide HMM library)
197 */
198 if ((hmmfp->f = fopen(hmmfile, "r")) == NULL &&
199 (hmmfp->f = EnvFileOpen(hmmfile, env)) == NULL)
200 return NULL;
201
202 /* EWAN fseek to byte_pos to get the correct position in the file */
203 HMMFseek(hmmfp,byte_pos);
204
205 /* Check for binary or byteswapped binary format
206 * by peeking at first 4 bytes.
207 */
208 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) {
209 HMMFileClose(hmmfp);
210 return NULL;
211 }
212
213 /*EWAN ok... this rewind should be back to the byte position */
214 HMMFseek(hmmfp,byte_pos);
215
216 /* old code */
217 /* rewind(hmmfp->f); */
218
219 if (magic == v20magic) {
220 hmmfp->parser = read_bin20hmm;
221 hmmfp->is_binary = TRUE;
222 return hmmfp;
223 }
224 else if (magic == v20swap) {
225 hmmfp->parser = read_bin20hmm;
226 hmmfp->is_binary = TRUE;
227 hmmfp->byteswap = TRUE;
228 return hmmfp;
229 }
230 else if (magic == v19magic) {
231 hmmfp->parser = read_bin19hmm;
232 hmmfp->is_binary = TRUE;
233 return hmmfp;
234 }
235 else if (magic == v19swap) {
236 hmmfp->parser = read_bin19hmm;
237 hmmfp->is_binary = TRUE;
238 hmmfp->byteswap = TRUE;
239 return hmmfp;
240 }
241 else if (magic == v17magic) {
242 hmmfp->parser = read_bin17hmm;
243 hmmfp->is_binary = TRUE;
244 return hmmfp;
245 }
246 else if (magic == v17swap) {
247 hmmfp->parser = read_bin17hmm;
248 hmmfp->is_binary = TRUE;
249 hmmfp->byteswap = TRUE;
250 return hmmfp;
251 }
252 else if (magic == v11magic) {
253 hmmfp->parser = read_bin11hmm;
254 hmmfp->is_binary = TRUE;
255 return hmmfp;
256 }
257 else if (magic == v11swap) {
258 hmmfp->parser = read_bin11hmm;
259 hmmfp->is_binary = TRUE;
260 hmmfp->byteswap = TRUE;
261 return hmmfp;
262 }
263 else if (magic == v10magic) {
264 hmmfp->parser = read_bin10hmm;
265 hmmfp->is_binary = TRUE;
266 return hmmfp;
267 }
268 else if (magic == v10swap) {
269 hmmfp->parser = read_bin10hmm;
270 hmmfp->is_binary = TRUE;
271 hmmfp->byteswap = TRUE;
272 return hmmfp;
273 }
274 /* else we fall thru; it may be an ASCII file. */
275
276 /* If magic looks binary but we don't recognize it, choke and die.
277 */
278 if (magic & 0x80000000) {
279 Warn("\
280 %s appears to be a binary but format is not recognized\n\
281 It may be from a HMMER version more recent than yours,\n\
282 or may be a different kind of binary altogether.\n", hmmfile);
283 HMMFileClose(hmmfp);
284 return NULL;
285 }
286
287 /* Check for ASCII format by peeking at first word.
288 */
289 if (fgets(buf, 512, hmmfp->f) == NULL) {
290 HMMFileClose(hmmfp);
291 return NULL;
292 }
293
294 /*EWAN ok... this rewind should be back to the byte position */
295 HMMFseek(hmmfp,byte_pos);
296
297 if (strncmp("HMMER2.0", buf, 8) == 0) {
298 hmmfp->parser = read_asc20hmm;
299 return hmmfp;
300 } else if (strncmp("HMMER v1.9", buf, 10) == 0) {
301 hmmfp->parser = read_asc19hmm;
302 return hmmfp;
303 } else if (strncmp("# HMM v1.7", buf, 10) == 0) {
304 hmmfp->parser = read_asc17hmm;
305 return hmmfp;
306 } else if (strncmp("# HMM v1.1", buf, 10) == 0) {
307 hmmfp->parser = read_asc11hmm;
308 return hmmfp;
309 } else if (strncmp("# HMM v1.0", buf, 10) == 0) {
310 hmmfp->parser = read_asc10hmm;
311 return hmmfp;
312 }
313
314 /* If we haven't recognized it yet, it's bogus.
315 */
316 HMMFileClose(hmmfp);
317 return NULL;
318 }
319 int
HMMFileRead(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)320 HMMFileRead(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
321 {
322 return (*hmmfp->parser)(hmmfp, ret_hmm);
323 }
324 void
HMMFileClose(HMMFILE * hmmfp)325 HMMFileClose(HMMFILE *hmmfp)
326 {
327 if (hmmfp->f != NULL) fclose(hmmfp->f);
328 free(hmmfp);
329 }
330 void
HMMFileRewind(HMMFILE * hmmfp)331 HMMFileRewind(HMMFILE *hmmfp)
332 {
333 rewind(hmmfp->f);
334 }
335
336 long
HMMFtell(HMMFILE * hmmfp)337 HMMFtell(HMMFILE *hmmfp)
338 {
339 return ftell(hmmfp->f);
340 }
341
342 int
HMMFseek(HMMFILE * hmmfp,long pos)343 HMMFseek(HMMFILE * hmmfp,long pos)
344 {
345 return fseek(hmmfp->f,pos,SEEK_SET);
346 }
347
348
349 /*****************************************************************
350 * HMM output API:
351 * WriteAscHMM()
352 * WriteBinHMM()
353 *
354 *****************************************************************/
355
356 /* Function: WriteAscHMM()
357 *
358 * Purpose: Save an HMM in flat text ASCII format.
359 *
360 * Args: fp - open file for writing
361 * hmm - HMM to save
362 */
363 void
WriteAscHMM(FILE * fp,struct plan7_s * hmm)364 WriteAscHMM(FILE *fp, struct plan7_s *hmm)
365 {
366 int k; /* counter for nodes */
367 int x; /* counter for symbols */
368 int ts; /* counter for state transitions */
369
370 fprintf(fp, "HMMER2.0\n"); /* magic header */
371
372 /* write header information
373 */
374 fprintf(fp, "NAME %s\n", hmm->name);
375 fprintf(fp, "DESC %s\n",
376 (hmm->flags & PLAN7_DESC) ? hmm->desc : "");
377 fprintf(fp, "LENG %d\n", hmm->M);
378 fprintf(fp, "ALPH %s\n",
379 (Alphabet_type == hmmAMINO) ? "Amino":"Nucleic");
380 fprintf(fp, "RF %s\n",
381 (hmm->flags & PLAN7_RF) ? "yes" : "no");
382 fprintf(fp, "CS %s\n",
383 (hmm->flags & PLAN7_CS) ? "yes" : "no");
384 multiline(fp, "COM ", hmm->comlog);
385 fprintf(fp, "NSEQ %d\n", hmm->nseq);
386 fprintf(fp, "DATE %s\n", hmm->ctime);
387
388 /* Specials
389 */
390 fputs("XT ", fp);
391 for (k = 0; k < 4; k++)
392 for (x = 0; x < 2; x++)
393 fprintf(fp, "%6s ", prob2ascii(hmm->xt[k][x], 1.0));
394 fputs("\n", fp);
395
396 /* Save the null model first, so HMM readers can decode
397 * log odds scores on the fly. Save as log odds probabilities
398 * relative to 1/Alphabet_size (flat distribution)
399 */
400 fprintf(fp, "NULT ");
401 fprintf(fp, "%6s ", prob2ascii(hmm->p1, 1.0)); /* p1 */
402 fprintf(fp, "%6s\n", prob2ascii(1.0-hmm->p1, 1.0)); /* p2 */
403 fputs("NULE ", fp);
404 for (x = 0; x < Alphabet_size; x++)
405 fprintf(fp, "%6s ", prob2ascii(hmm->null[x], 1/(float)(Alphabet_size)));
406 fputs("\n", fp);
407
408 /* EVD statistics
409 */
410 if (hmm->flags & PLAN7_STATS)
411 fprintf(fp, "EVD %10f %10f\n", hmm->mu, hmm->lambda);
412
413 /* Print header
414 */
415 fprintf(fp, "HMM ");
416 for (x = 0; x < Alphabet_size; x++) fprintf(fp, " %c ", Alphabet[x]);
417 fprintf(fp, "\n");
418 fprintf(fp, " %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
419 "m->m", "m->i", "m->d", "i->m", "i->i", "d->m", "d->d", "b->m", "m->e");
420
421 /* Print HMM parameters (main section of the save file)
422 */
423 fprintf(fp, " %6s %6s ", prob2ascii(1-hmm->tbd1, 1.0), "*");
424 fprintf(fp, "%6s\n", prob2ascii(hmm->tbd1, 1.0));
425 for (k = 1; k <= hmm->M; k++)
426 {
427 /* Line 1: k and match emissions */
428 fprintf(fp, " %5d ", k);
429 for (x = 0; x < Alphabet_size; x++)
430 fprintf(fp, "%6s ", prob2ascii(hmm->mat[k][x], hmm->null[x]));
431 fputs("\n", fp);
432
433 /* Line 2: RF and insert emissions */
434 fprintf(fp, " %5c ", hmm->flags & PLAN7_RF ? hmm->rf[k] : '-');
435 for (x = 0; x < Alphabet_size; x++)
436 fprintf(fp, "%6s ", (k < hmm->M) ? prob2ascii(hmm->ins[k][x], hmm->null[x]) : "*");
437 fputs("\n", fp);
438 /* Line 3: CS and transition probs */
439 fprintf(fp, " %5c ", hmm->flags & PLAN7_CS ? hmm->cs[k] : '-');
440 for (ts = 0; ts < 7; ts++)
441 fprintf(fp, "%6s ", (k < hmm->M) ? prob2ascii(hmm->t[k][ts], 1.0) : "*");
442 fprintf(fp, "%6s ", prob2ascii(hmm->begin[k], 1.0));
443 fprintf(fp, "%6s ", prob2ascii(hmm->end[k], 1.0));
444
445 fputs("\n", fp);
446 }
447 fputs("//\n", fp);
448 }
449
450 /* Function: WriteBinHMM()
451 *
452 * Purpose: Write an HMM in binary format.
453 */
454 void
WriteBinHMM(FILE * fp,struct plan7_s * hmm)455 WriteBinHMM(FILE *fp, struct plan7_s *hmm)
456 {
457 int k;
458
459 /* ye olde magic number */
460 fwrite((char *) &(v20magic), sizeof(long), 1, fp);
461
462 /* header section
463 */
464 fwrite((char *) &(hmm->flags), sizeof(int), 1, fp);
465 write_bin_string(fp, hmm->name);
466 if (hmm->flags & PLAN7_DESC) write_bin_string(fp, hmm->desc);
467 fwrite((char *) &(hmm->M), sizeof(int), 1, fp);
468 fwrite((char *) &(Alphabet_type), sizeof(int), 1, fp);
469 if (hmm->flags & PLAN7_RF) fwrite((char *) hmm->rf, sizeof(char), hmm->M+1, fp);
470 if (hmm->flags & PLAN7_CS) fwrite((char *) hmm->cs, sizeof(char), hmm->M+1, fp);
471 write_bin_string(fp, hmm->comlog);
472 fwrite((char *) &(hmm->nseq), sizeof(int), 1, fp);
473 write_bin_string(fp, hmm->ctime);
474
475 /* Specials */
476 for (k = 0; k < 4; k++)
477 fwrite((char *) hmm->xt[k], sizeof(float), 2, fp);
478
479 /* Null model */
480 fwrite((char *)&(hmm->p1), sizeof(float), 1, fp);
481 fwrite((char *) hmm->null, sizeof(float), Alphabet_size, fp);
482
483 /* EVD stats */
484 if (hmm->flags & PLAN7_STATS) {
485 fwrite((char *) &(hmm->mu), sizeof(float), 1, fp);
486 fwrite((char *) &(hmm->lambda), sizeof(float), 1, fp);
487 }
488
489 /* entry/exit probabilities
490 */
491 fwrite((char *)&(hmm->tbd1),sizeof(float), 1, fp);
492 fwrite((char *) hmm->begin, sizeof(float), hmm->M+1, fp);
493 fwrite((char *) hmm->end, sizeof(float), hmm->M+1, fp);
494
495 /* main model
496 */
497 for (k = 1; k <= hmm->M; k++)
498 fwrite((char *) hmm->mat[k], sizeof(float), Alphabet_size, fp);
499 for (k = 1; k < hmm->M; k++)
500 fwrite((char *) hmm->ins[k], sizeof(float), Alphabet_size, fp);
501 for (k = 1; k < hmm->M; k++)
502 fwrite((char *) hmm->t[k], sizeof(float), 7, fp);
503 }
504
505
506 /*****************************************************************
507 *
508 * Internal: HMM file parsers for various releases of HMMER.
509 *
510 * read_{asc,bin}xxhmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
511 *
512 * Upon return, *ret_hmm is an allocated Plan7 HMM.
513 * Return 0 if no more HMMs in the file (normal).
514 * Return 1 and *ret_hmm = something if we got an HMM (normal)
515 * Return 1 if an error occurs (meaning "I tried to
516 * read something...") and *ret_hmm == NULL (meaning
517 * "...but it wasn't an HMM"). I know, this is a funny
518 * way to handle errors.
519 *
520 *****************************************************************/
521
522 static int
read_asc20hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)523 read_asc20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
524 {
525 struct plan7_s *hmm;
526 char buffer[512];
527 char *s;
528 int M;
529 float p;
530 int k, x;
531
532 hmm = NULL;
533 if (feof(hmmfp->f) || fgets(buffer, 512, hmmfp->f) == NULL) return 0;
534 if (strncmp(buffer, "HMMER2.0", 8) != 0) goto FAILURE;
535
536 /* Get the header information: tag/value pairs in any order,
537 * ignore unknown tags, stop when "HMM" is reached (signaling
538 * start of main model)
539 */
540 hmm = AllocPlan7Shell();
541 M = -1;
542 while (fgets(buffer, 512, hmmfp->f) != NULL) {
543 if (strncmp(buffer, "NAME ", 5) == 0) Plan7SetName(hmm, buffer+6);
544 else if (strncmp(buffer, "DESC ", 5) == 0) Plan7SetDescription(hmm, buffer+6);
545 else if (strncmp(buffer, "LENG ", 5) == 0) M = atoi(buffer+6);
546 else if (strncmp(buffer, "NSEQ ", 5) == 0) hmm->nseq = atoi(buffer+6);
547 else if (strncmp(buffer, "ALPH ", 5) == 0)
548 { /* Alphabet type */
549 s2upper(buffer+6);
550 if (strncmp(buffer+6, "AMINO", 5) == 0) SetAlphabet(hmmAMINO);
551 else if (strncmp(buffer+6, "NUCLEIC", 7) == 0) SetAlphabet(hmmNUCLEIC);
552 else goto FAILURE;
553 }
554 else if (strncmp(buffer, "RF ", 5) == 0)
555 { /* Reference annotation present? */
556 if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_RF;
557 }
558 else if (strncmp(buffer, "CS ", 5) == 0)
559 { /* Consensus annotation present? */
560 if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_CS;
561 }
562 else if (strncmp(buffer, "COM ", 5) == 0)
563 { /* Command line log */
564 StringChop(buffer+6);
565 if (hmm->comlog == NULL)
566 hmm->comlog = Strdup(buffer+6);
567 else
568 {
569 hmm->comlog = ReallocOrDie(hmm->comlog, sizeof(char *) *
570 (strlen(hmm->comlog) + 1 + strlen(buffer+6)));
571 strcat(hmm->comlog, "\n");
572 strcat(hmm->comlog, buffer+6);
573 }
574 }
575 else if (strncmp(buffer, "DATE ", 5) == 0)
576 { /* Date file created */
577 StringChop(buffer+6);
578 hmm->ctime= Strdup(buffer+6);
579 }
580 else if (strncmp(buffer, "XT ", 5) == 0)
581 { /* Special transition section */
582 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
583 for (k = 0; k < 4; k++)
584 for (x = 0; x < 2; x++)
585 {
586 if (s == NULL) goto FAILURE;
587 hmm->xt[k][x] = ascii2prob(s, 1.0);
588 s = strtok(NULL, " \t\n");
589 }
590 }
591 else if (strncmp(buffer, "NULT ", 5) == 0)
592 { /* Null model transitions */
593 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
594 hmm->p1 = ascii2prob(s, 1.);
595 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
596 hmm->p1 = hmm->p1 / (hmm->p1 + ascii2prob(s, 1.0));
597 }
598 else if (strncmp(buffer, "NULE ", 5) == 0)
599 { /* Null model emissions */
600 if (Alphabet_type == hmmNOTSETYET)
601 Die("ALPH must precede NULE in HMM save files");
602 s = strtok(buffer+6, " \t\n");
603 for (x = 0; x < Alphabet_size; x++) {
604 if (s == NULL) goto FAILURE;
605 hmm->null[x] = ascii2prob(s, 1./(float)Alphabet_size);
606 s = strtok(NULL, " \t\n");
607 }
608 }
609 else if (strncmp(buffer, "EVD ", 5) == 0)
610 { /* EVD parameters */
611 hmm->flags |= PLAN7_STATS;
612 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
613 hmm->mu = atof(s);
614 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
615 hmm->lambda = atof(s);
616 }
617 else if (strncmp(buffer, "HMM ", 5) == 0) break;
618 }
619
620 /* partial check for mandatory fields */
621 if (feof(hmmfp->f)) goto FAILURE;
622 if (M < 1) goto FAILURE;
623 if (hmm->name == NULL) goto FAILURE;
624 if (Alphabet_type == hmmNOTSETYET) goto FAILURE;
625
626 /* Main model section. Read as integer log odds, convert
627 * to probabilities
628 */
629 AllocPlan7Body(hmm, M);
630 /* skip an annotation line */
631 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
632 /* parse tbd1 line */
633 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
634 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
635 p = ascii2prob(s, 1.0);
636 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
637 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
638 hmm->tbd1 = ascii2prob(s, 1.0);
639 hmm->tbd1 = hmm->tbd1 / (p + hmm->tbd1);
640
641 /* main model */
642 for (k = 1; k <= hmm->M; k++) {
643 /* Line 1: k and match emissions */
644 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
645 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
646 if (atoi(s) != k) goto FAILURE;
647 for (x = 0; x < Alphabet_size; x++) {
648 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
649 hmm->mat[k][x] = ascii2prob(s, hmm->null[x]);
650 }
651 /* Line 2: RF and insert emissions */
652 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
653 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
654 if (hmm->flags & PLAN7_RF) hmm->rf[k] = *s;
655 if (k < hmm->M) {
656 for (x = 0; x < Alphabet_size; x++) {
657 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
658 hmm->ins[k][x] = ascii2prob(s, hmm->null[x]);
659 }
660 }
661 /* Line 3: CS and transitions */
662 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
663 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
664 if (hmm->flags & PLAN7_CS) hmm->cs[k] = *s;
665 for (x = 0; x < 7; x++) {
666 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
667 if (k < hmm->M) hmm->t[k][x] = ascii2prob(s, 1.0);
668 }
669 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
670 hmm->begin[k] = ascii2prob(s, 1.0);
671 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
672 hmm->end[k] = ascii2prob(s, 1.0);
673
674 } /* end loop over main model */
675
676 /* Advance to record separator
677 */
678 while (fgets(buffer, 512, hmmfp->f) != NULL)
679 if (strncmp(buffer, "//", 2) == 0) break;
680
681 /* Set flags and return
682 */
683 hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */
684 hmm->flags &= ~PLAN7_HASBITS; /* scores are not valid */
685 *ret_hmm = hmm;
686 return 1;
687
688 FAILURE:
689 if (hmm != NULL) FreePlan7(hmm);
690 *ret_hmm = NULL;
691 return 1;
692 }
693
694
695 static int
read_bin20hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)696 read_bin20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
697 {
698 struct plan7_s *hmm;
699 int k,x;
700 int type;
701 unsigned long magic;
702
703 hmm = NULL;
704
705 /* Header section
706 */
707 if (feof(hmmfp->f)) return 0;
708 if (! fread((char *) &magic, sizeof(long), 1, hmmfp->f)) return 0;
709 if (hmmfp->byteswap) byteswap((char *)&magic, sizeof(long));
710 if (magic != v20magic) goto FAILURE;
711 /* allocate HMM shell for header info */
712 hmm = AllocPlan7Shell();
713 /* flags */
714 if (! fread((char *) &(hmm->flags), sizeof(int), 1, hmmfp->f)) goto FAILURE;
715 if (hmmfp->byteswap) byteswap((char *)&(hmm->flags), sizeof(int));
716 /* name */
717 if (! read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->name))) goto FAILURE;
718 /* optional description */
719 if ((hmm->flags & PLAN7_DESC) &&
720 ! read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->desc))) goto FAILURE;
721 /* length of model */
722 if (! fread((char *) &hmm->M, sizeof(int), 1, hmmfp->f)) goto FAILURE;
723 if (hmmfp->byteswap) byteswap((char *)&(hmm->M), sizeof(int));
724 /* alphabet type */
725 if (! fread((char *) &type, sizeof(int), 1, hmmfp->f)) goto FAILURE;
726 if (hmmfp->byteswap) byteswap((char *)&type, sizeof(int));
727 if (Alphabet_type == 0) SetAlphabet(type);
728
729 /* now allocate for rest of model */
730 AllocPlan7Body(hmm, hmm->M);
731
732 /* optional #=RF alignment annotation */
733 if ((hmm->flags & PLAN7_RF) &&
734 !fread((char *) hmm->rf, sizeof(char), hmm->M+1, hmmfp->f)) goto FAILURE;
735 hmm->rf[hmm->M+1] = '\0';
736 /* optional #=CS alignment annotation */
737 if ((hmm->flags & PLAN7_CS) &&
738 !fread((char *) hmm->cs, sizeof(char), hmm->M+1, hmmfp->f)) goto FAILURE;
739 hmm->cs[hmm->M+1] = '\0';
740 /* command line log */
741 if (!read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->comlog))) goto FAILURE;
742 /* nseq */
743 if (!fread((char *) &(hmm->nseq),sizeof(int), 1, hmmfp->f)) goto FAILURE;
744 /* creation time */
745 if (!read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->ctime))) goto FAILURE;
746
747 /* specials */
748 for (k = 0; k < 4; k++)
749 if (! fread((char *) hmm->xt[k], sizeof(float), 2, hmmfp->f)) goto FAILURE;
750
751 /* null model */
752 if (!fread((char *) &(hmm->p1),sizeof(float), 1, hmmfp->f)) goto FAILURE;
753 if (!fread((char *)hmm->null,sizeof(float),Alphabet_size,hmmfp->f))goto FAILURE;
754
755 /* EVD stats */
756 if (hmm->flags & PLAN7_STATS) {
757 if (! fread((char *) &(hmm->mu), sizeof(float), 1, hmmfp->f))goto FAILURE;
758 if (! fread((char *) &(hmm->lambda), sizeof(float), 1, hmmfp->f))goto FAILURE;
759
760 if (hmmfp->byteswap) {
761 byteswap((char *)&(hmm->mu), sizeof(float));
762 byteswap((char *)&(hmm->lambda), sizeof(float));
763 }
764 }
765
766 /* entry/exit probabilities
767 */
768 if (! fread((char *)&(hmm->tbd1), sizeof(float), 1, hmmfp->f)) goto FAILURE;
769 if (! fread((char *) hmm->begin, sizeof(float), hmm->M+1, hmmfp->f)) goto FAILURE;
770 if (! fread((char *) hmm->end, sizeof(float), hmm->M+1, hmmfp->f)) goto FAILURE;
771
772 /* main model */
773 for (k = 1; k <= hmm->M; k++)
774 if (! fread((char *) hmm->mat[k], sizeof(float), Alphabet_size, hmmfp->f)) goto FAILURE;
775 for (k = 1; k < hmm->M; k++)
776 if (! fread((char *) hmm->ins[k], sizeof(float), Alphabet_size, hmmfp->f)) goto FAILURE;
777 for (k = 1; k < hmm->M; k++)
778 if (! fread((char *) hmm->t[k], sizeof(float), 7, hmmfp->f)) goto FAILURE;
779
780 /* byteswapping
781 */
782 if (hmmfp->byteswap) {
783 for (x = 0; x < Alphabet_size; x++)
784 byteswap((char *) &(hmm->null[x]), sizeof(float));
785 byteswap((char *)&(hmm->p1), sizeof(float));
786 byteswap((char *)&(hmm->tbd1), sizeof(float));
787
788 for (k = 1; k <= hmm->M; k++)
789 {
790 for (x = 0; x < Alphabet_size; x++)
791 byteswap((char *)&(hmm->mat[k][x]), sizeof(float));
792 if (k < hmm->M)
793 for (x = 0; x < Alphabet_size; x++)
794 byteswap((char *)&(hmm->ins[k][x]), sizeof(float));
795 byteswap((char *)&(hmm->begin[k]), sizeof(float));
796 byteswap((char *)&(hmm->end[k]), sizeof(float));
797 for (x = 0; x < 7; x++)
798 byteswap((char *)&(hmm->t[k][x]), sizeof(float));
799 }
800 }
801
802
803 /* set flags and return
804 */
805 hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */
806 hmm->flags &= ~PLAN7_HASBITS; /* scores are not yet valid */
807 *ret_hmm = hmm;
808 return 1;
809
810 FAILURE:
811 if (hmm != NULL) FreePlan7(hmm);
812 *ret_hmm = NULL;
813 return 1;
814 }
815
816
817
818
819
820 /* Function: read_asc19hmm()
821 * Date: Tue Apr 7 17:11:29 1998 [StL]
822 *
823 * Purpose: Read ASCII-format tabular (1.9 and later) save files.
824 *
825 * HMMER 1.9 was only used internally at WashU, as far as
826 * I know, so this code shouldn't be terribly important
827 * to anyone.
828 */
829 static int
read_asc19hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)830 read_asc19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
831 {
832 struct plan7_s *hmm;
833 FILE *fp;
834 char buffer[512];
835 char *s;
836 int M; /* length of model */
837 int k; /* state number */
838 int x; /* symbol number */
839
840 hmm = NULL;
841 fp = hmmfp->f;
842 if (feof(fp) || fgets(buffer, 512, fp) == NULL) return 0;
843 if (strncmp(buffer, "HMMER v1.9", 10) != 0) goto FAILURE;
844
845 hmm = AllocPlan7Shell();
846 /* read M from first line */
847 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; M = atoi(s); /* model length */
848 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* ignore alphabet size */
849 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; Plan7SetName(hmm, s); /* name */
850 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; s2upper(s); /* alphabet type */
851 if (strcmp(s, "AMINO") == 0) SetAlphabet(hmmAMINO);
852 else if (strcmp(s, "NUCLEIC") == 0) SetAlphabet(hmmNUCLEIC);
853 else goto FAILURE;
854 /* read alphabet, make sure it's Plan7-compatible... */
855 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
856 if (strncmp(s, Alphabet, Alphabet_size) != 0) goto FAILURE;
857
858 /* whether we have ref, cs info */
859 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
860 if (strcmp(s, "yes") == 0) hmm->flags |= PLAN7_RF;
861 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
862 if (strcmp(s, "yes") == 0) hmm->flags |= PLAN7_CS;
863
864 /* null model. 1.9 has emissions only. invent transitions. */
865 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
866 if (strcmp(s, "null") != 0) goto FAILURE;
867 for (x = 0; x < Alphabet_size; x++) {
868 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
869 hmm->null[x] = ascii2prob(s, 1.0);
870 }
871 hmm->p1 = (Alphabet_type == hmmAMINO)? 350./351. : 1000./1001.;
872
873 /* Done with header; check some stuff before proceeding
874 */
875 if (feof(hmmfp->f)) goto FAILURE;
876 if (M < 1) goto FAILURE;
877 if (hmm->name == NULL) goto FAILURE;
878 if (Alphabet_type == hmmNOTSETYET) goto FAILURE;
879
880 /* Allocate the model. Set up the probabilities that Plan9
881 * doesn't set.
882 */
883 AllocPlan7Body(hmm, M);
884 ZeroPlan7(hmm);
885 Plan7LSConfig(hmm);
886
887 /* The zero row has: 4 or 20 unused scores for nonexistent M0 state
888 * then: B->M, tbd1, a B->I that Plan7 doesn't have;
889 * three unused D-> transitions; then three I0 transitions that Plan7 doesn't have;
890 * then two unused rf, cs annotations.
891 */
892 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* position index ignored */
893 for (x = 0; x < Alphabet_size; x++)
894 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* emissions ignored */
895 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
896 hmm->begin[1] = ascii2prob(s, 1.0);
897 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
898 hmm->tbd1 = ascii2prob(s, 1.0);
899 /* renormalize */
900 hmm->begin[1] = hmm->begin[1] / (hmm->begin[1] + hmm->tbd1);
901 hmm->tbd1 = hmm->tbd1 / (hmm->begin[1] + hmm->tbd1);
902 /* skip rest of line, seven integer fields, two char fields */
903 for (x = 0; x < 7; x++)
904 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
905 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
906 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
907
908 /* main model: table of emissions, transitions, annotation */
909 for (k = 1; k <= hmm->M; k++)
910 {
911 /* position index ignored */
912 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
913 /* match emissions */
914 for (x = 0; x < Alphabet_size; x++) {
915 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
916 hmm->mat[k][x] = ascii2prob(s, hmm->null[x]);
917 }
918 /* nine transitions; two are ignored */
919 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
920 if (k < hmm->M) hmm->t[k][TMM] = ascii2prob(s, 1.0);
921 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
922 if (k < hmm->M) hmm->t[k][TMD] = (k == hmm->M) ? 0.0 : ascii2prob(s, 1.0);
923 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
924 if (k < hmm->M) hmm->t[k][TMI] = ascii2prob(s, 1.0);
925
926 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
927 if (k < hmm->M) hmm->t[k][TDM] = ascii2prob(s, 1.0);
928 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
929 if (k < hmm->M) hmm->t[k][TDD] = (k == hmm->M) ? 0.0 : ascii2prob(s, 1.0);
930 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;/* TDI ignored. */
931
932 /* no insert state at k == M, be careful */
933 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
934 if (k < hmm->M) hmm->t[k][TIM] = ascii2prob(s, 1.0);
935 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* TID ignored. */
936 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
937 if (k < hmm->M) hmm->t[k][TII] = ascii2prob(s, 1.0);
938
939 /* annotations */
940 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
941 if (hmm->flags & PLAN7_RF) hmm->rf[k] = *s;
942 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
943 if (hmm->flags & PLAN7_CS) hmm->cs[k] = *s;
944 }
945 /* table of insert emissions;
946 * Plan7 has no insert state at 0 or M */
947 for (k = 0; k <= hmm->M; k++)
948 {
949 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* position index ignored */
950 for (x = 0; x < Alphabet_size; x++) {
951 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
952 if (k > 0 && k < hmm->M)
953 hmm->ins[k][x] = ascii2prob(s, hmm->null[x]);
954 }
955 }
956
957 /* Set flags and return
958 */
959 hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */
960 hmm->flags &= ~PLAN7_HASBITS; /* scores are not valid */
961 Plan7Renormalize(hmm);
962 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
963 Plan7SetCtime(hmm);
964 *ret_hmm = hmm;
965 return 1;
966
967 FAILURE:
968 if (hmm != NULL) FreePlan7(hmm);
969 *ret_hmm = NULL;
970 return 1;
971 }
972
973 static int
read_bin19hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)974 read_bin19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
975 {
976 unsigned int magic;
977 struct plan7_s *hmm; /* plan7 HMM */
978 struct plan9_s *p9hmm; /* old style 1.x HMM */
979
980 /* Read the magic number; if we don't see it, then we
981 * must be out of data in the file.
982 */
983 if (feof(hmmfp->f)) return 0;
984 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
985
986 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_9B, hmmfp->byteswap);
987 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
988
989 Plan9toPlan7(p9hmm, &hmm);
990
991 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
992 Plan7SetCtime(hmm);
993
994 P9FreeHMM(p9hmm);
995 *ret_hmm = hmm;
996 return 1;
997 }
998 static int
read_asc17hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)999 read_asc17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1000 {
1001 struct plan7_s *hmm; /* plan7 HMM */
1002 struct plan9_s *p9hmm; /* old style 1.x HMM */
1003 char buffer[512];
1004
1005 /* Read the magic header; if we don't see it, then
1006 * we must be out of data in the file.
1007 */
1008 if (feof(hmmfp->f) || fgets(buffer, 512, hmmfp->f) == NULL) return 0;
1009
1010 p9hmm = read_plan9_aschmm(hmmfp->f, HMMER1_7F);
1011 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1012
1013 Plan9toPlan7(p9hmm, &hmm);
1014
1015 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1016 Plan7SetCtime(hmm);
1017
1018 P9FreeHMM(p9hmm);
1019 *ret_hmm = hmm;
1020 return 1;
1021 }
1022
1023 static int
read_bin17hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)1024 read_bin17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1025 {
1026 unsigned int magic;
1027 struct plan7_s *hmm; /* plan7 HMM */
1028 struct plan9_s *p9hmm; /* old style 1.x HMM */
1029
1030 /* Read the magic number; if we don't see it, then we
1031 * must be out of data in the file.
1032 */
1033 if (feof(hmmfp->f)) return 0;
1034 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
1035
1036 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_7B, hmmfp->byteswap);
1037 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1038
1039 Plan9toPlan7(p9hmm, &hmm);
1040
1041 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1042 Plan7SetCtime(hmm);
1043
1044 P9FreeHMM(p9hmm);
1045 *ret_hmm = hmm;
1046 return 1;
1047 }
1048
1049 static int
read_asc11hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)1050 read_asc11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1051 {
1052 Die("1.1 ASCII HMMs unsupported");
1053 return 1;
1054 }
1055 static int
read_bin11hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)1056 read_bin11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1057 {
1058 unsigned int magic;
1059 struct plan7_s *hmm; /* plan7 HMM */
1060 struct plan9_s *p9hmm; /* old style 1.x HMM */
1061
1062 /* Read the magic number; if we don't see it, then we
1063 * must be out of data in the file.
1064 */
1065 if (feof(hmmfp->f)) return 0;
1066 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
1067
1068 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_1B, hmmfp->byteswap);
1069 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1070
1071 Plan9toPlan7(p9hmm, &hmm);
1072
1073 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1074 Plan7SetCtime(hmm);
1075
1076 P9FreeHMM(p9hmm);
1077 *ret_hmm = hmm;
1078 return 1;
1079 }
1080
1081 static int
read_asc10hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)1082 read_asc10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1083 {
1084 Die("1.0 ASCII HMMs unsupported");
1085 return 1;
1086 }
1087
1088 static int
read_bin10hmm(HMMFILE * hmmfp,struct plan7_s ** ret_hmm)1089 read_bin10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1090 {
1091 unsigned int magic;
1092 struct plan7_s *hmm; /* plan7 HMM */
1093 struct plan9_s *p9hmm; /* old style 1.x HMM */
1094
1095 /* Read the magic number; if we don't see it, then we
1096 * must be out of data in the file.
1097 */
1098 if (feof(hmmfp->f)) return 0;
1099 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
1100
1101 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_0B, hmmfp->byteswap);
1102 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1103
1104 Plan9toPlan7(p9hmm, &hmm);
1105
1106 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1107 Plan7SetCtime(hmm);
1108
1109 P9FreeHMM(p9hmm);
1110 *ret_hmm = hmm;
1111 return 1;
1112 }
1113
1114 /*****************************************************************
1115 * Some miscellaneous utility functions
1116 *****************************************************************/
1117
1118 /* Function: prob2ascii()
1119 *
1120 * Purpose: Format a probability for output to an ASCII save
1121 * file. Returns a ptr to a static internal buffer.
1122 *
1123 */
1124 static char *
prob2ascii(float p,float null)1125 prob2ascii(float p, float null)
1126 {
1127 static char buffer[8];
1128
1129 if (p == 0.0) return "*";
1130 sprintf(buffer, "%6d", Prob2Score(p, null));
1131 return buffer;
1132 }
1133
1134
1135 /* Function: ascii2prob()
1136 *
1137 * Purpose: Convert a saved string back to a probability.
1138 */
1139 static float
ascii2prob(char * s,float null)1140 ascii2prob(char *s, float null)
1141 {
1142 return (*s == '*') ? 0. : Score2Prob(atoi(s), null);
1143 }
1144
1145 /* Function: byteswap()
1146 *
1147 * Purpose: Swap between big-endian and little-endian.
1148 * For example:
1149 * int foo = 0x12345678;
1150 * byteswap((char *) &foo, sizeof(int));
1151 * printf("%x\n", foo)
1152 * gives 78563412.
1153 *
1154 * I don't fully understand byte-swapping issues.
1155 * However, I have tested this on chars through floats,
1156 * on various machines:
1157 * SGI IRIX 4.0.5, SunOS 4.1.3, DEC Alpha OSF/1, Alliant
1158 *
1159 * Note: this is only a partial solution to the problem of
1160 * binary file portability. 32 bit integers are assumed by HMMER,
1161 * for instance. This should be true for all UNIX, VAX, and WinNT
1162 * platforms, I believe.
1163 *
1164 * Date: Sun Feb 12 10:26:22 1995
1165 */
1166 static void
byteswap(char * swap,int nbytes)1167 byteswap(char *swap, int nbytes)
1168 {
1169 int x;
1170 char byte;
1171
1172 for (x = 0; x < nbytes / 2; x++)
1173 {
1174 byte = swap[nbytes - x - 1];
1175 swap[nbytes - x - 1] = swap[x];
1176 swap[x] = byte;
1177 }
1178 }
1179
1180 /* Function: write_bin_string()
1181 * Date: SRE, Wed Oct 29 13:49:27 1997 [TWA 721 over Canada]
1182 *
1183 * Purpose: Write a string in binary save format: an integer
1184 * for the string length (including \0), followed by
1185 * the string.
1186 */
1187 static void
write_bin_string(FILE * fp,char * s)1188 write_bin_string(FILE *fp, char *s)
1189 {
1190 int len;
1191 if (s != NULL)
1192 {
1193 len = strlen(s) + 1;
1194 fwrite((char *) &len, sizeof(int), 1, fp);
1195 fwrite((char *) s, sizeof(char), len, fp);
1196 }
1197 else
1198 {
1199 len = 0;
1200 fwrite((char *) &len, sizeof(int), 1, fp);
1201 }
1202 }
1203
1204 /* Function: read_bin_string()
1205 * Date: SRE, Wed Oct 29 14:03:23 1997 [TWA 721]
1206 *
1207 * Purpose: Read in a string from a binary file, where
1208 * the first integer is the length (including '\0').
1209 *
1210 * Args: fp - FILE to read from
1211 * doswap - TRUE to byteswap
1212 * ret_s - string to read into
1213 *
1214 * Return: 0 on failure. ret_s is malloc'ed here.
1215 */
1216 static int
read_bin_string(FILE * fp,int doswap,char ** ret_s)1217 read_bin_string(FILE *fp, int doswap, char **ret_s)
1218 {
1219 char *s;
1220 int len;
1221
1222 if (! fread((char *) &len, sizeof(int), 1, fp)) return 0;
1223 if (doswap) byteswap((char *)&len, sizeof(int));
1224 s = MallocOrDie (sizeof(char) * (len));
1225 if (! fread((char *) s, sizeof(char), len, fp))
1226 {
1227 free(s);
1228 return 0;
1229 }
1230
1231 *ret_s = s;
1232 return 1;
1233 }
1234
1235 /* Function: multiline()
1236 * Date: Mon Jan 5 14:57:50 1998 [StL]
1237 *
1238 * Purpose: Given a record (like the comlog) that contains
1239 * multiple lines, print it as multiple lines with
1240 * a given prefix. e.g.:
1241 *
1242 * given: "COM ", "foo\nbar\nbaz"
1243 * print: COM foo
1244 * COM bar
1245 * COM baz
1246 *
1247 *
1248 * Used to print the command log to ASCII save files.
1249 *
1250 * Args: fp: FILE to print to
1251 * pfx: prefix for each line
1252 * s: line to break up and print; tolerates a NULL
1253 *
1254 * Return: (void)
1255 */
1256 static void
multiline(FILE * fp,char * pfx,char * s)1257 multiline(FILE *fp, char *pfx, char *s)
1258 {
1259 char *buf;
1260 char *sptr;
1261
1262 if (s == NULL) return;
1263 buf = Strdup(s);
1264 sptr = strtok(buf, "\n");
1265 while (sptr != NULL)
1266 {
1267 fprintf(fp, "%s%s\n", pfx, sptr);
1268 sptr = strtok(NULL, "\n");
1269 }
1270 free(buf);
1271 }
1272
1273
1274 /*****************************************************************
1275 * HMMER 1.x save file reading functions, modified from the
1276 * corpse of 1.9m.
1277 *****************************************************************/
1278
1279
1280 /* Function: read_plan9_binhmm()
1281 *
1282 * Read old (Plan9) binary HMM save files from HMMER 1.9 and earlier.
1283 * V1.0 saved regularizer and sympvec info, which V1.1 ignores.
1284 * V1.7 and later may include optional ref, cs annotation lines.
1285 * V1.9 added name, null model.
1286 *
1287 * Returns pointer to the HMM on success; NULL
1288 * on failure. Sets global alphabet information based on
1289 * whether it reads 4 or 20 as alphabet size (don't rely
1290 * on ancient HMMER macro definitions).
1291 */
1292 static struct plan9_s *
read_plan9_binhmm(FILE * fp,int version,int swapped)1293 read_plan9_binhmm(FILE *fp, int version, int swapped)
1294 {
1295 struct plan9_s *hmm;
1296 int M; /* length of model */
1297 int k; /* state number */
1298 int x; /* symbol or transition number */
1299 int len; /* length of variable length string */
1300 int asize; /* alphabet size */
1301 int atype; /* alphabet type (read but ignored) */
1302 char abet[20]; /* alphabet (read but ignored) */
1303
1304 /* read M and alphabet size */
1305 if (! fread((char *) &(M), sizeof(int), 1, fp)) return NULL;
1306 if (! fread((char *) &asize, sizeof(int), 1, fp)) return NULL;
1307 if (swapped) {
1308 byteswap((char *) &M, sizeof(int));
1309 byteswap((char *) &asize, sizeof(int));
1310 }
1311
1312 /* Set global alphabet information
1313 */
1314 if (Alphabet_type == hmmNOTSETYET)
1315 {
1316 if (asize == 4) SetAlphabet(hmmNUCLEIC);
1317 else if (asize == 20) SetAlphabet(hmmAMINO);
1318 else
1319 Die("A nonbiological alphabet size of %d; so I can't convert plan9 to plan7", asize);
1320 }
1321 else
1322 {
1323 if (asize != Alphabet_size)
1324 Die("Plan9 model's alphabet size (%d) doesn't match previous setting (%d)", asize, Alphabet_size);
1325 }
1326
1327 /* now, create space for hmm */
1328 if ((hmm = P9AllocHMM(M)) == NULL)
1329 Die("malloc failed for reading hmm in\n");
1330
1331 /* version 1.9+ files have a name */
1332 if (version == HMMER1_9B) {
1333 if (! fread((char *) &len, sizeof(int), 1, fp)) return NULL;
1334 if (swapped) byteswap((char *) &len, sizeof(int));
1335 hmm->name = (char *) ReallocOrDie (hmm->name, sizeof(char) * (len+1));
1336 if (! fread((char *) hmm->name, sizeof(char), len, fp)) return NULL;
1337 hmm->name[len] = '\0';
1338 }
1339
1340 /* read alphabet_type and alphabet, but ignore: we've already set them */
1341 if (! fread((char *) &atype, sizeof(int), 1, fp)) return NULL;
1342 if (! fread((char *) abet, sizeof(char), Alphabet_size, fp)) return NULL;
1343
1344 /* skip the random symbol frequencies in V1.0 */
1345 if (version == HMMER1_0B)
1346 fseek(fp, (long) (sizeof(float) * Alphabet_size), SEEK_CUR);
1347
1348 /* Get optional info in V1.7 and later
1349 */
1350 if (version == HMMER1_7B || version == HMMER1_9B)
1351 {
1352 if (! fread((char *) &(hmm->flags), sizeof(int), 1, fp)) return NULL;
1353 if (swapped) byteswap((char *) &hmm->flags, sizeof(int));
1354 if ((hmm->flags & HMM_REF) &&
1355 ! fread((char *) hmm->ref, sizeof(char), hmm->M+1, fp)) return NULL;
1356 hmm->ref[hmm->M+1] = '\0';
1357 if ((hmm->flags & HMM_CS) &&
1358 ! fread((char *) hmm->cs, sizeof(char), hmm->M+1, fp)) return NULL;
1359 hmm->cs[hmm->M+1] = '\0';
1360 }
1361
1362 /* Get the null model in V1.9 and later
1363 */
1364 if (version == HMMER1_9B)
1365 {
1366 if (! fread((char *) hmm->null, sizeof(float), Alphabet_size, fp)) return NULL;
1367 if (swapped)
1368 for (x = 0; x < Alphabet_size; x++)
1369 byteswap((char *) &(hmm->null[x]), sizeof(float));
1370 }
1371 else P9DefaultNullModel(hmm->null);
1372
1373 /* everything else is states */
1374 for (k = 0; k <= hmm->M; k++)
1375 {
1376 /* get match state info */
1377 if (! fread((char *) &(hmm->mat[k].t[MATCH]), sizeof(float), 1, fp)) return NULL;
1378 if (! fread((char *) &(hmm->mat[k].t[DELETE]), sizeof(float), 1, fp)) return NULL;
1379 if (! fread((char *) &(hmm->mat[k].t[INSERT]), sizeof(float), 1, fp)) return NULL;
1380 if (! fread((char *) hmm->mat[k].p, sizeof(float), Alphabet_size, fp)) return NULL
1381 ;
1382 if (swapped) {
1383 byteswap((char *) &(hmm->mat[k].t[MATCH]), sizeof(float));
1384 byteswap((char *) &(hmm->mat[k].t[DELETE]), sizeof(float));
1385 byteswap((char *) &(hmm->mat[k].t[INSERT]), sizeof(float));
1386 for (x = 0; x < Alphabet_size; x++)
1387 byteswap((char *) &(hmm->mat[k].p[x]), sizeof(float));
1388 }
1389
1390 /* skip the regularizer info in V1.0 */
1391 if (version == HMMER1_0B)
1392 fseek(fp, (long)(sizeof(float) * (3 + Alphabet_size)), SEEK_CUR);
1393
1394 /* get delete state info */
1395 if (! fread((char *) &(hmm->del[k].t[MATCH]), sizeof(float), 1, fp)) return NULL;
1396 if (! fread((char *) &(hmm->del[k].t[DELETE]), sizeof(float), 1, fp)) return NULL;
1397 if (! fread((char *) &(hmm->del[k].t[INSERT]), sizeof(float), 1, fp)) return NULL;
1398 if (swapped) {
1399 byteswap((char *) &(hmm->del[k].t[MATCH]), sizeof(float));
1400 byteswap((char *) &(hmm->del[k].t[DELETE]), sizeof(float));
1401 byteswap((char *) &(hmm->del[k].t[INSERT]), sizeof(float));
1402 }
1403
1404 /* skip the regularizer info in V1.0 */
1405 if (version == HMMER1_0B)
1406 fseek(fp, (long)(sizeof(float) * 3), SEEK_CUR);
1407
1408 /* get insert state info */
1409 if (! fread((char *) &(hmm->ins[k].t[MATCH]), sizeof(float), 1, fp)) return NULL;
1410 if (! fread((char *) &(hmm->ins[k].t[DELETE]), sizeof(float), 1, fp)) return NULL;
1411 if (! fread((char *) &(hmm->ins[k].t[INSERT]), sizeof(float), 1, fp)) return NULL;
1412 if (! fread((char *) hmm->ins[k].p, sizeof(float), Alphabet_size, fp)) return NULL
1413 ;
1414 if (swapped) {
1415 byteswap((char *) &(hmm->ins[k].t[MATCH]), sizeof(float));
1416 byteswap((char *) &(hmm->ins[k].t[DELETE]), sizeof(float));
1417 byteswap((char *) &(hmm->ins[k].t[INSERT]), sizeof(float));
1418 for (x = 0; x < Alphabet_size; x++)
1419 byteswap((char *) &(hmm->ins[k].p[x]), sizeof(float));
1420 }
1421
1422 /* skip the regularizer info in V1.0 */
1423 if (version == HMMER1_0B)
1424 fseek(fp, (long)(sizeof(float) * (3 + Alphabet_size)), SEEK_CUR);
1425 }
1426 P9Renormalize(hmm);
1427 return hmm;
1428 }
1429
1430
1431 /* Function: read_plan9_aschmm()
1432 *
1433 * Purpose: Read ASCII-format save files from 1.8.4 and earlier.
1434 * V1.0 contained sympvec and regularizers; these are ignored
1435 * in V1.1 and later
1436 * V1.7 and later contain ref and cs annotation.
1437 *
1438 * Args: fp - open save file, header has been read already
1439 * version - HMMER1_7F, for instance
1440 *
1441 * Returns ptr to the (allocated) new HMM on success,
1442 * or NULL on failure.
1443 */
1444 static struct plan9_s *
read_plan9_aschmm(FILE * fp,int version)1445 read_plan9_aschmm(FILE *fp, int version)
1446 {
1447 struct plan9_s *hmm;
1448 int M; /* length of model */
1449 char buffer[512];
1450 char *statetype;
1451 char *s;
1452 int k; /* state number */
1453 int i; /* symbol number */
1454 int asize;
1455 /* read M from first line */
1456 if (fgets(buffer, 512, fp) == NULL) return NULL;
1457 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1458 if (!isdigit(*s)) return NULL;
1459 M = atoi(s);
1460 /* read alphabet_length */
1461 if (fgets(buffer, 512, fp) == NULL) return NULL;
1462 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1463 if (!isdigit(*s)) return NULL;
1464 asize = atoi(s);
1465
1466 /* Set global alphabet information
1467 */
1468 if (Alphabet_type == hmmNOTSETYET)
1469 {
1470 if (asize == 4) SetAlphabet(hmmNUCLEIC);
1471 else if (asize == 20) SetAlphabet(hmmAMINO);
1472 else
1473 Die("A nonbiological alphabet size of %d; so I can't convert plan9 to plan7", asize);
1474 }
1475 else
1476 {
1477 if (asize != Alphabet_size)
1478 Die("Plan9 model's alphabet size (%d) doesn't match previous setting (%d)", asize, Alphabet_size);
1479 }
1480 /* now, create space for hmm */
1481 if ((hmm = P9AllocHMM(M)) == NULL)
1482 Die("malloc failed for reading hmm in\n");
1483
1484 /* read alphabet_type but ignore */
1485 if (fgets(buffer, 512, fp) == NULL) return NULL;
1486 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1487 if (!isdigit(*s)) return NULL;
1488 /* read alphabet but ignore */
1489 if (fgets(buffer, 512, fp) == NULL) return NULL;
1490 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1491
1492 /* skip the random symbol frequencies in V1.0 files. now unused */
1493 if (version == HMMER1_0F)
1494 for (i = 0; i < Alphabet_size; i++)
1495 if (fgets(buffer, 512, fp) == NULL) return NULL;
1496
1497 /* V1.7 has lines for whether we have valid ref, cs info
1498 */
1499 if (version == HMMER1_7F)
1500 {
1501 if (fgets(buffer, 512, fp) == NULL) return NULL;
1502 if (strncmp(buffer, "yes", 3) == 0) hmm->flags |= HMM_REF;
1503 if (fgets(buffer, 512, fp) == NULL) return NULL;
1504 if (strncmp(buffer, "yes", 3) == 0) hmm->flags |= HMM_CS;
1505 }
1506
1507 /* everything else is states */
1508 while (fgets(buffer, 512, fp) != NULL)
1509 {
1510 /* get state type and index info */
1511 if ((statetype = strtok(buffer, " \t\n")) == NULL) return NULL;
1512 if ((s = strtok((char *) NULL, " \t\n")) == NULL) return NULL;
1513 if (!isdigit(*s)) return NULL;
1514 k = atoi(s);
1515 if (k < 0 || k > hmm->M+1) return NULL;
1516
1517 if (strcmp(statetype, "###MATCH_STATE") == 0)
1518 {
1519 /* V1.7: get ref, cs info: */
1520 /* ###MATCH_STATE 16 (x) (H) */
1521 if (version == HMMER1_7F)
1522 {
1523 s = strtok(NULL, "\n");
1524 while (*s != '(' && *s != '\0') s++;
1525 if (*s != '(') return NULL;
1526 hmm->ref[k] = *(s+1);
1527 while (*s != '(' && *s != '\0') s++;
1528 if (*s != '(') return NULL;
1529 hmm->cs[k] = *(s+1);
1530 }
1531
1532 if (fgets(buffer, 512, fp) == NULL) return NULL;
1533 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1534 hmm->mat[k].t[MATCH] = (float) atof(s);
1535
1536 if (fgets(buffer, 512, fp) == NULL) return NULL;
1537 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1538 hmm->mat[k].t[DELETE] = (float) atof(s);
1539
1540 if (fgets(buffer, 512, fp) == NULL) return NULL;
1541 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1542 hmm->mat[k].t[INSERT] = (float) atof(s);
1543
1544 for (i = 0; i < Alphabet_size; i++)
1545 {
1546 if (fgets(buffer, 512, fp) == NULL) return NULL;
1547 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1548 hmm->mat[k].p[i] = (float) atof(s);
1549 }
1550
1551 /* Skip all regularizer info for V1.0 */
1552 if (version == HMMER1_0F)
1553 for (i = 0; i < Alphabet_size + 3; i++)
1554 if (fgets(buffer, 512, fp) == NULL) return NULL;
1555
1556 }
1557 else if (strcmp(statetype, "###INSERT_STATE") == 0)
1558 {
1559 if (fgets(buffer, 512, fp) == NULL) return NULL;
1560 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1561 hmm->ins[k].t[MATCH] = (float) atof(s);
1562
1563 if (fgets(buffer, 512, fp) == NULL) return NULL;
1564 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1565 hmm->ins[k].t[DELETE] = (float) atof(s);
1566
1567 if (fgets(buffer, 512, fp) == NULL) return NULL;
1568 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1569 hmm->ins[k].t[INSERT] = (float) atof(s);
1570
1571 for (i = 0; i < Alphabet_size; i++)
1572 {
1573 if (fgets(buffer, 512, fp) == NULL) return NULL;
1574 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1575 hmm->ins[k].p[i] = (float) atof(s);
1576 }
1577
1578 /* Skip all regularizer info in V1.0 files */
1579 if (version == HMMER1_0F)
1580 for (i = 0; i < Alphabet_size + 3; i++)
1581 if (fgets(buffer, 512, fp) == NULL) return NULL;
1582
1583 }
1584 else if (strcmp(statetype, "###DELETE_STATE") == 0)
1585 {
1586 if (fgets(buffer, 512, fp) == NULL) return NULL;
1587 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1588 hmm->del[k].t[MATCH] = (float) atof(s);
1589
1590 if (fgets(buffer, 512, fp) == NULL) return NULL;
1591 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1592 hmm->del[k].t[DELETE] = (float) atof(s);
1593
1594 if (fgets(buffer, 512, fp) == NULL) return NULL;
1595 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1596 hmm->del[k].t[INSERT] = (float) atof(s);
1597
1598 /* Skip all regularizer info in V1.0 files*/
1599 if (version == HMMER1_0F)
1600 for (i = 0; i < 3; i++)
1601 if (fgets(buffer, 512, fp) == NULL) return NULL;
1602 }
1603 else
1604 return NULL;
1605 }
1606
1607 P9DefaultNullModel(hmm->null);
1608 P9Renormalize(hmm);
1609 return hmm;
1610 }
1611