1 /* Multiple sequence alignment file i/o
2 *
3 * Table of contents:
4 * 1. Opening/closing an ESL_MSAFILE.
5 * 2. ESL_MSAFILE_FMTDATA: optional added constraints on formats.
6 * 3. Guessing file formats.
7 * 4. Guessing alphabets.
8 * 5. Random MSA flatfile access.
9 * 6. Reading an MSA from an ESL_MSAFILE.
10 * 7. Writing an MSA to a stream.
11 * 8. MSA functions that depend on MSAFILE
12 * 9. Utilities used by specific format parsers.
13 * 10. Unit tests.
14 * 11. Test driver.
15 * 12. Examples.
16 */
17 #include "esl_config.h"
18
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22
23 #include "easel.h"
24 #include "esl_alphabet.h"
25 #include "esl_buffer.h"
26 #include "esl_mem.h"
27 #include "esl_msa.h"
28 #include "esl_ssi.h"
29
30 #include "esl_msafile.h"
31
32 /*****************************************************************
33 *# 1. Opening/closing an ESL_MSAFILE
34 *****************************************************************/
35
36 static int msafile_Create (ESL_MSAFILE **ret_afp);
37 static int msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE *afp);
38
39 /* Function: esl_msafile_Open()
40 * Synopsis: Open a multiple sequence alignment file for input.
41 *
42 * Purpose: Open a multiple sequence alignment file <msafile> for input.
43 * Return an open <ESL_MSAFILE> handle in <*ret_afp>.
44 *
45 * <msafile> is usually the name of a file. Alignments may
46 * also be read from standard input, or from
47 * gzip-compressed files. If <msafile> is ``-'', alignment
48 * input is taken from the standard input stream. If
49 * <msafile> ends in ``.gz'', alignment input is read
50 * through a pipe from <gzip -dc>.
51 *
52 * <byp_abc>, <env>, <format>, and <fmtd> support a variety
53 * of optional/advanced operations, as described
54 * below. Minimally, a caller can set <byp_abc> to <NULL>,
55 * <format> to <eslMSAFILE_UNKNOWN>, and <fmtd> to <NULL>,
56 * and <msafile> will be opened in text mode; in the
57 * current working directory; and its format will be
58 * autodetected.
59 *
60 * The <byp_abc> argument controls whether data are to be
61 * read in text or digital mode. In digital mode, alignment
62 * data are immediately digitized into an Easel internal
63 * alphabet (which among other things, allows various
64 * things to operate on sequence data more efficiently) and
65 * because an expected alphabet is known, parsers are able
66 * to detect invalid characters. The caller may either
67 * provide an alphabet (thus asserting what it's expected
68 * to be), or have <esl_msafile_Open()> look at the file
69 * and guess what alphabet it appears to be (DNA or amino
70 * acid code, usually). In text mode, alignment data are
71 * read verbatim. It might be advantageous for an
72 * application to read in text mode -- for example, if a
73 * variant alignment format is using characters in some
74 * special way, and you need to deal with them specially.
75 * All this goes through the setting of the passed-by-reference
76 * alphabet pointer <byp_abc>. If caller passes NULL for
77 * the <byp_abc> argument, input is in text mode. If caller
78 * provides a valid non-NULL <byp_abc> pointer but
79 * <*byp_abc> is NULL (that is, caller has declared
80 * <ESL_ALPHABET *abc = NULL> and passed <&abc> as an
81 * argument), then we attempt to guess the digital alphabet
82 * using <esl_msafile_GuessAlphabet()>, based on the first
83 * alignment in the input. In this case, the new alphabet
84 * is allocated here and returned to the caller. If caller
85 * provides a digital alphabet (that is, <ESL_ALPHABET *abc
86 * = esl_alphabet_Create...()> and passed <&abc>), that's
87 * the alphabet we use.
88 *
89 * The <env> argument controls where we search for the
90 * <msafile>. If <env> is <NULL>, only the current working
91 * directory is checked. Optionally, caller can provide in
92 * <env> the name of an environment variable ("PFAMDB",
93 * perhaps), in which the routine can find a
94 * colon-delimited list of directories. Then, if <msafile>
95 * is not found in the current working directory, we look
96 * for it in these directories, in the order they're
97 * listed.
98 *
99 * The <format> argument allows the caller to either allow
100 * <esl_msafile_Open()> to autodetect the file format of
101 * <msafile>, or to assert that it knows the file is in a
102 * particular format. If <format> is <eslMSAFILE_UNKNOWN>,
103 * format autodetection is performed. Other valid codes include:
104 * | <eslMSAFILE_STOCKHOLM> | Stockholm format |
105 * | <eslMSAFILE_AFA> | Aligned FASTA format |
106 * | <eslMSAFILE_CLUSTAL> | Clustal format (strict) |
107 * | <eslMSAFILE_CLUSTALLIKE> | Clustal-like (MUSCLE, PROBCONS...) |
108 * | <eslMSAFILE_PHYLIP> | PHYLIP interleaved format |
109 * | <eslMSAFILE_PHYLIPS> | PHYLIP sequential format |
110 * | <eslMSAFILE_A2M> | UCSC SAM A2M (dotless or dotful) |
111 * | <eslMSAFILE_PSIBLAST> | NCBI PSI-BLAST |
112 * | <eslMSAFILE_SELEX> | a general alignment block format |
113 *
114 * The <fmtd> argument is an optional pointer to a
115 * <ESL_MSAFILE_FMTDATA> structure that the caller may
116 * initialize and provide, in order to assert any
117 * additional unusual constraints on the input format --
118 * for example, to dictate that a PHYLIP format file must
119 * have some nonstandard name field width. Generally,
120 * though, <fmtd> will be <NULL>, and such things would
121 * either be autodetected as part of the autodetected
122 * format, or the strict version of the parser will be
123 * used. (That is, if you open a PHYLIP file with
124 * <format=eslMSAFILE_UNKNOWN> and a <NULL fmtd>, the
125 * format autodetector is used, and it will automagically
126 * detect nonstandard PHYLIP namewidths != 10. But if you
127 * open a PHYLIP file with <format=eslMSAFILE_PHYLIP> and a
128 * <NULL fmtd>, then you get the strict interleaved PHYLIP
129 * parser, which requires a name width of exactly 10.)
130 *
131 * Args: byp_abc - digital alphabet to use, or NULL for text mode
132 * if <*byp_abc> is NULL, guess the digital alphabet,
133 * create it, and return it in <*byp_abc>.
134 * If <*byp_abc> is a digital alphabet, use it.
135 * msafile - name of alignment input to open;
136 * if "-", read standard input;
137 * if "*.gz", read through a <gzip -dc> pipe.
138 * env - <NULL>, or the name of an environment variable
139 * containing colon-delimited list of directories
140 * in which to search for <msafile> (e.g. "PFAMDB").
141 * format - format code, such as <eslMSAFILE_STOCKHOLM>;
142 * or <eslMSAFILE_UNKNOWN> to autodetect format.
143 * fmtd - <NULL>, or a pointer to an initialized
144 * <ESL_MSAFILE_FMTDATA> structure, containing
145 * any additional unusual constraints to apply
146 * to the input format.
147 * *ret_afp - RETURN: open MSA input stream.
148 *
149 * Returns: <eslOK> on success, and <*ret_afp> is the newly opened msa file.
150 *
151 * <eslENOTFOUND> if <msafile> doesn't exist or can't be
152 * opened for reading; or (in the case of a <.gz> file) if
153 * a <gzip> executable doesn't exist in user's <PATH> or
154 * can't be executed. <afp->errmsg> is something like
155 * "couldn't open %s for reading", with <%s> being the
156 * name of the msafile.
157 *
158 * <eslENOFORMAT> if we tried to autodetect the file format
159 * (caller provided <format=eslMSAFILE_UNKNOWN>), and
160 * failed. <afp->errmsg> is an informative user-directed
161 * message, minimally something like "couldn't determine alignment
162 * input format", possibly more detailed.
163 *
164 * <eslENOALPHABET> if we tried to autodetect the alphabet
165 * (caller provided <&abc>, <abc=NULL> to request digital
166 * mode w/ alphabet autodetection) but the alphabet could
167 * not be reliably guessed.
168 *
169 * <eslFAIL> in the case of a <.gz> file and the <gzip -dc>
170 * command fails on it.
171 *
172 * On any of these normal errors, <*ret_afp> is returned in
173 * an error state, containing a user-directed error message
174 * in <afp->errmsg> and (if relevant) the full path to
175 * <msafile> that we attempted to open in
176 * <afp->bf->filename>. See <esl_msafile_OpenFailure()> for
177 * a function that gives a standard way of reporting these
178 * diagnostics to <stderr>.
179 *
180 * Throws: <eslEMEM> on allocation failure.
181 * <eslESYS> on a system call failure, such as <fread()>.
182 * <eslEINVAL> if we tried to use <stdin> but the <stdin> stream was
183 * invalid (in an error state, <NULL>, or at <EOF>).
184 * On thrown exceptions, <*ret_afp> is <NULL>.
185 */
186 int
esl_msafile_Open(ESL_ALPHABET ** byp_abc,const char * msafile,const char * env,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE ** ret_afp)187 esl_msafile_Open(ESL_ALPHABET **byp_abc, const char *msafile, const char *env, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp)
188 {
189 ESL_MSAFILE *afp = NULL;
190 int status;
191
192 if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR;
193
194 if ((status = esl_buffer_Open(msafile, env, &(afp->bf))) != eslOK)
195 ESL_XFAIL(status, afp->errmsg, "%s", afp->bf->errmsg); /* ENOTFOUND; FAIL are normal here */
196
197 if ( (status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR;
198
199 *ret_afp = afp;
200 return eslOK;
201
202 ERROR: /* on normal errors, afp is returned in an error state */
203 if (status == eslENOTFOUND || status == eslFAIL || status == eslENOFORMAT || status == eslENODATA || status == eslENOALPHABET)
204 { afp->abc = NULL; *ret_afp = afp;}
205 else
206 { if (afp) esl_msafile_Close(afp); *ret_afp = NULL; }
207 return status;
208 }
209
210
211 /* Function: esl_msafile_OpenMem()
212 * Synopsis: Open a string or buffer for parsing as an MSA.
213 *
214 * Purpose: Essentially the same as <esl_msafile_Open()>, except
215 * we ``open'' the string or buffer <p>, <n> as the
216 * input source.
217 *
218 * If <p> is a NUL-terminated string, providing its length
219 * <n> is optional; <n> may be passed as -1. If <p> is an
220 * unterminated buffer, providing the length <n> is
221 * mandatory.
222 */
223 int
esl_msafile_OpenMem(ESL_ALPHABET ** byp_abc,const char * p,esl_pos_t n,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE ** ret_afp)224 esl_msafile_OpenMem(ESL_ALPHABET **byp_abc, const char *p, esl_pos_t n, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp)
225 {
226 ESL_MSAFILE *afp = NULL;
227 int status;
228
229 if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR;
230 if ( (status = esl_buffer_OpenMem(p, n, &(afp->bf))) != eslOK) goto ERROR;
231
232 if ( (status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR;
233 *ret_afp = afp;
234 return eslOK;
235
236 ERROR:
237 if (status == eslENOTFOUND || status == eslFAIL || status == eslEFORMAT || status == eslENODATA || status == eslENOALPHABET)
238 { afp->abc = NULL; *ret_afp = afp;}
239 else
240 { if (afp) esl_msafile_Close(afp); *ret_afp = NULL; }
241 return status;
242 }
243
244
245 /* Function: esl_msafile_OpenBuffer()
246 * Synopsis: Open an input buffer for parsing as an MSA.
247 *
248 * Purpose: Essentially the same as <esl_msafile_Open()>, except
249 * we ``open'' an <ESL_BUFFER> <bf> that's already been
250 * opened by the caller for some input source.
251 */
252 int
esl_msafile_OpenBuffer(ESL_ALPHABET ** byp_abc,ESL_BUFFER * bf,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE ** ret_afp)253 esl_msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp)
254 {
255 ESL_MSAFILE *afp = NULL;
256 int status;
257
258 if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR;
259
260 afp->bf = bf;
261 if ((status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR;
262 *ret_afp = afp;
263 return eslOK;
264
265 ERROR:
266 if (status == eslENOTFOUND || status == eslFAIL || status == eslEFORMAT || status == eslENODATA || status == eslENOALPHABET)
267 { afp->abc = NULL; *ret_afp = afp;}
268 else
269 { if (afp) esl_msafile_Close(afp); *ret_afp = NULL; }
270 return status;
271 }
272
273 /* Function: esl_msafile_OpenFailure()
274 * Synopsis: Report diagnostics of normal error in opening MSA file, and exit.
275 *
276 * Purpose: Report user-directed diagnostics of a normal error in opening
277 * an MSA input. Print a message to <stderr>, then exit.
278 */
279 void
esl_msafile_OpenFailure(ESL_MSAFILE * afp,int status)280 esl_msafile_OpenFailure(ESL_MSAFILE *afp, int status)
281 {
282 int show_source = FALSE;
283 int show_fmt = FALSE;
284
285 fprintf(stderr, "Alignment input open failed.\n");
286
287 if (status == eslENOTFOUND) { fprintf(stderr, " %s\n", afp->errmsg); }
288 else if (status == eslFAIL) { fprintf(stderr, " %s\n", afp->errmsg); }
289 else if (status == eslENOFORMAT) { fprintf(stderr, " %s\n", afp->errmsg); show_source = TRUE; }
290 else if (status == eslENOALPHABET) { fprintf(stderr, " %s\n", afp->errmsg); show_source = TRUE; show_fmt = TRUE; }
291 else if (status == eslEMEM) { fprintf(stderr, " Memory allocation failure\n"); }
292 else if (status == eslESYS) { fprintf(stderr, " System call failed, possibly fread()\n"); }
293 else { fprintf(stderr, " Unexpected error code %d\n", status); }
294
295 if (show_source) {
296 switch (afp->bf->mode_is) {
297 case eslBUFFER_STREAM: fprintf(stderr, " while reading from an input stream (not a file)\n"); break;
298 case eslBUFFER_CMDPIPE: fprintf(stderr, " while reading through a pipe (not a file)\n"); break;
299 case eslBUFFER_FILE:
300 case eslBUFFER_ALLFILE:
301 case eslBUFFER_MMAP: fprintf(stderr, " while reading file %s\n", afp->bf->filename); break;
302 case eslBUFFER_STRING: fprintf(stderr, " while reading from a provided string (not a file)\n"); break;
303 default: break;
304 }
305 }
306
307 if (show_fmt) {
308 fprintf(stderr, " while parsing for %s format\n", esl_msafile_DecodeFormat(afp->format));
309 }
310
311 esl_msafile_Close(afp);
312 exit(status);
313 }
314
315 /* Function: esl_msafile_SetDigital()
316 * Synopsis: Convert an open text-mode ESL_MSAFILE to digital mode.
317 *
318 * Purpose: Convert the open <afp> from text mode to digital mode,
319 * using alphabet <abc>.
320 *
321 * Note: This function is only here for legacy support: it's
322 * called by esl_sqio, which still uses an outdated
323 * open / guess alphabet / set digital pattern. When
324 * sqio is upgraded next, this function should be removed.
325 */
326 int
esl_msafile_SetDigital(ESL_MSAFILE * afp,const ESL_ALPHABET * abc)327 esl_msafile_SetDigital(ESL_MSAFILE *afp, const ESL_ALPHABET *abc)
328 {
329 int status;
330
331 afp->abc = abc;
332
333 switch (afp->format) {
334 case eslMSAFILE_A2M: status = esl_msafile_a2m_SetInmap( afp); break;
335 case eslMSAFILE_AFA: status = esl_msafile_afa_SetInmap( afp); break;
336 case eslMSAFILE_CLUSTAL: status = esl_msafile_clustal_SetInmap( afp); break;
337 case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_SetInmap( afp); break;
338 case eslMSAFILE_PFAM: status = esl_msafile_stockholm_SetInmap(afp); break;
339 case eslMSAFILE_PHYLIP: status = esl_msafile_phylip_SetInmap( afp); break;
340 case eslMSAFILE_PHYLIPS: status = esl_msafile_phylip_SetInmap( afp); break;
341 case eslMSAFILE_PSIBLAST: status = esl_msafile_psiblast_SetInmap( afp); break;
342 case eslMSAFILE_SELEX: status = esl_msafile_selex_SetInmap( afp); break;
343 case eslMSAFILE_STOCKHOLM: status = esl_msafile_stockholm_SetInmap(afp); break;
344 default: ESL_EXCEPTION(eslEINCONCEIVABLE, "no such alignment file format");
345 }
346 return status;
347 }
348
349 /* Function: esl_msafile_Close()
350 * Synopsis: Close an open <ESL_MSAFILE>.
351 */
352 void
esl_msafile_Close(ESL_MSAFILE * afp)353 esl_msafile_Close(ESL_MSAFILE *afp)
354 {
355 if (afp) {
356 if (afp->bf) esl_buffer_Close(afp->bf);
357 if (afp->ssi) esl_ssi_Close(afp->ssi);
358 free(afp);
359 }
360 }
361
362 static int
msafile_Create(ESL_MSAFILE ** ret_afp)363 msafile_Create(ESL_MSAFILE **ret_afp)
364 {
365 ESL_MSAFILE *afp = NULL;
366 int status;
367
368 ESL_ALLOC(afp, sizeof(ESL_MSAFILE));
369 afp->bf = NULL;
370 afp->line = NULL;
371 afp->n = 0;
372 afp->linenumber = 0;
373 afp->lineoffset = 0;
374 afp->format = eslMSAFILE_UNKNOWN;
375 afp->abc = NULL;
376 afp->ssi = NULL;
377 afp->errmsg[0] = '\0';
378
379 esl_msafile_fmtdata_Init(&(afp->fmtd));
380
381 *ret_afp = afp;
382 return eslOK;
383
384 ERROR:
385 *ret_afp = NULL;
386 return status;
387 }
388
389
390
391 /* All input sources funnel through here.
392 * Here, <afp> is already allocated and initialized, and the input
393 * <bf> is opened successfully.
394 */
395 static int
msafile_OpenBuffer(ESL_ALPHABET ** byp_abc,ESL_BUFFER * bf,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE * afp)396 msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE *afp)
397 {
398 ESL_ALPHABET *abc = NULL;
399 int alphatype = eslUNKNOWN;
400 int status;
401
402 /* if caller provided <fmtd>, copy it into afp->fmtd */
403 if (fmtd) esl_msafile_fmtdata_Copy(fmtd, &(afp->fmtd));
404
405 /* Determine the format */
406 if (format == eslMSAFILE_UNKNOWN &&
407 (status = esl_msafile_GuessFileFormat(afp->bf, &format, &(afp->fmtd), afp->errmsg)) != eslOK)
408 goto ERROR;
409 afp->format = format;
410
411 /* Determine the alphabet; set <abc>. (<abc> == NULL means text mode.) */
412 /* Note that GuessAlphabet() functions aren't allowed to use the inmap, because it isn't set yet */
413
414 if (byp_abc && *byp_abc) /* Digital mode, and caller provided the alphabet */
415 {
416 abc = *byp_abc;
417 alphatype = abc->type;
418 }
419 else if (byp_abc) /* Digital mode, and caller wants us to guess and create an alphabet */
420 {
421 status = esl_msafile_GuessAlphabet(afp, &alphatype);
422 if (status == eslENOALPHABET) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "couldn't guess alphabet (maybe try --dna/--rna/--amino if available)");
423 else if (status != eslOK) goto ERROR;
424 if ( (abc = esl_alphabet_Create(alphatype)) == NULL) { status = eslEMEM; goto ERROR; }
425 }
426 afp->abc = abc; /* with afp->abc set, the inmap config functions know whether to do digital/text */
427
428 /* Configure the format-specific, digital or text mode character
429 * input map in afp->inmap.
430 * All of these must:
431 *
432 * set inmap[0] to an appropriate 'unknown' character, to replace
433 * invalid input with.
434 * set ' ' to eslDSQ_IGNORE (if we're supposed to accept and skip
435 * it), or map it to a gap, or set it as eslDSQ_ILLEGAL.
436 * in digital mode, copy the abc->inmap
437 * in text mode, decide if we should accept most any
438 * non-whitespace character (isgraph()), or if the format is
439 * inherently restrictive and we should go with isalpha() +
440 * some other valid characters "_-.~*" instead.
441 */
442 switch (afp->format) {
443 case eslMSAFILE_A2M: if ((status = esl_msafile_a2m_SetInmap( afp)) != eslOK) goto ERROR; break;
444 case eslMSAFILE_AFA: if ((status = esl_msafile_afa_SetInmap( afp)) != eslOK) goto ERROR; break;
445 case eslMSAFILE_CLUSTAL: if ((status = esl_msafile_clustal_SetInmap( afp)) != eslOK) goto ERROR; break;
446 case eslMSAFILE_CLUSTALLIKE: if ((status = esl_msafile_clustal_SetInmap( afp)) != eslOK) goto ERROR; break;
447 case eslMSAFILE_PFAM: if ((status = esl_msafile_stockholm_SetInmap(afp)) != eslOK) goto ERROR; break;
448 case eslMSAFILE_PHYLIP: if ((status = esl_msafile_phylip_SetInmap( afp)) != eslOK) goto ERROR; break;
449 case eslMSAFILE_PHYLIPS: if ((status = esl_msafile_phylip_SetInmap( afp)) != eslOK) goto ERROR; break;
450 case eslMSAFILE_PSIBLAST: if ((status = esl_msafile_psiblast_SetInmap( afp)) != eslOK) goto ERROR; break;
451 case eslMSAFILE_SELEX: if ((status = esl_msafile_selex_SetInmap( afp)) != eslOK) goto ERROR; break;
452 case eslMSAFILE_STOCKHOLM: if ((status = esl_msafile_stockholm_SetInmap(afp)) != eslOK) goto ERROR; break;
453 default: ESL_XEXCEPTION(eslENOFORMAT, "no such alignment file format");
454 }
455
456 if (esl_byp_IsReturned(byp_abc)) *byp_abc = abc;
457 return eslOK;
458
459 ERROR: /* on normal errors, afp is returned in an error state */
460 if (abc && ! esl_byp_IsProvided(byp_abc)) { esl_alphabet_Destroy(abc); }
461 if (esl_byp_IsReturned(byp_abc)) *byp_abc = NULL;
462 afp->abc = NULL;
463 return status;
464 }
465 /*------------- end, open/close an ESL_MSAFILE -----------------*/
466
467
468
469 /*****************************************************************
470 *# 2. ESL_MSAFILE_FMTDATA: optional extra constraints on formats.
471 *****************************************************************/
472
473 /* Function: esl_msafile_fmtdata_Init()
474 * Synopsis: Initialize a <ESL_MSAFILE_FMTDATA> structure.
475 */
476 int
esl_msafile_fmtdata_Init(ESL_MSAFILE_FMTDATA * fmtd)477 esl_msafile_fmtdata_Init(ESL_MSAFILE_FMTDATA *fmtd)
478 {
479 fmtd->namewidth = 0;
480 fmtd->rpl = 0;
481 return eslOK;
482 }
483
484 /* Function: esl_msafile_fmtdata_Copy()
485 * Synopsis: Copy one <ESL_MSAFILE_FMTDATA> structure to another.
486 */
487 int
esl_msafile_fmtdata_Copy(ESL_MSAFILE_FMTDATA * src,ESL_MSAFILE_FMTDATA * dst)488 esl_msafile_fmtdata_Copy(ESL_MSAFILE_FMTDATA *src, ESL_MSAFILE_FMTDATA *dst)
489 {
490 dst->namewidth = src->namewidth;
491 dst->rpl = src->rpl;
492 return eslOK;
493 }
494
495 /*--------------- ESL_MSAFILE_FMTDATA --------------------------*/
496
497
498 /*****************************************************************
499 *# 3. Guessing file format.
500 *****************************************************************/
501
502 static int msafile_check_selex (ESL_BUFFER *bf);
503
504 /* Function: esl_msafile_GuessFileFormat()
505 * Synopsis: Guess the MSA file format of an open buffer.
506 *
507 * Purpose: Peek into an open buffer, and try to determine what
508 * alignment file format (if any) its input is in. If a
509 * format can be determined, return <eslOK> and set
510 * <*ret_fmtcode> to the format code. If not, return
511 * <eslENOFORMAT> and set <*ret_fmtcode> to
512 * <eslMSAFILE_UNKNOWN>. In either case, the buffer <bf> is
513 * restored to its original position upon return.
514 *
515 * Some formats may have variants that require special
516 * handling. Caller may pass a pointer <*opt_fmtd> to a
517 * <ESL_MSAFILE_FMTDATA> structure to capture this
518 * information from format autodetection, or <NULL>. If the
519 * structure is provided, it is reinitialized, and any
520 * fields that can be determined by the appropriate
521 * format-guessing function are filled in. If <*opt_fmtd>
522 * is <NULL>, the range of variation that can be captured
523 * by some formats may be limited. Currently this only
524 * affects PHYLIP format, where <opt_fmtd->namewidth>
525 * allows files with nonstandard name field widths to be
526 * autodetected and parsed.
527 *
528 * Args: bf - the open buffer to read input from
529 * ret_fmtcode - RETURN: format code that's determined
530 * opt_fmtd - optRETURN: ptr to an <ESL_MSAFILE_FMTDATA> structure to
531 * be filled in with additional format-specific data, or <NULL>
532 * errbuf - optRETURN: space for an informative error message if
533 * format autodetection fails; caller allocates for <eslERRBUFSIZE> bytes.
534 *
535 * Returns: <eslOK> on success, and <*ret_fmtcode> contains the format code.
536 *
537 * <eslENOFORMAT> if format can't be guessed; now <*ret_fmtcode> contains
538 * <eslMSAFILE_UNKNOWN>, and optional <errbuf> contains an explanation.
539 *
540 * Throws: (no abnormal error conditions)
541 */
542 int
esl_msafile_GuessFileFormat(ESL_BUFFER * bf,int * ret_fmtcode,ESL_MSAFILE_FMTDATA * opt_fmtd,char * errbuf)543 esl_msafile_GuessFileFormat(ESL_BUFFER *bf, int *ret_fmtcode, ESL_MSAFILE_FMTDATA *opt_fmtd, char *errbuf)
544 {
545 esl_pos_t initial_offset;
546 char *p;
547 esl_pos_t n;
548 int fmt_bysuffix = eslMSAFILE_UNKNOWN;
549 int fmt_byfirstline = eslMSAFILE_UNKNOWN;
550 int status;
551
552 /* Initialize the optional data, if provided (move this initialization to a function someday) */
553 if (opt_fmtd) esl_msafile_fmtdata_Init(opt_fmtd);
554 if (errbuf) errbuf[0] = '\0';
555
556 /* As we start, save parser status:
557 * remember the offset where we started (usually 0, but not necessarily)
558 * set an anchor to be sure that this offset stays in the buffer's memory
559 */
560 initial_offset = esl_buffer_GetOffset(bf);
561 esl_buffer_SetAnchor(bf, initial_offset);
562
563 /* We may use a filename suffix as a clue, especially when
564 * distinguishing formats that may not be distinguishable.
565 * (if there's a filename, and if it has a suffix, anyway.)
566 */
567 if (bf->filename)
568 {
569 esl_file_Extension(bf->filename, 0, &p, &n);
570 if (esl_memstrcmp(p, n, ".gz")) esl_file_Extension(bf->filename, 3, &p, &n);
571 if (p)
572 {
573 if (esl_memstrcmp(p, n, ".sto")) fmt_bysuffix = eslMSAFILE_STOCKHOLM;
574 else if (esl_memstrcmp(p, n, ".sth")) fmt_bysuffix = eslMSAFILE_STOCKHOLM;
575 else if (esl_memstrcmp(p, n, ".stk")) fmt_bysuffix = eslMSAFILE_STOCKHOLM;
576 else if (esl_memstrcmp(p, n, ".afa")) fmt_bysuffix = eslMSAFILE_AFA;
577 else if (esl_memstrcmp(p, n, ".afasta")) fmt_bysuffix = eslMSAFILE_AFA;
578 else if (esl_memstrcmp(p, n, ".pfam")) fmt_bysuffix = eslMSAFILE_PFAM;
579 else if (esl_memstrcmp(p, n, ".a2m")) fmt_bysuffix = eslMSAFILE_A2M;
580 else if (esl_memstrcmp(p, n, ".slx")) fmt_bysuffix = eslMSAFILE_SELEX;
581 else if (esl_memstrcmp(p, n, ".selex")) fmt_bysuffix = eslMSAFILE_SELEX;
582 else if (esl_memstrcmp(p, n, ".pb")) fmt_bysuffix = eslMSAFILE_PSIBLAST;
583 else if (esl_memstrcmp(p, n, ".ph")) fmt_bysuffix = eslMSAFILE_PHYLIP;
584 else if (esl_memstrcmp(p, n, ".phy")) fmt_bysuffix = eslMSAFILE_PHYLIP;
585 else if (esl_memstrcmp(p, n, ".phyi")) fmt_bysuffix = eslMSAFILE_PHYLIP;
586 else if (esl_memstrcmp(p, n, ".phys")) fmt_bysuffix = eslMSAFILE_PHYLIPS;
587 }
588 }
589
590 /* We peek at the first non-blank line of the file.
591 * Multiple sequence alignment files are often identifiable by a token on this line.
592 */
593 /* Skip blank lines, get first non-blank one */
594 do {
595 status = esl_buffer_GetLine(bf, &p, &n);
596 } while (status == eslOK && esl_memspn(p, n, " \t") == n);
597 if (status == eslEOF) ESL_XFAIL(eslENOFORMAT, errbuf, "can't guess alignment input format: empty file/no data");
598
599 if (esl_memstrpfx(p, n, "# STOCKHOLM")) fmt_byfirstline = eslMSAFILE_STOCKHOLM;
600 else if (esl_memstrpfx(p, n, ">")) fmt_byfirstline = eslMSAFILE_AFA;
601 else if (esl_memstrpfx(p, n, "CLUSTAL")) fmt_byfirstline = eslMSAFILE_CLUSTAL;
602 else if (esl_memstrcontains(p, n, "multiple sequence alignment")) fmt_byfirstline = eslMSAFILE_CLUSTALLIKE;
603 else {
604 char *tok;
605 esl_pos_t toklen;
606 /* look for <nseq> <alen>, characteristic of PHYLIP files */
607 if (esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK && esl_memspn(tok, toklen, "0123456789") == toklen &&
608 esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK && esl_memspn(tok, toklen, "0123456789") == toklen)
609 fmt_byfirstline = eslMSAFILE_PHYLIP; /* interleaved for now; we'll look more closely soon */
610 }
611
612 /* Restore parser status, rewind to start */
613 esl_buffer_SetOffset (bf, initial_offset);
614 esl_buffer_RaiseAnchor(bf, initial_offset);
615
616 /* Rules to determine formats.
617 * If we have to call a routine that looks deeper into the buffer
618 * than the first line, that routine must restore buffer status.
619 */
620 if (fmt_byfirstline == eslMSAFILE_STOCKHOLM)
621 {
622 if (fmt_bysuffix == eslMSAFILE_STOCKHOLM) *ret_fmtcode = eslMSAFILE_STOCKHOLM;
623 else if (fmt_bysuffix == eslMSAFILE_PFAM) *ret_fmtcode = eslMSAFILE_PFAM;
624 else *ret_fmtcode = eslMSAFILE_STOCKHOLM;
625 }
626 else if (fmt_byfirstline == eslMSAFILE_CLUSTAL)
627 {
628 *ret_fmtcode = eslMSAFILE_CLUSTAL;
629 }
630 else if (fmt_byfirstline == eslMSAFILE_CLUSTALLIKE)
631 {
632 *ret_fmtcode = eslMSAFILE_CLUSTALLIKE;
633 }
634 else if (fmt_byfirstline == eslMSAFILE_AFA)
635 {
636 if (fmt_bysuffix == eslMSAFILE_A2M) *ret_fmtcode = eslMSAFILE_A2M; // A2M requires affirmative identification by caller,
637 else *ret_fmtcode = eslMSAFILE_AFA; // because of ambiguity w/ AFA.
638 }
639 else if (fmt_byfirstline == eslMSAFILE_PHYLIP)
640 {
641 if (fmt_bysuffix == eslMSAFILE_PHYLIP) *ret_fmtcode = eslMSAFILE_PHYLIP;
642 else if (fmt_bysuffix == eslMSAFILE_PHYLIPS) *ret_fmtcode = eslMSAFILE_PHYLIPS;
643 else
644 {
645 int namewidth;
646 status = esl_msafile_phylip_CheckFileFormat(bf, ret_fmtcode, &namewidth);
647 if (status == eslEAMBIGUOUS) ESL_XFAIL(eslENOFORMAT, errbuf, "can't guess format: it's consistent w/ both phylip, phylips.");
648 else if (status == eslFAIL) ESL_XFAIL(eslENOFORMAT, errbuf, "format unrecognized, though it looks phylip-like");
649
650 if (opt_fmtd) opt_fmtd->namewidth = namewidth;
651 else if (namewidth != 10) ESL_XFAIL(eslENOFORMAT, errbuf, "can't parse nonstandard PHYLIP name width (expected 10)"); /* if we can't store the nonstandard width, we can't allow the caller to think it can parse this */
652 }
653 }
654 else // if we haven't guessed so far, try selex.
655 { /* selex parser can handle psiblast too */
656 if (fmt_bysuffix == eslMSAFILE_SELEX) *ret_fmtcode = eslMSAFILE_SELEX;
657 else if (msafile_check_selex(bf) == eslOK) {
658 if (fmt_bysuffix == eslMSAFILE_PSIBLAST) *ret_fmtcode = eslMSAFILE_PSIBLAST;
659 else *ret_fmtcode = eslMSAFILE_SELEX;
660 }
661 else ESL_XFAIL(eslENOFORMAT, errbuf, "couldn't guess alignment input format - doesn't even look like selex");
662 }
663
664 return eslOK;
665
666 ERROR:
667 *ret_fmtcode = eslMSAFILE_UNKNOWN;
668 return status;
669 }
670
671
672 /* Function: esl_msafile_IsMultiRecord()
673 * Synopsis: Test if a format supports multiple MSAs per file.
674 *
675 * Purpose: Return <TRUE> if MSA file format <fmt> supports
676 * more than one MSA record per file. Return <FALSE>
677 * otherwise (including the cases of <fmt> being
678 * invalid or <eslMSAFILE_UNKNOWN>).
679 */
680 int
esl_msafile_IsMultiRecord(int fmt)681 esl_msafile_IsMultiRecord(int fmt)
682 {
683 switch (fmt) {
684 case eslMSAFILE_UNKNOWN: return FALSE;
685 case eslMSAFILE_STOCKHOLM: return TRUE;
686 case eslMSAFILE_PFAM: return TRUE;
687 case eslMSAFILE_A2M: return FALSE;
688 case eslMSAFILE_PSIBLAST: return FALSE;
689 case eslMSAFILE_SELEX: return FALSE;
690 case eslMSAFILE_AFA: return FALSE;
691 case eslMSAFILE_CLUSTAL: return FALSE;
692 case eslMSAFILE_CLUSTALLIKE: return FALSE;
693 case eslMSAFILE_PHYLIP: return TRUE; /* because seqboot. undocumented in phylip, phylip format can come out multi-msa */
694 case eslMSAFILE_PHYLIPS: return TRUE; /* ditto */
695 default: return FALSE;
696 }
697 return FALSE; /* keep compilers happy */
698 }
699
700
701 /* Function: esl_msafile_EncodeFormat()
702 * Synopsis: Convert text string to an MSA file format code.
703 *
704 * Purpose: Given a text string, match it case-insensitively
705 * against a list of possible formats, and return the
706 * appropriate MSA file format code. For example,
707 * <esl_msafile_EncodeFormat("Stockholm")> returns
708 * <eslMSAFILE_STOCKHOLM>.
709 *
710 * If the format is unrecognized, return
711 * <eslMSAFILE_UNKNOWN>.
712 *
713 * Note: Keep in sync with <esl_sqio_EncodeFormat()>,
714 * which decodes all possible sequence file formats,
715 * both unaligned and aligned.
716 */
717 int
esl_msafile_EncodeFormat(char * fmtstring)718 esl_msafile_EncodeFormat(char *fmtstring)
719 {
720 if (strcasecmp(fmtstring, "stockholm") == 0) return eslMSAFILE_STOCKHOLM;
721 if (strcasecmp(fmtstring, "pfam") == 0) return eslMSAFILE_PFAM;
722 if (strcasecmp(fmtstring, "a2m") == 0) return eslMSAFILE_A2M;
723 if (strcasecmp(fmtstring, "psiblast") == 0) return eslMSAFILE_PSIBLAST;
724 if (strcasecmp(fmtstring, "selex") == 0) return eslMSAFILE_SELEX;
725 if (strcasecmp(fmtstring, "afa") == 0) return eslMSAFILE_AFA;
726 if (strcasecmp(fmtstring, "clustal") == 0) return eslMSAFILE_CLUSTAL;
727 if (strcasecmp(fmtstring, "clustallike") == 0) return eslMSAFILE_CLUSTALLIKE;
728 if (strcasecmp(fmtstring, "phylip") == 0) return eslMSAFILE_PHYLIP;
729 if (strcasecmp(fmtstring, "phylips") == 0) return eslMSAFILE_PHYLIPS;
730 return eslMSAFILE_UNKNOWN;
731 }
732
733
734 /* Function: esl_msafile_DecodeFormat()
735 * Synopsis: Convert internal file format code to text string.
736 *
737 * Purpose: Given an internal file format code <fmt>
738 * (<eslMSAFILE_STOCKHOLM>, for example), returns
739 * a string suitable for printing ("Stockholm",
740 * for example).
741 *
742 * Returns: a pointer to a static description string.
743 *
744 * Throws: If code isn't valid, throws an <eslEINCONCEIVABLE> exception
745 * internally, and returns <NULL>.
746 *
747 * Note: Keep in sync with <esl_sqio_DecodeFormat()>.
748 */
749 char *
esl_msafile_DecodeFormat(int fmt)750 esl_msafile_DecodeFormat(int fmt)
751 {
752 switch (fmt) {
753 case eslMSAFILE_UNKNOWN: return "unknown";
754 case eslMSAFILE_STOCKHOLM: return "Stockholm";
755 case eslMSAFILE_PFAM: return "Pfam";
756 case eslMSAFILE_A2M: return "UCSC A2M";
757 case eslMSAFILE_PSIBLAST: return "PSI-BLAST";
758 case eslMSAFILE_SELEX: return "SELEX";
759 case eslMSAFILE_AFA: return "aligned FASTA";
760 case eslMSAFILE_CLUSTAL: return "Clustal";
761 case eslMSAFILE_CLUSTALLIKE: return "Clustal-like";
762 case eslMSAFILE_PHYLIP: return "PHYLIP (interleaved)";
763 case eslMSAFILE_PHYLIPS: return "PHYLIP (sequential)";
764 default: break;
765 }
766 esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "no such msa format code %d\n", fmt);
767 return NULL;
768 }
769
770
771
772 /* msafile_check_selex()
773 * Checks whether an input source appears to be in SELEX format.
774 *
775 * Check whether the input source <bf> appears to be a
776 * SELEX-format alignment file, starting from the current point,
777 * to the end of the input. Return <eslOK> if so, <eslFAIL>
778 * if not.
779 *
780 * The checker is more rigorous than the parser. To be autodetected,
781 * the SELEX file can't use whitespace for gaps. The parser, though,
782 * allows whitespace. A caller would have to specific SELEX format
783 * explicitly in that case.
784 */
785 static int
msafile_check_selex(ESL_BUFFER * bf)786 msafile_check_selex(ESL_BUFFER *bf)
787 {
788 esl_pos_t start_offset = -1;
789 int block_nseq = 0; /* Number of seqs in each block is checked */
790 int nseq = 0;
791 esl_pos_t block_nres = 0; /* Number of residues in each line is checked */
792 char *firstname = NULL; /* First seq name of every block is checked */
793 esl_pos_t namelen = 0;
794 int blockidx = 0;
795 int in_block = FALSE;
796 char *p, *tok;
797 esl_pos_t n, toklen;
798 int status;
799
800 /* Anchor at the start of the input, so we can rewind */
801 start_offset = esl_buffer_GetOffset(bf);
802 if ( (status = esl_buffer_SetAnchor(bf, start_offset)) != eslOK) goto ERROR;
803
804 while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK)
805 {
806 /* Some automatic giveaways of SELEX format */
807 if (esl_memstrpfx(p, n, "#=RF")) { status = eslOK; goto DONE; }
808 if (esl_memstrpfx(p, n, "#=CS")) { status = eslOK; goto DONE; }
809 if (esl_memstrpfx(p, n, "#=SS")) { status = eslOK; goto DONE; }
810 if (esl_memstrpfx(p, n, "#=SA")) { status = eslOK; goto DONE; }
811
812 /* skip comments */
813 if (esl_memstrpfx(p, n, "#")) continue;
814
815 /* blank lines: end block, reset block counters */
816 if (esl_memspn(p, n, " \t") == n)
817 {
818 if (block_nseq && block_nseq != nseq) { status = eslFAIL; goto DONE;} /* each block has same # of seqs */
819 if (in_block) blockidx++;
820 if (blockidx >= 3) { status = eslOK; goto DONE; } /* stop after three blocks; we're pretty sure by now */
821 in_block = FALSE;
822 block_nres = 0;
823 block_nseq = nseq;
824 nseq = 0;
825 continue;
826 }
827
828 /* else we're a "sequence" line. test for two and only two non-whitespace
829 * fields; test that second field has same length; test that each block
830 * starts with the same seq name..
831 */
832 in_block = TRUE;
833 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) goto ERROR; /* there's at least one token - we already checked for blank lines */
834 if (nseq == 0) /* check first seq name in each block */
835 {
836 if (blockidx == 0) { firstname = tok; namelen = toklen; } /* First block: set the name we check against. */
837 else if (toklen != namelen || memcmp(tok, firstname, toklen) != 0) { status = eslFAIL; goto DONE; } /* Subsequent blocks */
838 }
839 if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) { status = eslFAIL; goto DONE; }
840 if (block_nres && toklen != block_nres) { status = eslFAIL; goto DONE; }
841 block_nres = toklen;
842 if (esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK) { status = eslFAIL; goto DONE; }
843 nseq++;
844 }
845 if (status != eslEOF) goto ERROR; /* EOF is expected and good; anything else is bad */
846
847 if (in_block) blockidx++;
848 status = (blockidx ? eslOK : eslFAIL); /* watch out for the case of no input */
849 /* deliberate readthru */
850 DONE:
851 if (start_offset != -1) {
852 if (esl_buffer_SetOffset(bf, start_offset) != eslOK) goto ERROR;
853 if (esl_buffer_RaiseAnchor(bf, start_offset) != eslOK) goto ERROR;
854 }
855 return status;
856
857 ERROR:
858 if (start_offset != -1) {
859 esl_buffer_SetOffset(bf, start_offset);
860 esl_buffer_RaiseAnchor(bf, start_offset);
861 }
862 return status;
863 }
864 /*---------------- end, file format utilities -------------------*/
865
866
867 /*****************************************************************
868 *# 4. Guessing alphabet
869 *****************************************************************/
870
871 /* Function: esl_msafile_GuessAlphabet()
872 * Synopsis: Guess what kind of sequences the MSA file contains.
873 *
874 * Purpose: Guess the alphabet of the sequences in the open
875 * <ESL_MSAFILE> <afp> -- <eslDNA>, <eslRNA>, or <eslAMINO> --
876 * based on the residue composition of the input.
877 *
878 * Returns: Returns <eslOK> on success, and <*ret_type> is set
879 * to <eslDNA>, <eslRNA>, or <eslAMINO>.
880 *
881 * Returns <eslENOALPHABET> and sets <*ret_type> to
882 * <eslUNKNOWN> if the alphabet cannot be reliably
883 * guessed.
884 *
885 * Either way, the <afp>'s buffer is restored to its original
886 * state, no matter how much data we had to read while trying
887 * to guess.
888 *
889 * Throws: <eslEMEM> - an allocation error.
890 * <eslESYS> - a system call such as <fread()> failed.
891 * <eslEINCONCEIVABLE> - "impossible" corruption, internal bug
892 */
893 int
esl_msafile_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)894 esl_msafile_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
895 {
896 int status = eslENOALPHABET;
897 switch (afp->format) {
898 case eslMSAFILE_STOCKHOLM: status = esl_msafile_stockholm_GuessAlphabet(afp, ret_type); break;
899 case eslMSAFILE_PFAM: status = esl_msafile_stockholm_GuessAlphabet(afp, ret_type); break;
900 case eslMSAFILE_A2M: status = esl_msafile_a2m_GuessAlphabet (afp, ret_type); break;
901 case eslMSAFILE_PSIBLAST: status = esl_msafile_psiblast_GuessAlphabet (afp, ret_type); break;
902 case eslMSAFILE_SELEX: status = esl_msafile_selex_GuessAlphabet (afp, ret_type); break;
903 case eslMSAFILE_AFA: status = esl_msafile_afa_GuessAlphabet (afp, ret_type); break;
904 case eslMSAFILE_CLUSTAL: status = esl_msafile_clustal_GuessAlphabet (afp, ret_type); break;
905 case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_GuessAlphabet (afp, ret_type); break;
906 case eslMSAFILE_PHYLIP: status = esl_msafile_phylip_GuessAlphabet (afp, ret_type); break;
907 case eslMSAFILE_PHYLIPS: status = esl_msafile_phylip_GuessAlphabet (afp, ret_type); break;
908 }
909 return status;
910 }
911 /*----------- end, utilities for alphabets ----------------------*/
912
913
914 /*****************************************************************
915 *# 5. Random msa flatfile database access (with SSI)
916 *****************************************************************/
917
918 /* Function: esl_msafile_PositionByKey()
919 * Synopsis: Use SSI to reposition file to start of named MSA.
920 *
921 * Purpose: Reposition <afp> so that the next MSA we read
922 * will be the one named (or accessioned) <key>.
923 *
924 * Returns: <eslOK> on success, and the file <afp> is repositioned
925 * such that the next <esl_msafile_Read()> call will read the
926 * alignment named <key>.
927 *
928 * Returns <eslENOTFOUND> if <key> isn't found in the index
929 * for <afp>.
930 *
931 * Returns <eslEFORMAT> if something goes wrong trying to
932 * read the index, indicating some sort of file format
933 * problem in the SSI file.
934 *
935 * Throws: <eslENODATA> if there's no open SSI index;
936 * <eslEINVAL> if the <offset> is invalid, either requiring rewind
937 * of a nonrewindable stream, or off the end of the data;
938 * <eslESYS> if a system call such as <fread()> fails;
939 * <eslEMEM> on allocation failure.
940 * In all these cases, the state of the <afp> is uncertain
941 * and may be corrupt; the application should not continue
942 * to use it.
943 */
944 int
esl_msafile_PositionByKey(ESL_MSAFILE * afp,const char * key)945 esl_msafile_PositionByKey(ESL_MSAFILE *afp, const char *key)
946 {
947 uint16_t fh;
948 off_t offset;
949 int status;
950
951 if (afp->ssi == NULL) ESL_EXCEPTION(eslENODATA, "Need an open SSI index to call esl_msafile_PositionByKey()");
952 if ((status = esl_ssi_FindName(afp->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status; /* eslENOTFOUND|eslEFORMAT [eslEMEM] */
953 if ((status = esl_buffer_SetOffset(afp->bf, offset)) != eslOK) return status; /* [eslEINVAL|eslESYS|eslEMEM] */
954
955 /* The linenumber gets messed up after a file positioning. Best we can do
956 * is to turn it off (set it to -1). FIX THIS next time SSI format is
957 * changed: add linenumber to a primary key record.
958 */
959 afp->linenumber = -1;
960 return eslOK;
961 }
962
963 /*--------------------- end SSI functions -----------------------*/
964
965
966 /*****************************************************************
967 *# 6. Reading MSAs from input
968 *****************************************************************/
969
970 /* Function: esl_msafile_Read()
971 * Synopsis: Read next MSA from input.
972 *
973 * Purpose: Reads the next MSA from open MSA input <afp>, and return it in
974 * <*ret_msa>.
975 *
976 * Args: afp - open alignment input stream
977 8 *ret_msa - RETURN: alignment
978 *
979 * Returns: <eslOK> on success.
980 *
981 * <eslEFORMAT> on a parse error, and <afp->errmsg> is set
982 * to a user-directed error message; <*ret_msa> is <NULL>.
983 *
984 * If no alignment is found at all, returns <eslEOF>,
985 * and <afp->errmsg> is blank; <*ret_msa> is <NULL>.
986 *
987 * On normal error, <afp> and the return status code may be
988 * passed to <esl_msafile_ReadFailure()> to print diagnostics
989 * to <stderr> (including input source information and line
990 * number) and exit.
991 *
992 * Throws: <eslEMEM> - an allocation failed.
993 * <eslESYS> - a system call such as fread() failed
994 * <eslEINCONCEIVABLE> - "impossible" corruption
995 */
996 int
esl_msafile_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)997 esl_msafile_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
998 {
999 ESL_MSA *msa = NULL;
1000 int status = eslOK;
1001 esl_pos_t offset = esl_buffer_GetOffset(afp->bf);
1002
1003 switch (afp->format) {
1004 case eslMSAFILE_A2M: if ((status = esl_msafile_a2m_Read (afp, &msa)) != eslOK) goto ERROR; break;
1005 case eslMSAFILE_AFA: if ((status = esl_msafile_afa_Read (afp, &msa)) != eslOK) goto ERROR; break;
1006 case eslMSAFILE_CLUSTAL: if ((status = esl_msafile_clustal_Read (afp, &msa)) != eslOK) goto ERROR; break;
1007 case eslMSAFILE_CLUSTALLIKE: if ((status = esl_msafile_clustal_Read (afp, &msa)) != eslOK) goto ERROR; break;
1008 case eslMSAFILE_PFAM: if ((status = esl_msafile_stockholm_Read(afp, &msa)) != eslOK) goto ERROR; break;
1009 case eslMSAFILE_PHYLIP: if ((status = esl_msafile_phylip_Read (afp, &msa)) != eslOK) goto ERROR; break;
1010 case eslMSAFILE_PHYLIPS: if ((status = esl_msafile_phylip_Read (afp, &msa)) != eslOK) goto ERROR; break;
1011 case eslMSAFILE_PSIBLAST: if ((status = esl_msafile_psiblast_Read (afp, &msa)) != eslOK) goto ERROR; break;
1012 case eslMSAFILE_SELEX: if ((status = esl_msafile_selex_Read (afp, &msa)) != eslOK) goto ERROR; break;
1013 case eslMSAFILE_STOCKHOLM: if ((status = esl_msafile_stockholm_Read(afp, &msa)) != eslOK) goto ERROR; break;
1014 default: ESL_EXCEPTION(eslEINCONCEIVABLE, "no such msa file format");
1015 }
1016
1017 msa->offset = offset;
1018 *ret_msa = msa;
1019 return eslOK;
1020
1021 ERROR:
1022 if (msa) esl_msa_Destroy(msa);
1023 *ret_msa = NULL;
1024 return status;
1025 }
1026
1027 /* Function: esl_msafile_ReadFailure()
1028 * Synopsis: Report diagnostics of a normal error in parsing MSA file, and exit.
1029 *
1030 * Purpose: Report user-directed diagnostics of a normal error from
1031 * parsing an MSA file. Output the error message to
1032 * <stderr>, along with information about what we were
1033 * parsing (filename, if it was a file) and where we were
1034 * in the input (linenumber, if we know it). This
1035 * information is all available in <afp>. Then close <afp>
1036 * and exit with the <status> provided by the caller.
1037 *
1038 * Args: afp - open ESL_MSAFILE, containing information about
1039 * the error and the input source.
1040 * status - exit status; generally eslEFORMAT.
1041 *
1042 * Returns: no return. Exits here with <status>.
1043 */
1044 void
esl_msafile_ReadFailure(ESL_MSAFILE * afp,int status)1045 esl_msafile_ReadFailure(ESL_MSAFILE *afp, int status)
1046 {
1047 switch (status) {
1048 case eslEFORMAT: fprintf(stderr, "Alignment input parse error:\n %s\n", afp->errmsg); break;
1049 case eslEOF: fprintf(stderr, "Alignment input appears empty?\n"); break;
1050 default: fprintf(stderr, "Alignment input read error; unexpected code %d\n", status); break;
1051 }
1052
1053 switch (afp->bf->mode_is) {
1054 case eslBUFFER_STREAM: fprintf(stderr, " while reading %s from an input stream (not a file)\n", esl_msafile_DecodeFormat(afp->format)); break;
1055 case eslBUFFER_CMDPIPE: fprintf(stderr, " while reading %s through a pipe (not a file)\n", esl_msafile_DecodeFormat(afp->format)); break;
1056 case eslBUFFER_FILE:
1057 case eslBUFFER_ALLFILE:
1058 case eslBUFFER_MMAP: fprintf(stderr, " while reading %s file %s\n", esl_msafile_DecodeFormat(afp->format), afp->bf->filename); break;
1059 case eslBUFFER_STRING: fprintf(stderr, " while reading %s from a provided string (not a file)\n", esl_msafile_DecodeFormat(afp->format)); break;
1060 default: break;
1061 }
1062
1063 if (afp->linenumber > 0) fprintf(stderr, " at or near line %" PRIu64 "\n", afp->linenumber);
1064 else fprintf(stderr, " at or near byte %" PRIu64 "\n", esl_buffer_GetOffset(afp->bf));
1065
1066 esl_msafile_Close(afp);
1067 exit(status);
1068 }
1069 /*------------ end, reading MSA from ESL_MSAFILE ---------------*/
1070
1071
1072
1073
1074 /*****************************************************************
1075 *# 7. Writing an MSA to a stream.
1076 *****************************************************************/
1077
1078 /* Function: esl_msafile_Write()
1079 * Synopsis: Write an MSA to a stream.
1080 *
1081 * Purpose: Writes alignment <msa> to open stream <fp> in format <fmt>.
1082 *
1083 * In general, the <msa> is unchanged, but there are some
1084 * exceptions. For example, writing an alignment in A2M format
1085 * will alter alignment data (marking missing data
1086 * symbols on heuristically defined sequence fragments) and
1087 * create an <\#=RF> annotation line, if an <msa->rf>
1088 * annotation line isn't already present in the <msa>.
1089 *
1090 * Args: fp - open stream (such as <stdout>)
1091 * msa - alignment to write
1092 * fmt - format code (such as <eslMSAFILE_STOCKHOLM>)
1093 *
1094 * Returns: <eslOK> on success.
1095 *
1096 * Throws: <eslEMEM> on allocation error.
1097 * <eslEWRITE> on any system write error, such as a filled disk.
1098 */
1099 int
esl_msafile_Write(FILE * fp,ESL_MSA * msa,int fmt)1100 esl_msafile_Write(FILE *fp, ESL_MSA *msa, int fmt)
1101 {
1102 int status;
1103
1104 switch (fmt) {
1105 case eslMSAFILE_STOCKHOLM: status = esl_msafile_stockholm_Write(fp, msa, eslMSAFILE_STOCKHOLM); break;
1106 case eslMSAFILE_PFAM: status = esl_msafile_stockholm_Write(fp, msa, eslMSAFILE_PFAM); break;
1107 case eslMSAFILE_A2M: status = esl_msafile_a2m_Write (fp, msa); break;
1108 case eslMSAFILE_PSIBLAST: status = esl_msafile_psiblast_Write (fp, msa); break;
1109 case eslMSAFILE_SELEX: status = esl_msafile_selex_Write (fp, msa); break;
1110 case eslMSAFILE_AFA: status = esl_msafile_afa_Write (fp, msa); break;
1111 case eslMSAFILE_CLUSTAL: status = esl_msafile_clustal_Write (fp, msa, eslMSAFILE_CLUSTAL); break;
1112 case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_Write (fp, msa, eslMSAFILE_CLUSTALLIKE); break;
1113 case eslMSAFILE_PHYLIP: status = esl_msafile_phylip_Write (fp, msa, eslMSAFILE_PHYLIP, NULL); break;
1114 case eslMSAFILE_PHYLIPS: status = esl_msafile_phylip_Write (fp, msa, eslMSAFILE_PHYLIPS, NULL); break;
1115 default: ESL_EXCEPTION(eslEINCONCEIVABLE, "no such msa file format");
1116 }
1117 return status;
1118 }
1119
1120 /*-------------- end, writing MSA to stream ---------------------*/
1121
1122
1123 /*****************************************************************
1124 *# 8. MSA functions that depend on MSAFILE
1125 *****************************************************************/
1126
1127 /* Function: esl_msa_CreateFromString()
1128 * Synopsis: Creates a small <ESL_MSA> from a test case string.
1129 *
1130 * Purpose: A convenience for making small test cases in the test
1131 * suites: given the contents of a complete multiple
1132 * sequence alignment file as a single string <s> in
1133 * alignment format <fmt>, convert it to an <ESL_MSA>.
1134 *
1135 * For example,
1136 * ```
1137 * esl_msa_CreateFromString("# STOCKHOLM 1.0\n"
1138 * "seq1 AAAAA\n"
1139 * "seq2 AAAAA\n"
1140 * "//\n", eslMSAFILE_STOCKHOLM);
1141 * ```
1142 * creates an ungapped alignment of two AAAAA sequences.
1143 * (Using string concatenation in C99 makes for a cleaner
1144 * looking multi-line string.)
1145 *
1146 * The <msa> is in text mode. If you want it in digital
1147 * mode, use <esl_msa_Digitize()> on it.
1148 *
1149 * Returns: a pointer to the new <ESL_MSA> on success.
1150 *
1151 * Throws: <NULL> if it fails to obtain, open, or read the temporary file
1152 * that it puts the string <s> in.
1153 */
1154 ESL_MSA *
esl_msa_CreateFromString(const char * s,int fmt)1155 esl_msa_CreateFromString(const char *s, int fmt)
1156 {
1157 ESL_MSAFILE *mfp = NULL;
1158 ESL_MSA *msa = NULL;
1159
1160 if (esl_msafile_OpenMem(NULL, s, -1, fmt, NULL, &mfp) != eslOK) goto ERROR;
1161 if (esl_msafile_Read(mfp, &msa) != eslOK) goto ERROR;
1162 esl_msafile_Close(mfp);
1163 return msa;
1164
1165 ERROR:
1166 if (mfp) esl_msafile_Close(mfp);
1167 if (msa) esl_msa_Destroy(msa);
1168 return NULL;
1169 }
1170
1171 /*****************************************************************
1172 *# 9. Utilities used by specific format parsers.
1173 *****************************************************************/
1174
1175 /* Function: esl_msafile_GetLine()
1176 * Synopsis: Read next line of input alignment file.
1177 *
1178 * Purpose: Read next line of input <afp>, into its internal
1179 * data fields: <afp->line> points to the start of the
1180 * line in <afp->bf>, <afp->n> is its length in
1181 * bytes, <afp->lineoffset> is the offset in the input
1182 * to the start of the line, and <afp->linenumber> is
1183 * the linenumber from <1..N> for <N> total lines in the
1184 * input.
1185 *
1186 * Optionally, caller can request <*opt_p>, <*opt_n>,
1187 * which are set to <afp->line>,<afp->n>. This gives the
1188 * caller a modifiable line pointer that it can step
1189 * through, while <afp->line> is preserved for possible
1190 * diagnostics if anything goes awry.
1191 *
1192 * Args: <afp> : an open alignment file input
1193 * <*opt_p> : optRETURN: modifiable copy of <afp->line> pointer
1194 * <*opt_n> : optRETURN: modifiable copy of <afp->n>
1195 *
1196 * Returns: <eslOK> on success.
1197 *
1198 * <eslEOF> at EOF. Now <afp->line> is <NULL>, <afp->n>
1199 * is <0>, and <afp->lineoffset> is <0>. <afp->linenumber>
1200 * is the total number of lines in the input.
1201 *
1202 * Throws: <eslEMEM> if an allocation fails.
1203 * <eslESYS> if a system call fails, such as fread().
1204 * <eslEINCONCEIVABLE> on internal code errors.
1205 */
1206 int
esl_msafile_GetLine(ESL_MSAFILE * afp,char ** opt_p,esl_pos_t * opt_n)1207 esl_msafile_GetLine(ESL_MSAFILE *afp, char **opt_p, esl_pos_t *opt_n)
1208 {
1209 int status;
1210
1211 afp->lineoffset = esl_buffer_GetOffset(afp->bf);
1212 if ((status = esl_buffer_GetLine(afp->bf, &(afp->line), &(afp->n))) != eslOK) goto ERROR;
1213 if (afp->linenumber != -1) afp->linenumber++;
1214
1215 if (opt_p) *opt_p = afp->line;
1216 if (opt_n) *opt_n = afp->n;
1217 return eslOK;
1218
1219 ERROR:
1220 afp->line = NULL;
1221 afp->n = 0;
1222 afp->lineoffset = -1;
1223 /* leave linenumber alone. on EOF, it is the number of lines in the file, and that might be useful. */
1224 if (opt_p) *opt_p = NULL;
1225 if (opt_n) *opt_n = 0;
1226 return status;
1227 }
1228
1229
1230 /* Function: esl_msafile_PutLine()
1231 * Synopsis: Put the line we just read back in the input stream
1232 *
1233 * Purpose: Put the line we just read back in the input stream
1234 * and unset <afp->line> and its associated information
1235 * internally. The next <esl_msafile_GetLine()> call
1236 * will read exactly the same line again.
1237 *
1238 * This gets used in parsing files that contain multiple
1239 * MSAs. If the way we determine that an MSA record has
1240 * ended is by reading the first line of the next MSA
1241 * record, then we may want to stuff it back in the input
1242 * buffer, so it gets parsed properly as part of the next
1243 * record. In Pfam/Stockholm parsing we don't have to
1244 * do this, because the first line is just a format code,
1245 * with no record-specific data. But in PHYLIP multiple MSA
1246 * format, for example, the first line is nseq,alen.
1247 *
1248 * Args: afp - the open input stream
1249 *
1250 * Returns: <eslOK> on succes
1251 *
1252 * Throws: <eslEMEM>, <eslESYS>, <eslEINCONCEIVABLE> if the
1253 * <esl_buffer_Set()> call fails.
1254 */
1255 int
esl_msafile_PutLine(ESL_MSAFILE * afp)1256 esl_msafile_PutLine(ESL_MSAFILE *afp)
1257 {
1258 int status;
1259 if ((status = esl_buffer_Set(afp->bf, afp->line, 0)) != eslOK) return status;
1260 afp->line = NULL;
1261 afp->n = 0;
1262 if (afp->linenumber != -1) afp->linenumber--;
1263 afp->lineoffset = -1;
1264 return eslOK;
1265 }
1266
1267 /*--------------- end, parser utilities -------------------------*/
1268
1269
1270
1271 /*****************************************************************
1272 * 10. Unit tests
1273 *****************************************************************/
1274 #ifdef eslMSAFILE_TESTDRIVE
1275
1276
1277 static void
utest_format2format(int fmt1,int fmt2)1278 utest_format2format(int fmt1, int fmt2)
1279 {
1280 char msg[] = "esl_msafile: format2format unit test failed";
1281 char tmpfile1[32] = "esltmpXXXXXX";
1282 char tmpfile2[32] = "esltmpXXXXXX";
1283 char tmpfile3[32] = "esltmpXXXXXX";
1284 FILE *ofp = NULL;
1285 ESL_MSA *msa1, *msa2, *msa3, *msa4;
1286 ESL_MSAFILE *afp;
1287 int alphatype = eslAMINO;
1288 ESL_ALPHABET *abc = esl_alphabet_Create(alphatype);
1289 ESL_ALPHABET *abc2 = NULL;
1290
1291 char *testmsa = "\
1292 # STOCKHOLM 1.0\n\
1293 seq1 ACDEFGHIKLMNPQRSTVWYacdefghiklmnpq------ACDEFGHIKLMNPQRSTVWYacde......mnpqrstvwyACDEFGHI______RSTVWYacdefghiklmnpqrstvwy\n\
1294 seq2 ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy\n\
1295 //\n";
1296
1297 /* Create the test msa, msa1, digital mode, no autodetections */
1298 if ( esl_msafile_OpenMem(&abc, testmsa, strlen(testmsa), eslMSAFILE_STOCKHOLM, NULL, &afp) != eslOK) esl_fatal(msg);
1299 if ( esl_msafile_Read(afp, &msa1) != eslOK) esl_fatal(msg);
1300 esl_msafile_Close(afp);
1301
1302 /* Write it to tmpfile1 in fmt1. (This exercises writing of digital MSAs, in all <fmt1> formats) */
1303 if ( esl_tmpfile_named(tmpfile1, &ofp) != eslOK) esl_fatal(msg);
1304 if ( esl_msafile_Write(ofp, msa1, fmt1) != eslOK) esl_fatal(msg);
1305 fclose(ofp);
1306
1307 /* Read it back from <fmt1> in TEXT mode (verbatim), with format autodetection (except A2M) */
1308 if (fmt1 != eslMSAFILE_A2M)
1309 {
1310 if ( esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(msg);
1311 if (fmt1 == eslMSAFILE_PFAM && afp->format == eslMSAFILE_STOCKHOLM) afp->format = eslMSAFILE_PFAM;
1312 if (fmt1 == eslMSAFILE_PSIBLAST && afp->format == eslMSAFILE_SELEX) afp->format = eslMSAFILE_PSIBLAST;
1313 if ( esl_msafile_Read(afp, &msa2) != eslOK) esl_fatal(msg);
1314 esl_msafile_Close(afp);
1315 }
1316 else // without autodetection:
1317 {
1318 if ( esl_msafile_Open(NULL, tmpfile1, NULL, fmt1, NULL, &afp) != eslOK) esl_fatal(msg);
1319 if ( esl_msafile_Read(afp, &msa2) != eslOK) esl_fatal(msg);
1320 esl_msafile_Close(afp);
1321 }
1322
1323 /* Write it to tmpfile2 in fmt2. (This exercises writing of text-mode MSAs, in all <fmt2> formats) */
1324 if ( esl_tmpfile_named(tmpfile2, &ofp) != eslOK) esl_fatal(msg);
1325 if ( esl_msafile_Write(ofp, msa2, fmt2) != eslOK) esl_fatal(msg);
1326 fclose(ofp);
1327
1328 /* Read it back in TEXT mode, with format autodetection (except A2M) */
1329 if (fmt2 != eslMSAFILE_A2M)
1330 {
1331 if ( esl_msafile_Open(NULL, tmpfile2, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(msg);
1332 if (fmt2 == eslMSAFILE_PFAM && afp->format == eslMSAFILE_STOCKHOLM) afp->format = eslMSAFILE_PFAM;
1333 if (fmt2 == eslMSAFILE_PSIBLAST && afp->format == eslMSAFILE_SELEX) afp->format = eslMSAFILE_PSIBLAST;
1334 if ( esl_msafile_Read(afp, &msa3) != eslOK) esl_fatal(msg);
1335 esl_msafile_Close(afp);
1336 }
1337 else // without autodetection:
1338 {
1339 if ( esl_msafile_Open(NULL, tmpfile2, NULL, fmt2, NULL, &afp) != eslOK) esl_fatal(msg);
1340 if ( esl_msafile_Read(afp, &msa3) != eslOK) esl_fatal(msg);
1341 esl_msafile_Close(afp);
1342 }
1343
1344 /* Write it to tmpfile4 in fmt2. (This exercises writing of digital-mode MSAs, in all <fmt2> formats */
1345 if ( esl_tmpfile_named(tmpfile3, &ofp) != eslOK) esl_fatal(msg);
1346 if ( esl_msafile_Write(ofp, msa3, fmt2) != eslOK) esl_fatal(msg);
1347 fclose(ofp);
1348
1349 /* Read it back in DIGITAL mode, with alphabet autodetection but not format */
1350 if ( esl_msafile_Open(&abc2, tmpfile3, NULL, fmt2, NULL, &afp) != eslOK) esl_fatal(msg);
1351 if ( esl_msafile_Read(afp, &msa4) != eslOK) esl_fatal(msg);
1352 esl_msafile_Close(afp);
1353
1354 /* Now:
1355 * msa1 = digital mode test alignment, created from Stockholm string
1356 * msa2 = TEXT mode, read from <fmt1> tmpfile1
1357 * msa3 = TEXT mode, read from <fmt2> tmpfile2
1358 * msa4 = digital mode, read from <fmt2> tmpfile3
1359 * So we expect:
1360 * msa2==msa3
1361 * msa1==msa4
1362 */
1363
1364 /* some normalization before comparing alignments. */
1365 esl_msa_SymConvert(msa2, "abcdefghijklmnopqrstuvwxyz.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ-");
1366 esl_msa_SymConvert(msa3, "abcdefghijklmnopqrstuvwxyz.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ-");
1367
1368 if (msa2->rf) { free (msa2->rf); msa2->rf = NULL; }
1369 if (msa3->rf) { free (msa3->rf); msa3->rf = NULL; }
1370 if (msa4->rf) { free (msa4->rf); msa4->rf = NULL; }
1371
1372 if (esl_msa_Compare(msa2, msa3) != eslOK) esl_fatal(msg);
1373 if (esl_msa_Compare(msa1, msa4) != eslOK) esl_fatal(msg);
1374
1375 remove(tmpfile1);
1376 remove(tmpfile2);
1377 remove(tmpfile3);
1378 esl_msa_Destroy(msa1);
1379 esl_msa_Destroy(msa2);
1380 esl_msa_Destroy(msa3);
1381 esl_msa_Destroy(msa4);
1382 esl_alphabet_Destroy(abc);
1383 esl_alphabet_Destroy(abc2);
1384 }
1385 #endif /*eslMSAFILE_TESTDRIVE*/
1386 /*----------------- end, unit tests -----------------------------*/
1387
1388
1389 /*****************************************************************
1390 * 11. Test driver
1391 *****************************************************************/
1392 #ifdef eslMSAFILE_TESTDRIVE
1393
1394 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_utest -DeslMSAFILE_TESTDRIVE esl_msafile.c -leasel -lm
1395 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_utest -DeslMSAFILE_TESTDRIVE esl_msafile.c -leasel -lm
1396 * run: ./esl_msafile_utest
1397 */
1398 #include "esl_config.h"
1399
1400 #include <stdio.h>
1401
1402 #include "easel.h"
1403 #include "esl_getopts.h"
1404 #include "esl_msafile.h"
1405
1406 static ESL_OPTIONS options[] = {
1407 /* name type default env range togs reqs incomp help docgrp */
1408 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
1409 { 0,0,0,0,0,0,0,0,0,0},
1410 };
1411 static char usage[] = "[-options]";
1412 static char banner[] = "test driver for MSA input/output format module";
1413
1414 int
main(int argc,char ** argv)1415 main(int argc, char **argv)
1416 {
1417 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1418 int fmt1, fmt2;
1419
1420 for (fmt1 = eslMSAFILE_STOCKHOLM; fmt1 <= eslMSAFILE_PHYLIPS; fmt1++)
1421 for (fmt2 = eslMSAFILE_STOCKHOLM; fmt2 <= eslMSAFILE_PHYLIPS; fmt2++)
1422 utest_format2format(fmt1, fmt2);
1423
1424 esl_getopts_Destroy(go);
1425 exit(0);
1426 }
1427
1428 #endif /*eslMSAFILE_TESTDRIVE*/
1429 /*----------------- end, test driver ----------------------------*/
1430
1431
1432
1433 /*****************************************************************
1434 * 12. Examples.
1435 *****************************************************************/
1436
1437 #ifdef eslMSAFILE_EXAMPLE
1438 /*::cexcerpt::msafile_example::begin::*/
1439 #include <stdio.h>
1440 #include "easel.h"
1441 #include "esl_alphabet.h"
1442 #include "esl_getopts.h"
1443 #include "esl_msa.h"
1444 #include "esl_msafile.h"
1445
1446 static ESL_OPTIONS options[] = {
1447 /* name type default env range toggles reqs incomp help docgroup*/
1448 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1449 { "-i", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show info, instead of converting format", 0 },
1450 { "--informat", eslARG_STRING, NULL, NULL, NULL, NULL, NULL, NULL, "specify the input MSA file is in format <s>", 0 },
1451 { "--outformat", eslARG_STRING, "Clustal", NULL, NULL, NULL, NULL, NULL, "write the output MSA in format <s>", 0 },
1452 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet", 0 },
1453 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet", 0 },
1454 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use protein alphabet", 0 },
1455 { "--text", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
1456 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1457 };
1458 static char usage[] = "[-options] <msafile>";
1459 static char banner[] = "example of multiple alignment input and output using the msafile module(s)";
1460
1461 int
main(int argc,char ** argv)1462 main(int argc, char **argv)
1463 {
1464 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1465 char *msafile = esl_opt_GetArg(go, 1);
1466 int infmt = eslMSAFILE_UNKNOWN;
1467 int outfmt = eslMSAFILE_UNKNOWN;
1468 ESL_ALPHABET *abc = NULL;
1469 ESL_MSAFILE *afp = NULL;
1470 ESL_MSA *msa = NULL;
1471 int textmode = esl_opt_GetBoolean(go, "--text");
1472 int showinfo = esl_opt_GetBoolean(go, "-i");
1473 int nali = 0;
1474 int status;
1475
1476 /* If you know the alphabet you want, create it - you'll pass it to esl_msafile_Open() */
1477 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
1478 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
1479 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1480
1481 /* If you know the MSA file format, set it (<infmt>, here). */
1482 if (esl_opt_IsOn(go, "--informat") &&
1483 (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
1484 esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat"));
1485
1486 /* Open in text or digital mode.
1487 * To let the Open() function autoguess the format, you pass <infmt=eslMSAFILE_UNKNOWN>.
1488 * To let it autoguess the alphabet, you set <abc=NULL> and pass <&abc>.
1489 * To open in text mode instead of digital, you pass <NULL> for the alphabet argument.
1490 * esl_msafile_OpenFailure() is a convenience, printing various diagnostics of any
1491 * open failure to <stderr>. You can of course handle your own diagnostics instead.
1492 */
1493 if (textmode) status = esl_msafile_Open(NULL, msafile, NULL, infmt, NULL, &afp);
1494 else status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp);
1495 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
1496
1497 if (showinfo) {
1498 printf("# Format: %s\n", esl_msafile_DecodeFormat(afp->format));
1499 printf("# Alphabet: %s\n", (afp->abc ? esl_abc_DecodeType(afp->abc->type) : "text mode"));
1500 }
1501
1502 /* Choose the output file format */
1503 if ( (outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"))) == eslMSAFILE_UNKNOWN)
1504 esl_fatal("%s is not a valid MSA file format for --outformat", esl_opt_GetString(go, "--outformat"));
1505
1506 while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
1507 {
1508 /* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data;
1509 * if text MSA: msa->aseq[idx=0..nseq-1][acol=0..alen-1] */
1510 nali++;
1511
1512 if (showinfo) printf("# alignment %5d: %15s: %6d seqs, %5d columns\n\n", nali, msa->name, (int) msa->nseq, (int) msa->alen);
1513 else esl_msafile_Write(stdout, msa, outfmt);
1514
1515 esl_msa_Destroy(msa);
1516 }
1517 if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */
1518
1519 esl_alphabet_Destroy(abc);
1520 esl_msafile_Close(afp);
1521 esl_getopts_Destroy(go);
1522 exit(0);
1523 }
1524 /*::cexcerpt::msafile_example::end::*/
1525 #endif /*eslMSAFILE_EXAMPLE*/
1526 /*------------------------ end of examples -----------------------*/
1527
1528