1 /* Multiple sequence alignment file i/o
2  *
3  * Table of contents:
4  *    1. Opening/closing an ESL_MSAFILE.
5  *    2. ESL_MSAFILE_FMTDATA: optional added constraints on formats.
6  *    3. Guessing file formats.
7  *    4. Guessing alphabets.
8  *    5. Random MSA flatfile access.
9  *    6. Reading an MSA from an ESL_MSAFILE.
10  *    7. Writing an MSA to a stream.
11  *    8. MSA functions that depend on MSAFILE
12  *    9. Utilities used by specific format parsers.
13  *   10. Unit tests.
14  *   11. Test driver.
15  *   12. Examples.
16  */
17 #include "esl_config.h"
18 
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22 
23 #include "easel.h"
24 #include "esl_alphabet.h"
25 #include "esl_buffer.h"
26 #include "esl_mem.h"
27 #include "esl_msa.h"
28 #include "esl_ssi.h"
29 
30 #include "esl_msafile.h"
31 
32 /*****************************************************************
33  *# 1. Opening/closing an ESL_MSAFILE
34  *****************************************************************/
35 
36 static int msafile_Create    (ESL_MSAFILE **ret_afp);
37 static int msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE *afp);
38 
39 /* Function:  esl_msafile_Open()
40  * Synopsis:  Open a multiple sequence alignment file for input.
41  *
42  * Purpose:   Open a multiple sequence alignment file <msafile> for input.
43  *            Return an open <ESL_MSAFILE> handle in <*ret_afp>.
44  *
45  *            <msafile> is usually the name of a file. Alignments may
46  *            also be read from standard input, or from
47  *            gzip-compressed files.  If <msafile> is ``-'', alignment
48  *            input is taken from the standard input stream. If
49  *            <msafile> ends in ``.gz'', alignment input is read
50  *            through a pipe from <gzip -dc>.
51  *
52  *            <byp_abc>, <env>, <format>, and <fmtd> support a variety
53  *            of optional/advanced operations, as described
54  *            below. Minimally, a caller can set <byp_abc> to <NULL>,
55  *            <format> to <eslMSAFILE_UNKNOWN>, and <fmtd> to <NULL>,
56  *            and <msafile> will be opened in text mode; in the
57  *            current working directory; and its format will be
58  *            autodetected.
59  *
60  *            The <byp_abc> argument controls whether data are to be
61  *            read in text or digital mode. In digital mode, alignment
62  *            data are immediately digitized into an Easel internal
63  *            alphabet (which among other things, allows various
64  *            things to operate on sequence data more efficiently) and
65  *            because an expected alphabet is known, parsers are able
66  *            to detect invalid characters. The caller may either
67  *            provide an alphabet (thus asserting what it's expected
68  *            to be), or have <esl_msafile_Open()> look at the file
69  *            and guess what alphabet it appears to be (DNA or amino
70  *            acid code, usually).  In text mode, alignment data are
71  *            read verbatim. It might be advantageous for an
72  *            application to read in text mode -- for example, if a
73  *            variant alignment format is using characters in some
74  *            special way, and you need to deal with them specially.
75  *            All this goes through the setting of the passed-by-reference
76  *            alphabet pointer <byp_abc>. If caller passes NULL for
77  *            the <byp_abc> argument, input is in text mode. If caller
78  *            provides a valid non-NULL <byp_abc> pointer but
79  *            <*byp_abc> is NULL (that is, caller has declared
80  *            <ESL_ALPHABET *abc = NULL> and passed <&abc> as an
81  *            argument), then we attempt to guess the digital alphabet
82  *            using <esl_msafile_GuessAlphabet()>, based on the first
83  *            alignment in the input. In this case, the new alphabet
84  *            is allocated here and returned to the caller. If caller
85  *            provides a digital alphabet (that is, <ESL_ALPHABET *abc
86  *            = esl_alphabet_Create...()> and passed <&abc>), that's
87  *            the alphabet we use.
88  *
89  *            The <env> argument controls where we search for the
90  *            <msafile>.  If <env> is <NULL>, only the current working
91  *            directory is checked.  Optionally, caller can provide in
92  *            <env> the name of an environment variable ("PFAMDB",
93  *            perhaps), in which the routine can find a
94  *            colon-delimited list of directories.  Then, if <msafile>
95  *            is not found in the current working directory, we look
96  *            for it in these directories, in the order they're
97  *            listed.
98  *
99  *            The <format> argument allows the caller to either allow
100  *            <esl_msafile_Open()> to autodetect the file format of
101  *            <msafile>, or to assert that it knows the file is in a
102  *            particular format. If <format> is <eslMSAFILE_UNKNOWN>,
103  *            format autodetection is performed. Other valid codes include:
104  *             | <eslMSAFILE_STOCKHOLM>   | Stockholm format                    |
105  *             | <eslMSAFILE_AFA>         | Aligned FASTA format                |
106  *             | <eslMSAFILE_CLUSTAL>     | Clustal format (strict)             |
107  *             | <eslMSAFILE_CLUSTALLIKE> | Clustal-like  (MUSCLE, PROBCONS...) |
108  *             | <eslMSAFILE_PHYLIP>      | PHYLIP interleaved format           |
109  *             | <eslMSAFILE_PHYLIPS>     | PHYLIP sequential format            |
110  *             | <eslMSAFILE_A2M>         | UCSC SAM A2M (dotless or dotful)    |
111  *             | <eslMSAFILE_PSIBLAST>    | NCBI PSI-BLAST                      |
112  *             | <eslMSAFILE_SELEX>       | a general alignment block format    |
113  *
114  *            The <fmtd> argument is an optional pointer to a
115  *            <ESL_MSAFILE_FMTDATA> structure that the caller may
116  *            initialize and provide, in order to assert any
117  *            additional unusual constraints on the input format --
118  *            for example, to dictate that a PHYLIP format file must
119  *            have some nonstandard name field width. Generally,
120  *            though, <fmtd> will be <NULL>, and such things would
121  *            either be autodetected as part of the autodetected
122  *            format, or the strict version of the parser will be
123  *            used.  (That is, if you open a PHYLIP file with
124  *            <format=eslMSAFILE_UNKNOWN> and a <NULL fmtd>, the
125  *            format autodetector is used, and it will automagically
126  *            detect nonstandard PHYLIP namewidths != 10. But if you
127  *            open a PHYLIP file with <format=eslMSAFILE_PHYLIP> and a
128  *            <NULL fmtd>, then you get the strict interleaved PHYLIP
129  *            parser, which requires a name width of exactly 10.)
130  *
131  * Args:      byp_abc   - digital alphabet to use, or NULL for text mode
132  *                        if <*byp_abc> is NULL, guess the digital alphabet,
133  *                        create it, and return it in <*byp_abc>.
134  *                        If <*byp_abc> is a digital alphabet, use it.
135  *            msafile   - name of alignment input to open;
136  *                        if "-", read standard input;
137  *                        if "*.gz", read through a <gzip -dc> pipe.
138  *            env       - <NULL>, or the name of an environment variable
139  *                        containing colon-delimited list of directories
140  *                        in which to search for <msafile> (e.g. "PFAMDB").
141  *            format    - format code, such as <eslMSAFILE_STOCKHOLM>;
142  *                        or <eslMSAFILE_UNKNOWN> to autodetect format.
143  *            fmtd      - <NULL>, or a pointer to an initialized
144  *                        <ESL_MSAFILE_FMTDATA> structure, containing
145  *                        any additional unusual constraints to apply
146  *                        to the input format.
147  *            *ret_afp  - RETURN: open MSA input stream.
148  *
149  * Returns:   <eslOK> on success, and <*ret_afp> is the newly opened msa file.
150  *
151  *            <eslENOTFOUND> if <msafile> doesn't exist or can't be
152  *            opened for reading; or (in the case of a <.gz> file) if
153  *            a <gzip> executable doesn't exist in user's <PATH> or
154  *            can't be executed. <afp->errmsg> is something like
155  *            "couldn't open %s for reading", with <%s> being the
156  *            name of the msafile.
157  *
158  *            <eslENOFORMAT> if we tried to autodetect the file format
159  *            (caller provided <format=eslMSAFILE_UNKNOWN>), and
160  *            failed. <afp->errmsg> is an informative user-directed
161  *            message, minimally something like "couldn't determine alignment
162  *            input format", possibly more detailed.
163  *
164  *            <eslENOALPHABET> if we tried to autodetect the alphabet
165  *            (caller provided <&abc>, <abc=NULL> to request digital
166  *            mode w/ alphabet autodetection) but the alphabet could
167  *            not be reliably guessed.
168  *
169  *            <eslFAIL> in the case of a <.gz> file and the <gzip -dc>
170  *            command fails on it.
171  *
172  *            On any of these normal errors, <*ret_afp> is returned in
173  *            an error state, containing a user-directed error message
174  *            in <afp->errmsg> and (if relevant) the full path to
175  *            <msafile> that we attempted to open in
176  *            <afp->bf->filename>. See <esl_msafile_OpenFailure()> for
177  *            a function that gives a standard way of reporting these
178  *            diagnostics to <stderr>.
179  *
180  * Throws:    <eslEMEM> on allocation failure.
181  *            <eslESYS> on a system call failure, such as <fread()>.
182  *            <eslEINVAL> if we tried to use <stdin> but the <stdin> stream was
183  *            invalid (in an error state, <NULL>, or at <EOF>).
184  *            On thrown exceptions, <*ret_afp> is <NULL>.
185  */
186 int
esl_msafile_Open(ESL_ALPHABET ** byp_abc,const char * msafile,const char * env,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE ** ret_afp)187 esl_msafile_Open(ESL_ALPHABET **byp_abc, const char *msafile, const char *env, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp)
188 {
189   ESL_MSAFILE *afp = NULL;
190   int          status;
191 
192   if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR;
193 
194   if ((status = esl_buffer_Open(msafile, env, &(afp->bf))) != eslOK)
195     ESL_XFAIL(status, afp->errmsg, "%s", afp->bf->errmsg); /* ENOTFOUND; FAIL are normal here */
196 
197   if ( (status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR;
198 
199   *ret_afp = afp;
200   return eslOK;
201 
202  ERROR:  /* on normal errors, afp is returned in an error state */
203   if (status == eslENOTFOUND || status == eslFAIL || status == eslENOFORMAT || status == eslENODATA || status == eslENOALPHABET)
204     { afp->abc = NULL; *ret_afp = afp;}
205   else
206     { if (afp) esl_msafile_Close(afp);  *ret_afp = NULL; }
207   return status;
208 }
209 
210 
211 /* Function:  esl_msafile_OpenMem()
212  * Synopsis:  Open a string or buffer for parsing as an MSA.
213  *
214  * Purpose:   Essentially the same as <esl_msafile_Open()>, except
215  *            we ``open'' the string or buffer <p>, <n> as the
216  *            input source.
217  *
218  *            If <p> is a NUL-terminated string, providing its length
219  *            <n> is optional; <n> may be passed as -1. If <p> is an
220  *            unterminated buffer, providing the length <n> is
221  *            mandatory.
222  */
223 int
esl_msafile_OpenMem(ESL_ALPHABET ** byp_abc,const char * p,esl_pos_t n,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE ** ret_afp)224 esl_msafile_OpenMem(ESL_ALPHABET **byp_abc, const char *p, esl_pos_t n, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp)
225 {
226   ESL_MSAFILE *afp = NULL;
227   int status;
228 
229   if ( (status = msafile_Create(&afp))                 != eslOK) goto ERROR;
230   if ( (status = esl_buffer_OpenMem(p, n, &(afp->bf))) != eslOK) goto ERROR;
231 
232   if ( (status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR;
233   *ret_afp = afp;
234   return eslOK;
235 
236  ERROR:
237   if (status == eslENOTFOUND || status == eslFAIL || status == eslEFORMAT || status == eslENODATA || status == eslENOALPHABET)
238     { afp->abc = NULL; *ret_afp = afp;}
239   else
240     { if (afp) esl_msafile_Close(afp);  *ret_afp = NULL; }
241   return status;
242 }
243 
244 
245 /* Function:  esl_msafile_OpenBuffer()
246  * Synopsis:  Open an input buffer for parsing as an MSA.
247  *
248  * Purpose:   Essentially the same as <esl_msafile_Open()>, except
249  *            we ``open'' an <ESL_BUFFER> <bf> that's already been
250  *            opened by the caller for some input source.
251  */
252 int
esl_msafile_OpenBuffer(ESL_ALPHABET ** byp_abc,ESL_BUFFER * bf,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE ** ret_afp)253 esl_msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp)
254 {
255   ESL_MSAFILE *afp = NULL;
256   int status;
257 
258   if ( (status = msafile_Create(&afp)) != eslOK) goto ERROR;
259 
260   afp->bf = bf;
261   if ((status = msafile_OpenBuffer(byp_abc, afp->bf, format, fmtd, afp)) != eslOK) goto ERROR;
262   *ret_afp = afp;
263   return eslOK;
264 
265  ERROR:
266   if (status == eslENOTFOUND || status == eslFAIL || status == eslEFORMAT || status == eslENODATA || status == eslENOALPHABET)
267     { afp->abc = NULL; *ret_afp = afp;}
268   else
269     { if (afp) esl_msafile_Close(afp);  *ret_afp = NULL; }
270   return status;
271 }
272 
273 /* Function:  esl_msafile_OpenFailure()
274  * Synopsis:  Report diagnostics of normal error in opening MSA file, and exit.
275  *
276  * Purpose:   Report user-directed diagnostics of a normal error in opening
277  *            an MSA input. Print a message to <stderr>, then exit.
278  */
279 void
esl_msafile_OpenFailure(ESL_MSAFILE * afp,int status)280 esl_msafile_OpenFailure(ESL_MSAFILE *afp, int status)
281 {
282   int show_source = FALSE;
283   int show_fmt    = FALSE;
284 
285   fprintf(stderr, "Alignment input open failed.\n");
286 
287   if      (status == eslENOTFOUND)   { fprintf(stderr, "   %s\n", afp->errmsg);                                      }
288   else if (status == eslFAIL)	     { fprintf(stderr, "   %s\n", afp->errmsg);                                      }
289   else if (status == eslENOFORMAT)   { fprintf(stderr, "   %s\n", afp->errmsg); show_source = TRUE;                  }
290   else if (status == eslENOALPHABET) { fprintf(stderr, "   %s\n", afp->errmsg); show_source = TRUE; show_fmt = TRUE; }
291   else if (status == eslEMEM)        { fprintf(stderr, "   Memory allocation failure\n");                            }
292   else if (status == eslESYS)        { fprintf(stderr, "   System call failed, possibly fread()\n");                 }
293   else                               { fprintf(stderr, "   Unexpected error code %d\n", status);                     }
294 
295   if (show_source) {
296     switch (afp->bf->mode_is) {
297     case eslBUFFER_STREAM:   fprintf(stderr, "   while reading from an input stream (not a file)\n");   break;
298     case eslBUFFER_CMDPIPE:  fprintf(stderr, "   while reading through a pipe (not a file)\n");         break;
299     case eslBUFFER_FILE:
300     case eslBUFFER_ALLFILE:
301     case eslBUFFER_MMAP:     fprintf(stderr, "   while reading file %s\n", afp->bf->filename);          break;
302     case eslBUFFER_STRING:   fprintf(stderr, "   while reading from a provided string (not a file)\n"); break;
303     default:                 break;
304     }
305   }
306 
307   if (show_fmt) {
308     fprintf(stderr, "   while parsing for %s format\n", esl_msafile_DecodeFormat(afp->format));
309   }
310 
311   esl_msafile_Close(afp);
312   exit(status);
313 }
314 
315 /* Function:  esl_msafile_SetDigital()
316  * Synopsis:  Convert an open text-mode ESL_MSAFILE to digital mode.
317  *
318  * Purpose:   Convert the open <afp> from text mode to digital mode,
319  *            using alphabet <abc>.
320  *
321  * Note:      This function is only here for legacy support: it's
322  *            called by esl_sqio, which still uses an outdated
323  *            open / guess alphabet / set digital pattern. When
324  *            sqio is upgraded next, this function should be removed.
325  */
326 int
esl_msafile_SetDigital(ESL_MSAFILE * afp,const ESL_ALPHABET * abc)327 esl_msafile_SetDigital(ESL_MSAFILE *afp, const ESL_ALPHABET *abc)
328 {
329   int status;
330 
331   afp->abc = abc;
332 
333   switch (afp->format) {
334   case eslMSAFILE_A2M:          status = esl_msafile_a2m_SetInmap(      afp); break;
335   case eslMSAFILE_AFA:          status = esl_msafile_afa_SetInmap(      afp); break;
336   case eslMSAFILE_CLUSTAL:      status = esl_msafile_clustal_SetInmap(  afp); break;
337   case eslMSAFILE_CLUSTALLIKE:  status = esl_msafile_clustal_SetInmap(  afp); break;
338   case eslMSAFILE_PFAM:         status = esl_msafile_stockholm_SetInmap(afp); break;
339   case eslMSAFILE_PHYLIP:       status = esl_msafile_phylip_SetInmap(   afp); break;
340   case eslMSAFILE_PHYLIPS:      status = esl_msafile_phylip_SetInmap(   afp); break;
341   case eslMSAFILE_PSIBLAST:     status = esl_msafile_psiblast_SetInmap( afp); break;
342   case eslMSAFILE_SELEX:        status = esl_msafile_selex_SetInmap(    afp); break;
343   case eslMSAFILE_STOCKHOLM:    status = esl_msafile_stockholm_SetInmap(afp); break;
344   default:                      ESL_EXCEPTION(eslEINCONCEIVABLE, "no such alignment file format");
345   }
346   return status;
347 }
348 
349 /* Function:  esl_msafile_Close()
350  * Synopsis:  Close an open <ESL_MSAFILE>.
351  */
352 void
esl_msafile_Close(ESL_MSAFILE * afp)353 esl_msafile_Close(ESL_MSAFILE *afp)
354 {
355   if (afp) {
356     if (afp->bf)        esl_buffer_Close(afp->bf);
357     if (afp->ssi)       esl_ssi_Close(afp->ssi);
358     free(afp);
359   }
360 }
361 
362 static int
msafile_Create(ESL_MSAFILE ** ret_afp)363 msafile_Create(ESL_MSAFILE **ret_afp)
364 {
365   ESL_MSAFILE  *afp = NULL;
366   int           status;
367 
368   ESL_ALLOC(afp, sizeof(ESL_MSAFILE));
369   afp->bf         = NULL;
370   afp->line       = NULL;
371   afp->n          = 0;
372   afp->linenumber = 0;
373   afp->lineoffset = 0;
374   afp->format     = eslMSAFILE_UNKNOWN;
375   afp->abc        = NULL;
376   afp->ssi        = NULL;
377   afp->errmsg[0]  = '\0';
378 
379   esl_msafile_fmtdata_Init(&(afp->fmtd));
380 
381   *ret_afp = afp;
382   return eslOK;
383 
384  ERROR:
385   *ret_afp = NULL;
386   return status;
387 }
388 
389 
390 
391 /* All input sources funnel through here.
392  * Here, <afp> is already allocated and initialized, and the input
393  * <bf> is opened successfully.
394  */
395 static int
msafile_OpenBuffer(ESL_ALPHABET ** byp_abc,ESL_BUFFER * bf,int format,ESL_MSAFILE_FMTDATA * fmtd,ESL_MSAFILE * afp)396 msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd,  ESL_MSAFILE *afp)
397 {
398   ESL_ALPHABET        *abc       = NULL;
399   int                  alphatype = eslUNKNOWN;
400   int                  status;
401 
402   /* if caller provided <fmtd>, copy it into afp->fmtd */
403   if (fmtd) esl_msafile_fmtdata_Copy(fmtd, &(afp->fmtd));
404 
405   /* Determine the format */
406   if (format == eslMSAFILE_UNKNOWN &&
407       (status = esl_msafile_GuessFileFormat(afp->bf, &format, &(afp->fmtd), afp->errmsg)) != eslOK)
408     goto ERROR;
409   afp->format = format;
410 
411   /* Determine the alphabet; set <abc>. (<abc> == NULL means text mode.)  */
412   /* Note that GuessAlphabet() functions aren't allowed to use the inmap, because it isn't set yet */
413 
414   if (byp_abc && *byp_abc)	/* Digital mode, and caller provided the alphabet */
415     {
416       abc       = *byp_abc;
417       alphatype = abc->type;
418     }
419   else if (byp_abc)		/* Digital mode, and caller wants us to guess and create an alphabet */
420     {
421       status = esl_msafile_GuessAlphabet(afp, &alphatype);
422       if      (status == eslENOALPHABET) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "couldn't guess alphabet (maybe try --dna/--rna/--amino if available)");
423       else if (status != eslOK)          goto ERROR;
424       if ( (abc = esl_alphabet_Create(alphatype))                == NULL) { status = eslEMEM; goto ERROR; }
425     }
426   afp->abc = abc;	/* with afp->abc set, the inmap config functions know whether to do digital/text    */
427 
428   /* Configure the format-specific, digital or text mode character
429    * input map in afp->inmap.
430    * All of these must:
431    *
432    *    set inmap[0] to an appropriate 'unknown' character, to replace
433    *       invalid input with.
434    *    set ' ' to eslDSQ_IGNORE (if we're supposed to accept and skip
435    *       it), or map it to a gap, or set it as eslDSQ_ILLEGAL.
436    *    in digital mode, copy the abc->inmap
437    *    in text mode, decide if we should accept most any
438    *        non-whitespace character (isgraph()), or if the format is
439    *        inherently restrictive and we should go with isalpha() +
440    *        some other valid characters "_-.~*" instead.
441    */
442   switch (afp->format) {
443   case eslMSAFILE_A2M:          if ((status = esl_msafile_a2m_SetInmap(      afp)) != eslOK) goto ERROR; break;
444   case eslMSAFILE_AFA:          if ((status = esl_msafile_afa_SetInmap(      afp)) != eslOK) goto ERROR; break;
445   case eslMSAFILE_CLUSTAL:      if ((status = esl_msafile_clustal_SetInmap(  afp)) != eslOK) goto ERROR; break;
446   case eslMSAFILE_CLUSTALLIKE:  if ((status = esl_msafile_clustal_SetInmap(  afp)) != eslOK) goto ERROR; break;
447   case eslMSAFILE_PFAM:         if ((status = esl_msafile_stockholm_SetInmap(afp)) != eslOK) goto ERROR; break;
448   case eslMSAFILE_PHYLIP:       if ((status = esl_msafile_phylip_SetInmap(   afp)) != eslOK) goto ERROR; break;
449   case eslMSAFILE_PHYLIPS:      if ((status = esl_msafile_phylip_SetInmap(   afp)) != eslOK) goto ERROR; break;
450   case eslMSAFILE_PSIBLAST:     if ((status = esl_msafile_psiblast_SetInmap( afp)) != eslOK) goto ERROR; break;
451   case eslMSAFILE_SELEX:        if ((status = esl_msafile_selex_SetInmap(    afp)) != eslOK) goto ERROR; break;
452   case eslMSAFILE_STOCKHOLM:    if ((status = esl_msafile_stockholm_SetInmap(afp)) != eslOK) goto ERROR; break;
453   default: ESL_XEXCEPTION(eslENOFORMAT, "no such alignment file format");
454   }
455 
456   if (esl_byp_IsReturned(byp_abc)) *byp_abc = abc;
457   return eslOK;
458 
459  ERROR:  /* on normal errors, afp is returned in an error state */
460   if (abc && ! esl_byp_IsProvided(byp_abc)) { esl_alphabet_Destroy(abc); }
461   if (esl_byp_IsReturned(byp_abc)) *byp_abc = NULL;
462   afp->abc = NULL;
463   return status;
464 }
465 /*------------- end, open/close an ESL_MSAFILE -----------------*/
466 
467 
468 
469 /*****************************************************************
470  *# 2. ESL_MSAFILE_FMTDATA: optional extra constraints on formats.
471  *****************************************************************/
472 
473 /* Function:  esl_msafile_fmtdata_Init()
474  * Synopsis:  Initialize a <ESL_MSAFILE_FMTDATA> structure.
475  */
476 int
esl_msafile_fmtdata_Init(ESL_MSAFILE_FMTDATA * fmtd)477 esl_msafile_fmtdata_Init(ESL_MSAFILE_FMTDATA *fmtd)
478 {
479   fmtd->namewidth = 0;
480   fmtd->rpl       = 0;
481   return eslOK;
482 }
483 
484 /* Function:  esl_msafile_fmtdata_Copy()
485  * Synopsis:  Copy one <ESL_MSAFILE_FMTDATA> structure to another.
486  */
487 int
esl_msafile_fmtdata_Copy(ESL_MSAFILE_FMTDATA * src,ESL_MSAFILE_FMTDATA * dst)488 esl_msafile_fmtdata_Copy(ESL_MSAFILE_FMTDATA *src, ESL_MSAFILE_FMTDATA *dst)
489 {
490   dst->namewidth = src->namewidth;
491   dst->rpl       = src->rpl;
492   return eslOK;
493 }
494 
495 /*--------------- ESL_MSAFILE_FMTDATA --------------------------*/
496 
497 
498 /*****************************************************************
499  *# 3. Guessing file format.
500  *****************************************************************/
501 
502 static int msafile_check_selex  (ESL_BUFFER *bf);
503 
504 /* Function:  esl_msafile_GuessFileFormat()
505  * Synopsis:  Guess the MSA file format of an open buffer.
506  *
507  * Purpose:   Peek into an open buffer, and try to determine what
508  *            alignment file format (if any) its input is in. If a
509  *            format can be determined, return <eslOK> and set
510  *            <*ret_fmtcode> to the format code.  If not, return
511  *            <eslENOFORMAT> and set <*ret_fmtcode> to
512  *            <eslMSAFILE_UNKNOWN>.  In either case, the buffer <bf> is
513  *            restored to its original position upon return.
514  *
515  *            Some formats may have variants that require special
516  *            handling. Caller may pass a pointer <*opt_fmtd> to a
517  *            <ESL_MSAFILE_FMTDATA> structure to capture this
518  *            information from format autodetection, or <NULL>. If the
519  *            structure is provided, it is reinitialized, and any
520  *            fields that can be determined by the appropriate
521  *            format-guessing function are filled in. If <*opt_fmtd>
522  *            is <NULL>, the range of variation that can be captured
523  *            by some formats may be limited. Currently this only
524  *            affects PHYLIP format, where <opt_fmtd->namewidth>
525  *            allows files with nonstandard name field widths to be
526  *            autodetected and parsed.
527  *
528  * Args:      bf          - the open buffer to read input from
529  *            ret_fmtcode - RETURN:    format code that's determined
530  *            opt_fmtd    - optRETURN: ptr to an <ESL_MSAFILE_FMTDATA> structure to
531  *                          be filled in with additional format-specific data, or <NULL>
532  *            errbuf      - optRETURN: space for an informative error message if
533  *                          format autodetection fails; caller allocates for <eslERRBUFSIZE> bytes.
534  *
535  * Returns:   <eslOK> on success, and <*ret_fmtcode> contains the format code.
536  *
537  *            <eslENOFORMAT> if format can't be guessed; now <*ret_fmtcode> contains
538  *            <eslMSAFILE_UNKNOWN>, and optional <errbuf> contains an explanation.
539  *
540  * Throws:    (no abnormal error conditions)
541  */
542 int
esl_msafile_GuessFileFormat(ESL_BUFFER * bf,int * ret_fmtcode,ESL_MSAFILE_FMTDATA * opt_fmtd,char * errbuf)543 esl_msafile_GuessFileFormat(ESL_BUFFER *bf, int *ret_fmtcode, ESL_MSAFILE_FMTDATA *opt_fmtd, char *errbuf)
544 {
545   esl_pos_t  initial_offset;
546   char      *p;
547   esl_pos_t  n;
548   int        fmt_bysuffix    = eslMSAFILE_UNKNOWN;
549   int        fmt_byfirstline = eslMSAFILE_UNKNOWN;
550   int        status;
551 
552   /* Initialize the optional data, if provided (move this initialization to a function someday) */
553   if (opt_fmtd) esl_msafile_fmtdata_Init(opt_fmtd);
554   if (errbuf)   errbuf[0] = '\0';
555 
556   /* As we start, save parser status:
557    *   remember the offset where we started (usually 0, but not necessarily)
558    *   set an anchor to be sure that this offset stays in the buffer's memory
559    */
560   initial_offset = esl_buffer_GetOffset(bf);
561   esl_buffer_SetAnchor(bf, initial_offset);
562 
563   /* We may use a filename suffix as a clue, especially when
564    * distinguishing formats that may not be distinguishable.
565    * (if there's a filename, and if it has a suffix, anyway.)
566    */
567   if (bf->filename)
568     {
569       esl_file_Extension(bf->filename, 0, &p, &n);
570       if (esl_memstrcmp(p, n, ".gz")) esl_file_Extension(bf->filename, 3, &p, &n);
571       if (p)
572 	{
573 	  if      (esl_memstrcmp(p, n, ".sto"))    fmt_bysuffix = eslMSAFILE_STOCKHOLM;
574 	  else if (esl_memstrcmp(p, n, ".sth"))    fmt_bysuffix = eslMSAFILE_STOCKHOLM;
575 	  else if (esl_memstrcmp(p, n, ".stk"))    fmt_bysuffix = eslMSAFILE_STOCKHOLM;
576 	  else if (esl_memstrcmp(p, n, ".afa"))    fmt_bysuffix = eslMSAFILE_AFA;
577 	  else if (esl_memstrcmp(p, n, ".afasta")) fmt_bysuffix = eslMSAFILE_AFA;
578 	  else if (esl_memstrcmp(p, n, ".pfam"))   fmt_bysuffix = eslMSAFILE_PFAM;
579 	  else if (esl_memstrcmp(p, n, ".a2m"))    fmt_bysuffix = eslMSAFILE_A2M;
580 	  else if (esl_memstrcmp(p, n, ".slx"))    fmt_bysuffix = eslMSAFILE_SELEX;
581 	  else if (esl_memstrcmp(p, n, ".selex"))  fmt_bysuffix = eslMSAFILE_SELEX;
582 	  else if (esl_memstrcmp(p, n, ".pb"))     fmt_bysuffix = eslMSAFILE_PSIBLAST;
583 	  else if (esl_memstrcmp(p, n, ".ph"))     fmt_bysuffix = eslMSAFILE_PHYLIP;
584 	  else if (esl_memstrcmp(p, n, ".phy"))    fmt_bysuffix = eslMSAFILE_PHYLIP;
585 	  else if (esl_memstrcmp(p, n, ".phyi"))   fmt_bysuffix = eslMSAFILE_PHYLIP;
586 	  else if (esl_memstrcmp(p, n, ".phys"))   fmt_bysuffix = eslMSAFILE_PHYLIPS;
587 	}
588     }
589 
590   /* We peek at the first non-blank line of the file.
591    * Multiple sequence alignment files are often identifiable by a token on this line.
592    */
593   /* Skip blank lines, get first non-blank one */
594   do   {
595     status = esl_buffer_GetLine(bf, &p, &n);
596   } while (status == eslOK && esl_memspn(p, n, " \t") == n);
597   if (status == eslEOF) ESL_XFAIL(eslENOFORMAT, errbuf, "can't guess alignment input format: empty file/no data");
598 
599   if      (esl_memstrpfx(p, n, "# STOCKHOLM"))                      fmt_byfirstline = eslMSAFILE_STOCKHOLM;
600   else if (esl_memstrpfx(p, n, ">"))                                fmt_byfirstline = eslMSAFILE_AFA;
601   else if (esl_memstrpfx(p, n, "CLUSTAL"))                          fmt_byfirstline = eslMSAFILE_CLUSTAL;
602   else if (esl_memstrcontains(p, n, "multiple sequence alignment")) fmt_byfirstline = eslMSAFILE_CLUSTALLIKE;
603   else {
604     char     *tok;
605     esl_pos_t toklen;
606     /* look for <nseq> <alen>, characteristic of PHYLIP files */
607     if (esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK  && esl_memspn(tok, toklen, "0123456789") == toklen &&
608 	esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK  && esl_memspn(tok, toklen, "0123456789") == toklen)
609       fmt_byfirstline = eslMSAFILE_PHYLIP; /* interleaved for now; we'll look more closely soon */
610   }
611 
612   /* Restore parser status, rewind to start */
613   esl_buffer_SetOffset  (bf, initial_offset);
614   esl_buffer_RaiseAnchor(bf, initial_offset);
615 
616   /* Rules to determine formats.
617    * If we have to call a routine that looks deeper into the buffer
618    * than the first line, that routine must restore buffer status.
619    */
620   if      (fmt_byfirstline == eslMSAFILE_STOCKHOLM)
621     {
622       if      (fmt_bysuffix == eslMSAFILE_STOCKHOLM) *ret_fmtcode = eslMSAFILE_STOCKHOLM;
623       else if (fmt_bysuffix == eslMSAFILE_PFAM)      *ret_fmtcode = eslMSAFILE_PFAM;
624       else    *ret_fmtcode = eslMSAFILE_STOCKHOLM;
625     }
626   else if (fmt_byfirstline == eslMSAFILE_CLUSTAL)
627     {
628       *ret_fmtcode = eslMSAFILE_CLUSTAL;
629     }
630   else if (fmt_byfirstline == eslMSAFILE_CLUSTALLIKE)
631     {
632       *ret_fmtcode = eslMSAFILE_CLUSTALLIKE;
633     }
634   else if (fmt_byfirstline == eslMSAFILE_AFA)
635     {
636       if      (fmt_bysuffix == eslMSAFILE_A2M) *ret_fmtcode = eslMSAFILE_A2M;  // A2M requires affirmative identification by caller,
637       else                                     *ret_fmtcode = eslMSAFILE_AFA;  //  because of ambiguity w/ AFA.
638     }
639   else if (fmt_byfirstline == eslMSAFILE_PHYLIP)
640     {
641       if      (fmt_bysuffix == eslMSAFILE_PHYLIP)  *ret_fmtcode = eslMSAFILE_PHYLIP;
642       else if (fmt_bysuffix == eslMSAFILE_PHYLIPS) *ret_fmtcode = eslMSAFILE_PHYLIPS;
643       else
644 	{
645 	  int namewidth;
646 	  status = esl_msafile_phylip_CheckFileFormat(bf, ret_fmtcode, &namewidth);
647 	  if      (status == eslEAMBIGUOUS) ESL_XFAIL(eslENOFORMAT, errbuf, "can't guess format: it's consistent w/ both phylip, phylips.");
648 	  else if (status == eslFAIL)       ESL_XFAIL(eslENOFORMAT, errbuf, "format unrecognized, though it looks phylip-like");
649 
650 	  if      (opt_fmtd)                opt_fmtd->namewidth = namewidth;
651 	  else if (namewidth != 10)         ESL_XFAIL(eslENOFORMAT, errbuf, "can't parse nonstandard PHYLIP name width (expected 10)"); /* if we can't store the nonstandard width, we can't allow the caller to think it can parse this */
652 	}
653     }
654   else // if we haven't guessed so far, try selex.
655     {				/* selex parser can handle psiblast too */
656       if      (fmt_bysuffix == eslMSAFILE_SELEX) *ret_fmtcode = eslMSAFILE_SELEX;
657       else if (msafile_check_selex(bf) == eslOK) {
658 	if (fmt_bysuffix == eslMSAFILE_PSIBLAST) *ret_fmtcode = eslMSAFILE_PSIBLAST;
659 	else                                     *ret_fmtcode = eslMSAFILE_SELEX;
660       }
661       else    ESL_XFAIL(eslENOFORMAT, errbuf, "couldn't guess alignment input format - doesn't even look like selex");
662     }
663 
664   return eslOK;
665 
666  ERROR:
667   *ret_fmtcode = eslMSAFILE_UNKNOWN;
668   return status;
669 }
670 
671 
672 /* Function:  esl_msafile_IsMultiRecord()
673  * Synopsis:  Test if a format supports multiple MSAs per file.
674  *
675  * Purpose:   Return <TRUE> if MSA file format <fmt> supports
676  *            more than one MSA record per file. Return <FALSE>
677  *            otherwise (including the cases of <fmt> being
678  *            invalid or <eslMSAFILE_UNKNOWN>).
679  */
680 int
esl_msafile_IsMultiRecord(int fmt)681 esl_msafile_IsMultiRecord(int fmt)
682 {
683   switch (fmt) {
684   case eslMSAFILE_UNKNOWN:     return FALSE;
685   case eslMSAFILE_STOCKHOLM:   return TRUE;
686   case eslMSAFILE_PFAM:        return TRUE;
687   case eslMSAFILE_A2M:         return FALSE;
688   case eslMSAFILE_PSIBLAST:    return FALSE;
689   case eslMSAFILE_SELEX:       return FALSE;
690   case eslMSAFILE_AFA:         return FALSE;
691   case eslMSAFILE_CLUSTAL:     return FALSE;
692   case eslMSAFILE_CLUSTALLIKE: return FALSE;
693   case eslMSAFILE_PHYLIP:      return TRUE; /* because seqboot. undocumented in phylip,  phylip format can come out multi-msa */
694   case eslMSAFILE_PHYLIPS:     return TRUE; /* ditto */
695   default:                     return FALSE;
696   }
697   return FALSE;			/* keep compilers happy */
698 }
699 
700 
701 /* Function:  esl_msafile_EncodeFormat()
702  * Synopsis:  Convert text string to an MSA file format code.
703  *
704  * Purpose:   Given a text string, match it case-insensitively
705  *            against a list of possible formats, and return the
706  *            appropriate MSA file format code. For example,
707  *            <esl_msafile_EncodeFormat("Stockholm")> returns
708  *            <eslMSAFILE_STOCKHOLM>.
709  *
710  *            If the format is unrecognized, return
711  *            <eslMSAFILE_UNKNOWN>.
712  *
713  * Note:      Keep in sync with <esl_sqio_EncodeFormat()>,
714  *            which decodes all possible sequence file formats,
715  *            both unaligned and aligned.
716  */
717 int
esl_msafile_EncodeFormat(char * fmtstring)718 esl_msafile_EncodeFormat(char *fmtstring)
719 {
720   if (strcasecmp(fmtstring, "stockholm")   == 0) return eslMSAFILE_STOCKHOLM;
721   if (strcasecmp(fmtstring, "pfam")        == 0) return eslMSAFILE_PFAM;
722   if (strcasecmp(fmtstring, "a2m")         == 0) return eslMSAFILE_A2M;
723   if (strcasecmp(fmtstring, "psiblast")    == 0) return eslMSAFILE_PSIBLAST;
724   if (strcasecmp(fmtstring, "selex")       == 0) return eslMSAFILE_SELEX;
725   if (strcasecmp(fmtstring, "afa")         == 0) return eslMSAFILE_AFA;
726   if (strcasecmp(fmtstring, "clustal")     == 0) return eslMSAFILE_CLUSTAL;
727   if (strcasecmp(fmtstring, "clustallike") == 0) return eslMSAFILE_CLUSTALLIKE;
728   if (strcasecmp(fmtstring, "phylip")      == 0) return eslMSAFILE_PHYLIP;
729   if (strcasecmp(fmtstring, "phylips")     == 0) return eslMSAFILE_PHYLIPS;
730   return eslMSAFILE_UNKNOWN;
731 }
732 
733 
734 /* Function:  esl_msafile_DecodeFormat()
735  * Synopsis:  Convert internal file format code to text string.
736  *
737  * Purpose:   Given an internal file format code <fmt>
738  *            (<eslMSAFILE_STOCKHOLM>, for example), returns
739  *            a string suitable for printing ("Stockholm",
740  *            for example).
741  *
742  * Returns:   a pointer to a static description string.
743  *
744  * Throws:    If code isn't valid, throws an <eslEINCONCEIVABLE> exception
745  *            internally, and returns <NULL>.
746  *
747  * Note:      Keep in sync with <esl_sqio_DecodeFormat()>.
748  */
749 char *
esl_msafile_DecodeFormat(int fmt)750 esl_msafile_DecodeFormat(int fmt)
751 {
752   switch (fmt) {
753   case eslMSAFILE_UNKNOWN:     return "unknown";
754   case eslMSAFILE_STOCKHOLM:   return "Stockholm";
755   case eslMSAFILE_PFAM:        return "Pfam";
756   case eslMSAFILE_A2M:         return "UCSC A2M";
757   case eslMSAFILE_PSIBLAST:    return "PSI-BLAST";
758   case eslMSAFILE_SELEX:       return "SELEX";
759   case eslMSAFILE_AFA:         return "aligned FASTA";
760   case eslMSAFILE_CLUSTAL:     return "Clustal";
761   case eslMSAFILE_CLUSTALLIKE: return "Clustal-like";
762   case eslMSAFILE_PHYLIP:      return "PHYLIP (interleaved)";
763   case eslMSAFILE_PHYLIPS:     return "PHYLIP (sequential)";
764   default:                     break;
765   }
766   esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "no such msa format code %d\n", fmt);
767   return NULL;
768 }
769 
770 
771 
772 /* msafile_check_selex()
773  * Checks whether an input source appears to be in SELEX format.
774  *
775  * Check whether the input source <bf> appears to be a
776  * SELEX-format alignment file, starting from the current point,
777  * to the end of the input. Return <eslOK> if so, <eslFAIL>
778  * if not.
779  *
780  * The checker is more rigorous than the parser. To be autodetected,
781  * the SELEX file can't use whitespace for gaps. The parser, though,
782  * allows whitespace. A caller would have to specific SELEX format
783  * explicitly in that case.
784  */
785 static int
msafile_check_selex(ESL_BUFFER * bf)786 msafile_check_selex(ESL_BUFFER *bf)
787 {
788   esl_pos_t start_offset = -1;
789   int       block_nseq   = 0;	 /* Number of seqs in each block is checked     */
790   int       nseq         = 0;
791   esl_pos_t block_nres   = 0;	 /* Number of residues in each line is checked  */
792   char     *firstname    = NULL; /* First seq name of every block is checked    */
793   esl_pos_t namelen      = 0;
794   int       blockidx     = 0;
795   int       in_block     = FALSE;
796   char     *p, *tok;
797   esl_pos_t n,  toklen;
798   int       status;
799 
800   /* Anchor at the start of the input, so we can rewind */
801   start_offset = esl_buffer_GetOffset(bf);
802   if ( (status = esl_buffer_SetAnchor(bf, start_offset)) != eslOK) goto ERROR;
803 
804   while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK)
805     {
806       /* Some automatic giveaways of SELEX format */
807       if (esl_memstrpfx(p, n, "#=RF")) { status = eslOK; goto DONE; }
808       if (esl_memstrpfx(p, n, "#=CS")) { status = eslOK; goto DONE; }
809       if (esl_memstrpfx(p, n, "#=SS")) { status = eslOK; goto DONE; }
810       if (esl_memstrpfx(p, n, "#=SA")) { status = eslOK; goto DONE; }
811 
812       /* skip comments */
813       if (esl_memstrpfx(p, n, "#"))    continue;
814 
815       /* blank lines: end block, reset block counters */
816       if (esl_memspn(p, n, " \t") == n)
817 	{
818 	  if (block_nseq && block_nseq != nseq) { status = eslFAIL; goto DONE;} /* each block has same # of seqs */
819 	  if (in_block) blockidx++;
820 	  if (blockidx >= 3) { status = eslOK; goto DONE; } /* stop after three blocks; we're pretty sure by now */
821 	  in_block   = FALSE;
822 	  block_nres = 0;
823 	  block_nseq = nseq;
824 	  nseq       = 0;
825 	  continue;
826 	}
827 
828       /* else we're a "sequence" line. test for two and only two non-whitespace
829        * fields; test that second field has same length; test that each block
830        * starts with the same seq name..
831        */
832       in_block = TRUE;
833       if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) goto ERROR; /* there's at least one token - we already checked for blank lines */
834       if (nseq == 0)	/* check first seq name in each block */
835 	{
836 	  if (blockidx == 0) { firstname = tok; namelen = toklen; } /* First block: set the name we check against. */
837 	  else if (toklen != namelen || memcmp(tok, firstname, toklen) != 0) { status = eslFAIL; goto DONE; } /* Subsequent blocks */
838 	}
839       if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) { status = eslFAIL; goto DONE; }
840       if (block_nres && toklen != block_nres)                { status = eslFAIL; goto DONE; }
841       block_nres = toklen;
842       if (esl_memtok(&p, &n, " \t", &tok, &toklen) == eslOK) { status = eslFAIL; goto DONE; }
843       nseq++;
844     }
845   if (status != eslEOF) goto ERROR;  /* EOF is expected and good; anything else is bad */
846 
847   if (in_block) blockidx++;
848   status = (blockidx ? eslOK : eslFAIL); /* watch out for the case of no input */
849   /* deliberate readthru */
850  DONE:
851   if (start_offset != -1) {
852     if (esl_buffer_SetOffset(bf, start_offset)   != eslOK) goto ERROR;
853     if (esl_buffer_RaiseAnchor(bf, start_offset) != eslOK) goto ERROR;
854   }
855   return status;
856 
857  ERROR:
858   if (start_offset != -1) {
859     esl_buffer_SetOffset(bf, start_offset);
860     esl_buffer_RaiseAnchor(bf, start_offset);
861   }
862   return status;
863 }
864 /*---------------- end, file format utilities -------------------*/
865 
866 
867 /*****************************************************************
868  *# 4. Guessing alphabet
869  *****************************************************************/
870 
871 /* Function:  esl_msafile_GuessAlphabet()
872  * Synopsis:  Guess what kind of sequences the MSA file contains.
873  *
874  * Purpose:   Guess the alphabet of the sequences in the open
875  *            <ESL_MSAFILE> <afp> -- <eslDNA>, <eslRNA>, or <eslAMINO> --
876  *            based on the residue composition of the input.
877  *
878  * Returns:   Returns <eslOK> on success, and <*ret_type> is set
879  *            to <eslDNA>, <eslRNA>, or <eslAMINO>.
880  *
881  *            Returns <eslENOALPHABET> and sets <*ret_type> to
882  *            <eslUNKNOWN> if the alphabet cannot be reliably
883  *            guessed.
884  *
885  *            Either way, the <afp>'s buffer is restored to its original
886  *            state, no matter how much data we had to read while trying
887  *            to guess.
888  *
889  * Throws:    <eslEMEM> - an allocation error.
890  *            <eslESYS> - a system call such as <fread()> failed.
891  *            <eslEINCONCEIVABLE> - "impossible" corruption, internal bug
892  */
893 int
esl_msafile_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)894 esl_msafile_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
895 {
896   int status = eslENOALPHABET;
897   switch (afp->format) {
898   case eslMSAFILE_STOCKHOLM:   status = esl_msafile_stockholm_GuessAlphabet(afp, ret_type); break;
899   case eslMSAFILE_PFAM:        status = esl_msafile_stockholm_GuessAlphabet(afp, ret_type); break;
900   case eslMSAFILE_A2M:         status = esl_msafile_a2m_GuessAlphabet      (afp, ret_type); break;
901   case eslMSAFILE_PSIBLAST:    status = esl_msafile_psiblast_GuessAlphabet (afp, ret_type); break;
902   case eslMSAFILE_SELEX:       status = esl_msafile_selex_GuessAlphabet    (afp, ret_type); break;
903   case eslMSAFILE_AFA:         status = esl_msafile_afa_GuessAlphabet      (afp, ret_type); break;
904   case eslMSAFILE_CLUSTAL:     status = esl_msafile_clustal_GuessAlphabet  (afp, ret_type); break;
905   case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_GuessAlphabet  (afp, ret_type); break;
906   case eslMSAFILE_PHYLIP:      status = esl_msafile_phylip_GuessAlphabet   (afp, ret_type); break;
907   case eslMSAFILE_PHYLIPS:     status = esl_msafile_phylip_GuessAlphabet   (afp, ret_type); break;
908   }
909   return status;
910 }
911 /*----------- end, utilities for alphabets ----------------------*/
912 
913 
914 /*****************************************************************
915  *# 5. Random msa flatfile database access (with SSI)
916  *****************************************************************/
917 
918 /* Function:  esl_msafile_PositionByKey()
919  * Synopsis:  Use SSI to reposition file to start of named MSA.
920  *
921  * Purpose:   Reposition <afp> so that the next MSA we read
922  *            will be the one named (or accessioned) <key>.
923  *
924  * Returns:   <eslOK> on success, and the file <afp> is repositioned
925  *            such that the next <esl_msafile_Read()> call will read the
926  *            alignment named <key>.
927  *
928  *            Returns <eslENOTFOUND> if <key> isn't found in the index
929  *            for <afp>.
930  *
931  *            Returns <eslEFORMAT> if something goes wrong trying to
932  *            read the index, indicating some sort of file format
933  *            problem in the SSI file.
934  *
935  * Throws:    <eslENODATA> if there's no open SSI index;
936  *            <eslEINVAL> if the <offset> is invalid, either requiring rewind
937  *              of a nonrewindable stream, or off the end of the data;
938  *            <eslESYS> if a system call such as <fread()> fails;
939  *            <eslEMEM> on allocation failure.
940  *            In all these cases, the state of the <afp> is uncertain
941  *            and may be corrupt; the application should not continue
942  *            to use it.
943  */
944 int
esl_msafile_PositionByKey(ESL_MSAFILE * afp,const char * key)945 esl_msafile_PositionByKey(ESL_MSAFILE *afp, const char *key)
946 {
947   uint16_t fh;
948   off_t    offset;
949   int      status;
950 
951   if (afp->ssi == NULL) ESL_EXCEPTION(eslENODATA, "Need an open SSI index to call esl_msafile_PositionByKey()");
952   if ((status = esl_ssi_FindName(afp->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status; /* eslENOTFOUND|eslEFORMAT [eslEMEM] */
953   if ((status = esl_buffer_SetOffset(afp->bf, offset))                     != eslOK) return status; /* [eslEINVAL|eslESYS|eslEMEM] */
954 
955   /* The linenumber gets messed up after a file positioning. Best we can do
956    * is to turn it off (set it to -1). FIX THIS next time SSI format is
957    * changed: add linenumber to a primary key record.
958    */
959   afp->linenumber = -1;
960   return eslOK;
961 }
962 
963 /*--------------------- end SSI functions -----------------------*/
964 
965 
966 /*****************************************************************
967  *# 6. Reading MSAs from input
968  *****************************************************************/
969 
970 /* Function:  esl_msafile_Read()
971  * Synopsis:  Read next MSA from input.
972  *
973  * Purpose:   Reads the next MSA from open MSA input <afp>, and return it in
974  *            <*ret_msa>.
975  *
976  * Args:      afp      - open alignment input stream
977  8            *ret_msa - RETURN: alignment
978  *
979  * Returns:   <eslOK> on success.
980  *
981  *            <eslEFORMAT> on a parse error, and <afp->errmsg> is set
982  *            to a user-directed error message; <*ret_msa> is <NULL>.
983  *
984  *            If no alignment is found at all, returns <eslEOF>,
985  *            and <afp->errmsg> is blank; <*ret_msa> is <NULL>.
986  *
987  *            On normal error, <afp> and the return status code may be
988  *            passed to <esl_msafile_ReadFailure()> to print diagnostics
989  *            to <stderr> (including input source information and line
990  *            number) and exit.
991  *
992  * Throws:    <eslEMEM> - an allocation failed.
993  *            <eslESYS> - a system call such as fread() failed
994  *            <eslEINCONCEIVABLE> - "impossible" corruption
995  */
996 int
esl_msafile_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)997 esl_msafile_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
998 {
999   ESL_MSA  *msa    = NULL;
1000   int       status = eslOK;
1001   esl_pos_t offset = esl_buffer_GetOffset(afp->bf);
1002 
1003   switch (afp->format) {
1004   case eslMSAFILE_A2M:          if ((status = esl_msafile_a2m_Read      (afp, &msa)) != eslOK) goto ERROR; break;
1005   case eslMSAFILE_AFA:          if ((status = esl_msafile_afa_Read      (afp, &msa)) != eslOK) goto ERROR; break;
1006   case eslMSAFILE_CLUSTAL:      if ((status = esl_msafile_clustal_Read  (afp, &msa)) != eslOK) goto ERROR; break;
1007   case eslMSAFILE_CLUSTALLIKE:  if ((status = esl_msafile_clustal_Read  (afp, &msa)) != eslOK) goto ERROR; break;
1008   case eslMSAFILE_PFAM:         if ((status = esl_msafile_stockholm_Read(afp, &msa)) != eslOK) goto ERROR; break;
1009   case eslMSAFILE_PHYLIP:       if ((status = esl_msafile_phylip_Read   (afp, &msa)) != eslOK) goto ERROR; break;
1010   case eslMSAFILE_PHYLIPS:      if ((status = esl_msafile_phylip_Read   (afp, &msa)) != eslOK) goto ERROR; break;
1011   case eslMSAFILE_PSIBLAST:     if ((status = esl_msafile_psiblast_Read (afp, &msa)) != eslOK) goto ERROR; break;
1012   case eslMSAFILE_SELEX:        if ((status = esl_msafile_selex_Read    (afp, &msa)) != eslOK) goto ERROR; break;
1013   case eslMSAFILE_STOCKHOLM:    if ((status = esl_msafile_stockholm_Read(afp, &msa)) != eslOK) goto ERROR; break;
1014   default:                      ESL_EXCEPTION(eslEINCONCEIVABLE, "no such msa file format");
1015   }
1016 
1017   msa->offset = offset;
1018   *ret_msa    = msa;
1019   return eslOK;
1020 
1021  ERROR:
1022   if (msa) esl_msa_Destroy(msa);
1023   *ret_msa = NULL;
1024   return status;
1025 }
1026 
1027 /* Function:  esl_msafile_ReadFailure()
1028  * Synopsis:  Report diagnostics of a normal error in parsing MSA file, and exit.
1029  *
1030  * Purpose:   Report user-directed diagnostics of a normal error from
1031  *            parsing an MSA file.  Output the error message to
1032  *            <stderr>, along with information about what we were
1033  *            parsing (filename, if it was a file) and where we were
1034  *            in the input (linenumber, if we know it). This
1035  *            information is all available in <afp>. Then close <afp>
1036  *            and exit with the <status> provided by the caller.
1037  *
1038  * Args:      afp    - open ESL_MSAFILE, containing information about
1039  *                     the error and the input source.
1040  *            status - exit status; generally eslEFORMAT.
1041  *
1042  * Returns:   no return. Exits here with <status>.
1043  */
1044 void
esl_msafile_ReadFailure(ESL_MSAFILE * afp,int status)1045 esl_msafile_ReadFailure(ESL_MSAFILE *afp, int status)
1046 {
1047   switch (status) {
1048   case eslEFORMAT:  fprintf(stderr, "Alignment input parse error:\n   %s\n", afp->errmsg);       break;
1049   case eslEOF:      fprintf(stderr, "Alignment input appears empty?\n");                         break;
1050   default:          fprintf(stderr, "Alignment input read error; unexpected code %d\n", status); break;
1051   }
1052 
1053   switch (afp->bf->mode_is) {
1054   case eslBUFFER_STREAM:   fprintf(stderr, "   while reading %s from an input stream (not a file)\n", esl_msafile_DecodeFormat(afp->format));   break;
1055   case eslBUFFER_CMDPIPE:  fprintf(stderr, "   while reading %s through a pipe (not a file)\n",       esl_msafile_DecodeFormat(afp->format));   break;
1056   case eslBUFFER_FILE:
1057   case eslBUFFER_ALLFILE:
1058   case eslBUFFER_MMAP:     fprintf(stderr, "   while reading %s file %s\n", esl_msafile_DecodeFormat(afp->format), afp->bf->filename);          break;
1059   case eslBUFFER_STRING:   fprintf(stderr, "   while reading %s from a provided string (not a file)\n", esl_msafile_DecodeFormat(afp->format)); break;
1060   default:                 break;
1061   }
1062 
1063   if (afp->linenumber > 0) fprintf(stderr, "   at or near line %" PRIu64 "\n", afp->linenumber);
1064   else                     fprintf(stderr, "   at or near byte %" PRIu64 "\n", esl_buffer_GetOffset(afp->bf));
1065 
1066   esl_msafile_Close(afp);
1067   exit(status);
1068 }
1069 /*------------ end, reading MSA from ESL_MSAFILE ---------------*/
1070 
1071 
1072 
1073 
1074 /*****************************************************************
1075  *# 7. Writing an MSA to a stream.
1076  *****************************************************************/
1077 
1078 /* Function:  esl_msafile_Write()
1079  * Synopsis:  Write an MSA to a stream.
1080  *
1081  * Purpose:   Writes alignment <msa> to open stream <fp> in format <fmt>.
1082  *
1083  *            In general, the <msa> is unchanged, but there are some
1084  *            exceptions. For example, writing an alignment in A2M format
1085  *            will alter alignment data (marking missing data
1086  *            symbols on heuristically defined sequence fragments) and
1087  *            create an <\#=RF> annotation line, if an <msa->rf>
1088  *            annotation line isn't already present in the <msa>.
1089  *
1090  * Args:      fp   - open stream (such as <stdout>)
1091  *            msa  - alignment to write
1092  *            fmt  - format code (such as <eslMSAFILE_STOCKHOLM>)
1093  *
1094  * Returns:   <eslOK> on success.
1095  *
1096  * Throws:    <eslEMEM> on allocation error.
1097  *            <eslEWRITE> on any system write error, such as a filled disk.
1098  */
1099 int
esl_msafile_Write(FILE * fp,ESL_MSA * msa,int fmt)1100 esl_msafile_Write(FILE *fp, ESL_MSA *msa, int fmt)
1101 {
1102   int status;
1103 
1104   switch (fmt) {
1105   case eslMSAFILE_STOCKHOLM:   status = esl_msafile_stockholm_Write(fp, msa, eslMSAFILE_STOCKHOLM);     break;
1106   case eslMSAFILE_PFAM:        status = esl_msafile_stockholm_Write(fp, msa, eslMSAFILE_PFAM);          break;
1107   case eslMSAFILE_A2M:         status = esl_msafile_a2m_Write      (fp, msa);                           break;
1108   case eslMSAFILE_PSIBLAST:    status = esl_msafile_psiblast_Write (fp, msa);                           break;
1109   case eslMSAFILE_SELEX:       status = esl_msafile_selex_Write    (fp, msa);                           break;
1110   case eslMSAFILE_AFA:         status = esl_msafile_afa_Write      (fp, msa);                           break;
1111   case eslMSAFILE_CLUSTAL:     status = esl_msafile_clustal_Write  (fp, msa, eslMSAFILE_CLUSTAL);       break;
1112   case eslMSAFILE_CLUSTALLIKE: status = esl_msafile_clustal_Write  (fp, msa, eslMSAFILE_CLUSTALLIKE);   break;
1113   case eslMSAFILE_PHYLIP:      status = esl_msafile_phylip_Write   (fp, msa, eslMSAFILE_PHYLIP,  NULL); break;
1114   case eslMSAFILE_PHYLIPS:     status = esl_msafile_phylip_Write   (fp, msa, eslMSAFILE_PHYLIPS, NULL); break;
1115   default:                     ESL_EXCEPTION(eslEINCONCEIVABLE, "no such msa file format");
1116   }
1117   return status;
1118 }
1119 
1120 /*-------------- end, writing MSA to stream ---------------------*/
1121 
1122 
1123 /*****************************************************************
1124  *# 8. MSA functions that depend on MSAFILE
1125  *****************************************************************/
1126 
1127 /* Function:  esl_msa_CreateFromString()
1128  * Synopsis:  Creates a small <ESL_MSA> from a test case string.
1129  *
1130  * Purpose:   A convenience for making small test cases in the test
1131  *            suites: given the contents of a complete multiple
1132  *            sequence alignment file as a single string <s> in
1133  *            alignment format <fmt>, convert it to an <ESL_MSA>.
1134  *
1135  *            For example,
1136  *            ```
1137  *              esl_msa_CreateFromString("# STOCKHOLM 1.0\n"
1138  *                                       "seq1 AAAAA\n"
1139  *                                       "seq2 AAAAA\n"
1140  *                                       "//\n", eslMSAFILE_STOCKHOLM);
1141  *            ```
1142  *            creates an ungapped alignment of two AAAAA sequences.
1143  *            (Using string concatenation in C99 makes for a cleaner
1144  *            looking multi-line string.)
1145  *
1146  *            The <msa> is in text mode. If you want it in digital
1147  *            mode, use <esl_msa_Digitize()> on it.
1148  *
1149  * Returns:   a pointer to the new <ESL_MSA> on success.
1150  *
1151  * Throws:    <NULL> if it fails to obtain, open, or read the temporary file
1152  *            that it puts the string <s> in.
1153  */
1154 ESL_MSA *
esl_msa_CreateFromString(const char * s,int fmt)1155 esl_msa_CreateFromString(const char *s, int fmt)
1156 {
1157   ESL_MSAFILE  *mfp  = NULL;
1158   ESL_MSA      *msa  = NULL;
1159 
1160   if (esl_msafile_OpenMem(NULL, s, -1, fmt, NULL, &mfp) != eslOK) goto ERROR;
1161   if (esl_msafile_Read(mfp, &msa)                       != eslOK) goto ERROR;
1162   esl_msafile_Close(mfp);
1163   return msa;
1164 
1165  ERROR:
1166   if (mfp) esl_msafile_Close(mfp);
1167   if (msa) esl_msa_Destroy(msa);
1168   return NULL;
1169 }
1170 
1171 /*****************************************************************
1172  *# 9. Utilities used by specific format parsers.
1173  *****************************************************************/
1174 
1175 /* Function:  esl_msafile_GetLine()
1176  * Synopsis:  Read next line of input alignment file.
1177  *
1178  * Purpose:   Read next line of input <afp>, into its internal
1179  *            data fields: <afp->line> points to the start of the
1180  *            line in <afp->bf>, <afp->n> is its length in
1181  *            bytes, <afp->lineoffset> is the offset in the input
1182  *            to the start of the line, and <afp->linenumber> is
1183  *            the linenumber from <1..N> for <N> total lines in the
1184  *            input.
1185  *
1186  *            Optionally, caller can request <*opt_p>, <*opt_n>,
1187  *            which are set to <afp->line>,<afp->n>. This gives the
1188  *            caller a modifiable line pointer that it can step
1189  *            through, while <afp->line> is preserved for possible
1190  *            diagnostics if anything goes awry.
1191  *
1192  * Args:      <afp>    : an open alignment file input
1193  *            <*opt_p> : optRETURN: modifiable copy of <afp->line> pointer
1194  *            <*opt_n> : optRETURN: modifiable copy of <afp->n>
1195  *
1196  * Returns:   <eslOK> on success.
1197  *
1198  *            <eslEOF> at EOF. Now <afp->line> is <NULL>, <afp->n>
1199  *            is <0>, and <afp->lineoffset> is <0>. <afp->linenumber>
1200  *            is the total number of lines in the input.
1201  *
1202  * Throws:    <eslEMEM> if an allocation fails.
1203  *            <eslESYS> if a system call fails, such as fread().
1204  *            <eslEINCONCEIVABLE> on internal code errors.
1205  */
1206 int
esl_msafile_GetLine(ESL_MSAFILE * afp,char ** opt_p,esl_pos_t * opt_n)1207 esl_msafile_GetLine(ESL_MSAFILE *afp, char **opt_p, esl_pos_t *opt_n)
1208 {
1209   int status;
1210 
1211   afp->lineoffset = esl_buffer_GetOffset(afp->bf);
1212   if ((status = esl_buffer_GetLine(afp->bf, &(afp->line), &(afp->n))) != eslOK) goto ERROR;
1213   if (afp->linenumber != -1) afp->linenumber++;
1214 
1215   if (opt_p) *opt_p = afp->line;
1216   if (opt_n) *opt_n = afp->n;
1217   return eslOK;
1218 
1219  ERROR:
1220   afp->line       = NULL;
1221   afp->n          = 0;
1222   afp->lineoffset = -1;
1223   /* leave linenumber alone. on EOF, it is the number of lines in the file, and that might be useful. */
1224   if (opt_p) *opt_p = NULL;
1225   if (opt_n) *opt_n = 0;
1226   return status;
1227 }
1228 
1229 
1230 /* Function:  esl_msafile_PutLine()
1231  * Synopsis:  Put the line we just read back in the input stream
1232  *
1233  * Purpose:   Put the line we just read back in the input stream
1234  *            and unset <afp->line> and its associated information
1235  *            internally. The next <esl_msafile_GetLine()> call
1236  *            will read exactly the same line again.
1237  *
1238  *            This gets used in parsing files that contain multiple
1239  *            MSAs. If the way we determine that an MSA record has
1240  *            ended is by reading the first line of the next MSA
1241  *            record, then we may want to stuff it back in the input
1242  *            buffer, so it gets parsed properly as part of the next
1243  *            record.  In Pfam/Stockholm parsing we don't have to
1244  *            do this, because the first line is just a format code,
1245  *            with no record-specific data. But in PHYLIP multiple MSA
1246  *            format, for example, the first line is nseq,alen.
1247  *
1248  * Args:      afp  - the open input stream
1249  *
1250  * Returns:   <eslOK> on succes
1251  *
1252  * Throws:    <eslEMEM>, <eslESYS>, <eslEINCONCEIVABLE> if the
1253  *            <esl_buffer_Set()> call fails.
1254  */
1255 int
esl_msafile_PutLine(ESL_MSAFILE * afp)1256 esl_msafile_PutLine(ESL_MSAFILE *afp)
1257 {
1258   int status;
1259   if ((status = esl_buffer_Set(afp->bf, afp->line, 0)) != eslOK) return status;
1260   afp->line       = NULL;
1261   afp->n          = 0;
1262   if (afp->linenumber != -1) afp->linenumber--;
1263   afp->lineoffset = -1;
1264   return eslOK;
1265 }
1266 
1267 /*--------------- end, parser utilities -------------------------*/
1268 
1269 
1270 
1271 /*****************************************************************
1272  * 10. Unit tests
1273  *****************************************************************/
1274 #ifdef eslMSAFILE_TESTDRIVE
1275 
1276 
1277 static void
utest_format2format(int fmt1,int fmt2)1278 utest_format2format(int fmt1, int fmt2)
1279 {
1280   char          msg[]        = "esl_msafile: format2format unit test failed";
1281   char          tmpfile1[32] = "esltmpXXXXXX";
1282   char          tmpfile2[32] = "esltmpXXXXXX";
1283   char          tmpfile3[32] = "esltmpXXXXXX";
1284   FILE         *ofp = NULL;
1285   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
1286   ESL_MSAFILE  *afp;
1287   int           alphatype    = eslAMINO;
1288   ESL_ALPHABET *abc          = esl_alphabet_Create(alphatype);
1289   ESL_ALPHABET *abc2         = NULL;
1290 
1291   char *testmsa = "\
1292 # STOCKHOLM 1.0\n\
1293 seq1    ACDEFGHIKLMNPQRSTVWYacdefghiklmnpq------ACDEFGHIKLMNPQRSTVWYacde......mnpqrstvwyACDEFGHI______RSTVWYacdefghiklmnpqrstvwy\n\
1294 seq2    ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy\n\
1295 //\n";
1296 
1297   /* Create the test msa, msa1, digital mode, no autodetections */
1298   if ( esl_msafile_OpenMem(&abc, testmsa, strlen(testmsa), eslMSAFILE_STOCKHOLM, NULL, &afp) != eslOK) esl_fatal(msg);
1299   if ( esl_msafile_Read(afp, &msa1) != eslOK) esl_fatal(msg);
1300   esl_msafile_Close(afp);
1301 
1302   /* Write it to tmpfile1 in fmt1. (This exercises writing of digital MSAs, in all <fmt1> formats) */
1303   if ( esl_tmpfile_named(tmpfile1, &ofp)  != eslOK) esl_fatal(msg);
1304   if ( esl_msafile_Write(ofp, msa1, fmt1) != eslOK) esl_fatal(msg);
1305   fclose(ofp);
1306 
1307   /* Read it back from <fmt1> in TEXT mode (verbatim), with format autodetection (except A2M) */
1308   if (fmt1 != eslMSAFILE_A2M)
1309     {
1310       if ( esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(msg);
1311       if (fmt1 == eslMSAFILE_PFAM     && afp->format == eslMSAFILE_STOCKHOLM) afp->format = eslMSAFILE_PFAM;
1312       if (fmt1 == eslMSAFILE_PSIBLAST && afp->format == eslMSAFILE_SELEX)     afp->format = eslMSAFILE_PSIBLAST;
1313       if ( esl_msafile_Read(afp, &msa2) != eslOK) esl_fatal(msg);
1314       esl_msafile_Close(afp);
1315     }
1316   else // without autodetection:
1317     {
1318       if ( esl_msafile_Open(NULL, tmpfile1, NULL, fmt1, NULL, &afp) != eslOK) esl_fatal(msg);
1319       if ( esl_msafile_Read(afp, &msa2) != eslOK) esl_fatal(msg);
1320       esl_msafile_Close(afp);
1321     }
1322 
1323   /* Write it to tmpfile2 in fmt2. (This exercises writing of text-mode MSAs, in all <fmt2> formats) */
1324   if ( esl_tmpfile_named(tmpfile2, &ofp)  != eslOK) esl_fatal(msg);
1325   if ( esl_msafile_Write(ofp, msa2, fmt2) != eslOK) esl_fatal(msg);
1326   fclose(ofp);
1327 
1328   /* Read it back in TEXT mode, with format autodetection  (except A2M) */
1329   if (fmt2 != eslMSAFILE_A2M)
1330     {
1331       if ( esl_msafile_Open(NULL, tmpfile2, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(msg);
1332       if (fmt2 == eslMSAFILE_PFAM     && afp->format == eslMSAFILE_STOCKHOLM) afp->format = eslMSAFILE_PFAM;
1333       if (fmt2 == eslMSAFILE_PSIBLAST && afp->format == eslMSAFILE_SELEX)     afp->format = eslMSAFILE_PSIBLAST;
1334       if ( esl_msafile_Read(afp, &msa3) != eslOK) esl_fatal(msg);
1335       esl_msafile_Close(afp);
1336     }
1337   else // without autodetection:
1338     {
1339       if ( esl_msafile_Open(NULL, tmpfile2, NULL, fmt2, NULL, &afp) != eslOK) esl_fatal(msg);
1340       if ( esl_msafile_Read(afp, &msa3) != eslOK) esl_fatal(msg);
1341       esl_msafile_Close(afp);
1342     }
1343 
1344   /* Write it to tmpfile4 in fmt2. (This exercises writing of digital-mode MSAs, in all <fmt2> formats */
1345   if ( esl_tmpfile_named(tmpfile3, &ofp)   != eslOK) esl_fatal(msg);
1346   if ( esl_msafile_Write(ofp, msa3, fmt2) != eslOK) esl_fatal(msg);
1347   fclose(ofp);
1348 
1349   /* Read it back in DIGITAL mode, with alphabet autodetection but not format */
1350   if ( esl_msafile_Open(&abc2, tmpfile3, NULL, fmt2, NULL, &afp) != eslOK) esl_fatal(msg);
1351   if ( esl_msafile_Read(afp, &msa4) != eslOK) esl_fatal(msg);
1352   esl_msafile_Close(afp);
1353 
1354   /* Now:
1355    *   msa1 = digital mode test alignment, created from Stockholm string
1356    *   msa2 = TEXT mode, read from <fmt1> tmpfile1
1357    *   msa3 = TEXT mode, read from <fmt2> tmpfile2
1358    *   msa4 = digital mode, read from <fmt2> tmpfile3
1359    * So we expect:
1360    *   msa2==msa3
1361    *   msa1==msa4
1362    */
1363 
1364   /* some normalization before comparing alignments. */
1365   esl_msa_SymConvert(msa2, "abcdefghijklmnopqrstuvwxyz.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ-");
1366   esl_msa_SymConvert(msa3, "abcdefghijklmnopqrstuvwxyz.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ-");
1367 
1368   if (msa2->rf) { free (msa2->rf); msa2->rf = NULL; }
1369   if (msa3->rf) { free (msa3->rf); msa3->rf = NULL; }
1370   if (msa4->rf) { free (msa4->rf); msa4->rf = NULL; }
1371 
1372   if (esl_msa_Compare(msa2, msa3) != eslOK) esl_fatal(msg);
1373   if (esl_msa_Compare(msa1, msa4) != eslOK) esl_fatal(msg);
1374 
1375   remove(tmpfile1);
1376   remove(tmpfile2);
1377   remove(tmpfile3);
1378   esl_msa_Destroy(msa1);
1379   esl_msa_Destroy(msa2);
1380   esl_msa_Destroy(msa3);
1381   esl_msa_Destroy(msa4);
1382   esl_alphabet_Destroy(abc);
1383   esl_alphabet_Destroy(abc2);
1384 }
1385 #endif /*eslMSAFILE_TESTDRIVE*/
1386 /*----------------- end, unit tests -----------------------------*/
1387 
1388 
1389 /*****************************************************************
1390  * 11. Test driver
1391  *****************************************************************/
1392 #ifdef eslMSAFILE_TESTDRIVE
1393 
1394 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_utest -DeslMSAFILE_TESTDRIVE esl_msafile.c -leasel -lm
1395  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_utest -DeslMSAFILE_TESTDRIVE esl_msafile.c -leasel -lm
1396  * run:     ./esl_msafile_utest
1397  */
1398 #include "esl_config.h"
1399 
1400 #include <stdio.h>
1401 
1402 #include "easel.h"
1403 #include "esl_getopts.h"
1404 #include "esl_msafile.h"
1405 
1406 static ESL_OPTIONS options[] = {
1407    /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
1408   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
1409   { 0,0,0,0,0,0,0,0,0,0},
1410 };
1411 static char usage[]  = "[-options]";
1412 static char banner[] = "test driver for MSA input/output format module";
1413 
1414 int
main(int argc,char ** argv)1415 main(int argc, char **argv)
1416 {
1417   ESL_GETOPTS    *go          = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1418   int fmt1, fmt2;
1419 
1420   for (fmt1 = eslMSAFILE_STOCKHOLM; fmt1 <= eslMSAFILE_PHYLIPS; fmt1++)
1421     for (fmt2 = eslMSAFILE_STOCKHOLM; fmt2 <= eslMSAFILE_PHYLIPS; fmt2++)
1422       utest_format2format(fmt1, fmt2);
1423 
1424   esl_getopts_Destroy(go);
1425   exit(0);
1426 }
1427 
1428 #endif /*eslMSAFILE_TESTDRIVE*/
1429 /*----------------- end, test driver ----------------------------*/
1430 
1431 
1432 
1433 /*****************************************************************
1434  * 12. Examples.
1435  *****************************************************************/
1436 
1437 #ifdef eslMSAFILE_EXAMPLE
1438 /*::cexcerpt::msafile_example::begin::*/
1439 #include <stdio.h>
1440 #include "easel.h"
1441 #include "esl_alphabet.h"
1442 #include "esl_getopts.h"
1443 #include "esl_msa.h"
1444 #include "esl_msafile.h"
1445 
1446 static ESL_OPTIONS options[] = {
1447   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
1448   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",        0 },
1449   { "-i",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show info, instead of converting format",     0 },
1450   { "--informat",  eslARG_STRING,      NULL,  NULL, NULL,  NULL,  NULL, NULL, "specify the input MSA file is in format <s>", 0 },
1451   { "--outformat", eslARG_STRING, "Clustal",  NULL, NULL,  NULL,  NULL, NULL, "write the output MSA in format <s>",          0 },
1452   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use DNA alphabet",                            0 },
1453   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use RNA alphabet",                            0 },
1454   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use protein alphabet",                        0 },
1455   { "--text",      eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",          0 },
1456   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1457 };
1458 static char usage[]  = "[-options] <msafile>";
1459 static char banner[] = "example of multiple alignment input and output using the msafile module(s)";
1460 
1461 int
main(int argc,char ** argv)1462 main(int argc, char **argv)
1463 {
1464   ESL_GETOPTS  *go        = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1465   char         *msafile   = esl_opt_GetArg(go, 1);
1466   int           infmt     = eslMSAFILE_UNKNOWN;
1467   int           outfmt    = eslMSAFILE_UNKNOWN;
1468   ESL_ALPHABET *abc       = NULL;
1469   ESL_MSAFILE  *afp       = NULL;
1470   ESL_MSA      *msa       = NULL;
1471   int           textmode  = esl_opt_GetBoolean(go, "--text");
1472   int           showinfo  = esl_opt_GetBoolean(go, "-i");
1473   int           nali      = 0;
1474   int           status;
1475 
1476   /* If you know the alphabet you want, create it - you'll pass it to esl_msafile_Open() */
1477   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
1478   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
1479   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1480 
1481   /* If you know the MSA file format, set it (<infmt>, here). */
1482   if (esl_opt_IsOn(go, "--informat") &&
1483       (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
1484     esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat"));
1485 
1486   /* Open in text or digital mode.
1487    *   To let the Open() function autoguess the format, you pass <infmt=eslMSAFILE_UNKNOWN>.
1488    *   To let it autoguess the alphabet, you set <abc=NULL> and pass <&abc>.
1489    *   To open in text mode instead of digital, you pass <NULL> for the alphabet argument.
1490    * esl_msafile_OpenFailure() is a convenience, printing various diagnostics of any
1491    * open failure to <stderr>. You can of course handle your own diagnostics instead.
1492    */
1493   if (textmode) status = esl_msafile_Open(NULL, msafile, NULL, infmt, NULL, &afp);
1494   else          status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp);
1495   if (status != eslOK)   esl_msafile_OpenFailure(afp, status);
1496 
1497   if (showinfo) {
1498     printf("# Format:    %s\n", esl_msafile_DecodeFormat(afp->format));
1499     printf("# Alphabet:  %s\n", (afp->abc ? esl_abc_DecodeType(afp->abc->type) : "text mode"));
1500   }
1501 
1502   /* Choose the output file format */
1503   if ( (outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"))) == eslMSAFILE_UNKNOWN)
1504     esl_fatal("%s is not a valid MSA file format for --outformat", esl_opt_GetString(go, "--outformat"));
1505 
1506   while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
1507     {
1508       /* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data;
1509        * if text MSA:  msa->aseq[idx=0..nseq-1][acol=0..alen-1] */
1510       nali++;
1511 
1512       if (showinfo) printf("# alignment %5d: %15s: %6d seqs, %5d columns\n\n", nali, msa->name, (int) msa->nseq, (int) msa->alen);
1513       else   	    esl_msafile_Write(stdout, msa, outfmt);
1514 
1515       esl_msa_Destroy(msa);
1516     }
1517   if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */
1518 
1519   esl_alphabet_Destroy(abc);
1520   esl_msafile_Close(afp);
1521   esl_getopts_Destroy(go);
1522   exit(0);
1523 }
1524 /*::cexcerpt::msafile_example::end::*/
1525 #endif /*eslMSAFILE_EXAMPLE*/
1526 /*------------------------ end of examples -----------------------*/
1527 
1528