1 /* i/o of multiple sequence alignment files in Clustal-like formats
2  *
3  * Contents:
4  *   1. API for reading/writing Clustal and Clustal-like formats
5  *   2. Internal routines for Clustal formats.
6  *   3. Unit tests.
7  *   4. Test driver.
8  *   5. Example.
9  *
10  * This module is responsible for i/o of both eslMSAFILE_CLUSTAL and
11  * eslMSAFILE_CLUSTALLIKE alignment formats.
12  */
13 #include "esl_config.h"
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <ctype.h>
19 
20 #include "easel.h"
21 #include "esl_alphabet.h"
22 #include "esl_mem.h"
23 #include "esl_msa.h"
24 #include "esl_msafile.h"
25 
26 #include "esl_msafile_clustal.h"
27 
28 static int make_text_consensus_line(const ESL_MSA *msa, char **ret_consline);
29 static int make_digital_consensus_line(const ESL_MSA *msa, char **ret_consline);
30 
31 /*****************************************************************
32  *# 1. API for reading/writing Clustal and Clustal-like formats
33  *****************************************************************/
34 
35 /* Function:  esl_msafile_clustal_SetInmap()
36  * Synopsis:  Configure input map for CLUSTAL, CLUSTALLIKE formats.
37  *
38  * Purpose:   Set the <afp->inmap> for Clustal-like formats.
39  *
40  *            Text mode accepts any <isgraph()> character.
41  *            Digital mode enforces the usual Easel alphabets.
42  */
43 int
esl_msafile_clustal_SetInmap(ESL_MSAFILE * afp)44 esl_msafile_clustal_SetInmap(ESL_MSAFILE *afp)
45 {
46   int sym;
47 
48   if (afp->abc)
49     {
50       for (sym = 0; sym < 128; sym++)
51 	afp->inmap[sym] = afp->abc->inmap[sym];
52       afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
53     }
54 
55   if (! afp->abc)
56     {
57       for (sym = 1; sym < 128; sym++)
58 	afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
59       afp->inmap[0] = '?';
60     }
61   return eslOK;
62 }
63 
64 
65 /* Function:  esl_msafile_clustal_GuessAlphabet()
66  * Synopsis:  Guess the alphabet of an open Clustal MSA input.
67  *
68  * Purpose:   Guess the alpbabet of the sequences in open
69  *            Clustal format MSA file <afp>.
70  *
71  *            On a normal return, <*ret_type> is set to <eslDNA>,
72  *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
73  *            original position.
74  *
75  * Args:      afp      - open Clustal format MSA file
76  *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
77  *
78  * Returns:   <eslOK> on success.
79  *            <eslENOALPHABET> if alphabet type can't be determined.
80  *            In either case, <afp> is rewound to the position it
81  *            started at.
82  *
83  * Throws:    <eslEMEM> on allocation error.
84  *            <eslESYS> on failures of fread() or other system calls
85  */
86 int
esl_msafile_clustal_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)87 esl_msafile_clustal_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
88 {
89   int       alphatype     = eslUNKNOWN;
90   esl_pos_t anchor        = -1;
91   int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
92   int       nsteps        = 3;
93   int       step          = 0;
94   int       nres          = 0;
95   int       x;
96   int64_t   ct[26];
97   char     *p, *tok;
98   esl_pos_t n,  toklen, pos;
99   int       status;
100 
101   for (x = 0; x < 26; x++) ct[x] = 0;
102 
103   anchor = esl_buffer_GetOffset(afp->bf);
104   if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
105 
106   /* Ignore the first nonblank line, which says "CLUSTAL W (1.83) multiple sequence alignment" or some such */
107   while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK  && esl_memspn(p, n, " \t") == n) ;
108   if      (status == eslEOF) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "can't determine alphabet: no alignment data found");
109   else if (status != eslOK)  goto ERROR;
110 
111   while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
112     {
113       if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* ignore blank lines */
114       /* p now points to the rest of the sequence line, after a name */
115 
116       /* count characters into ct[] array */
117       for (pos = 0; pos < n; pos++)
118 	if (isalpha(p[pos])) {
119 	  x = toupper(p[pos]) - 'A';
120 	  ct[x]++;
121 	  nres++;
122 	}
123       /* note that GuessAlphabet() is robust against the optional coord lines
124        * and the annotation lines -- it only counts ascii characters.
125        */
126 
127       /* try to stop early, checking after 500, 5000, and 50000 residues: */
128       if (step < nsteps && nres > threshold[step]) {
129 	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
130 	step++;
131       }
132     }
133   if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
134   status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
135 
136  DONE:
137   esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
138   esl_buffer_RaiseAnchor(afp->bf, anchor);
139   *ret_type = alphatype;
140   return status;
141 
142  ERROR:
143   if (anchor != -1) {
144     esl_buffer_SetOffset(afp->bf, anchor);
145     esl_buffer_RaiseAnchor(afp->bf, anchor);
146   }
147   *ret_type = eslUNKNOWN;
148   return status;
149 }
150 
151 
152 /* Function:  esl_msafile_clustal_Read()
153  * Synopsis:  Read in a CLUSTAL or CLUSTALLIKE alignment.
154  *
155  * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
156  *            for Clustal or Clustal-like format, starting from the
157  *            current point. (<afp->format> is expected to be
158  *            <eslMSAFILE_CLUSTAL> or <eslMSAFILE_CLUSTALLIKE>.) Create a
159  *            new multiple alignment, and return a ptr to that
160  *            alignment in <*ret_msa>.  Caller is responsible for
161  *            free'ing this <ESL_MSA>.
162  *
163  * Args:      afp     - open <ESL_MSAFILE>
164  *            ret_msa - RETURN: newly parsed <ESL_MSA>
165  *
166  * Returns:   <eslOK> on success.
167  *
168  *            <eslEOF> if no (more) alignment data are found in
169  *            <afp>, and <afp> is returned at EOF.
170  *
171  *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
172  *            <NULL>. <afp> contains information sufficient for
173  *            constructing useful diagnostic output:
174  *            | <afp->errmsg>       | user-directed error message     |
175  *            | <afp->linenumber>   | line # where error was detected |
176  *            | <afp->line>         | offending line (not NUL-term)   |
177  *            | <afp->n>            | length of offending line        |
178  *            | <afp->bf->filename> | name of the file                |
179  *            and <afp> is poised at the start of the following line,
180  *            so (in principle) the caller could try to resume
181  *            parsing.
182  *
183  * Throws:    <eslEMEM> - an allocation failed.
184  *            <eslESYS> - a system call such as fread() failed
185  *            <eslEINCONCEIVABLE> - "impossible" corruption
186  */
187 int
esl_msafile_clustal_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)188 esl_msafile_clustal_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
189 {
190   ESL_MSA  *msa     = NULL;
191   char     *p       = NULL;
192   esl_pos_t n       = 0;
193   char     *tok     = NULL;
194   esl_pos_t ntok    = 0;
195   int       nblocks = 0;
196   int       idx     = 0;
197   int       nseq    = 0;
198   int64_t   alen    = 0;
199   int64_t   cur_alen;
200   esl_pos_t pos;
201   esl_pos_t name_start, name_len;
202   esl_pos_t seq_start, seq_len;
203   esl_pos_t block_seq_start, block_seq_len;
204   int       status;
205 
206   ESL_DASSERT1( (afp->format == eslMSAFILE_CLUSTAL || afp->format == eslMSAFILE_CLUSTALLIKE) );
207 
208   afp->errmsg[0] = '\0';
209 
210   if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
211   if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
212 
213   /* skip leading blank lines in file */
214   while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
215   if      (status != eslOK)  goto ERROR; /* includes normal EOF */
216 
217   /* That first line says something like: "CLUSTAL W (1.83) multiple sequence alignment" */
218   if (esl_memtok(&p, &n, " \t", &tok, &ntok) != eslOK)                             ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");
219   if (afp->format == eslMSAFILE_CLUSTAL && ! esl_memstrpfx(tok, ntok, "CLUSTAL"))  ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");
220   if (! esl_memstrcontains(p, n, "multiple sequence alignment"))                   ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");
221 
222   /* skip blank lines again */
223   do {
224     status = esl_msafile_GetLine(afp, &p, &n);
225     if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no alignment data following header");
226     else if (status != eslOK) goto ERROR;
227   } while (esl_memspn(afp->line, afp->n, " \t") == afp->n); /* idiom for "blank line" */
228 
229   /* Read the file a line at a time. */
230   do { 		/* afp->line, afp->n is now the first line of a block... */
231     idx = 0;
232     do {
233       for (pos = 0;     pos < n; pos++) if (! isspace(p[pos])) break;
234       name_start = pos;
235       for (pos = pos+1; pos < n; pos++) if (  isspace(p[pos])) break;
236       name_len   = pos - name_start;
237       for (pos = pos+1; pos < n; pos++) if (! isspace(p[pos])) break;
238       seq_start  = pos;
239       if (pos >= n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
240       for (pos = pos+1; pos < n; pos++) if (  isspace(p[pos])) break;
241       seq_len    = pos - seq_start; /* expect one block; ignore trailing stuff, inc. optional coords */
242 
243       if (idx == 0) {
244 	block_seq_start = seq_start;
245 	block_seq_len   = seq_len;
246       } else {
247 	if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
248 	if (seq_len   != block_seq_len)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
249       }
250 
251       /* Store the sequence name. */
252       if (nblocks == 0)	{
253 	/* make sure we have room for another sequence */
254 	if (idx >= msa->sqalloc &&  (status = esl_msa_Expand(msa))           != eslOK) goto ERROR;
255 	if ( (status = esl_msa_SetSeqName(msa, idx, p+name_start, name_len)) != eslOK) goto ERROR;
256 	nseq++;
257       } else {
258 	if (! esl_memstrcmp(p+name_start, name_len, msa->sqname[idx]))
259 	  ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, p+name_start);
260       }
261 
262       /* Append the sequence. */
263       cur_alen = alen;
264       if (msa->abc)    { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]),   &(cur_alen), p+seq_start, seq_len); }
265       if (! msa->abc)  { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), p+seq_start, seq_len); }
266       if      (status == eslEINVAL)    ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
267       else if (status != eslOK)        goto ERROR;
268       if (cur_alen - alen != seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");
269 
270       /* get next line. if it's a consensus line, we're done with the block */
271       status = esl_msafile_GetLine(afp, &p, &n);
272       if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "alignment block did not end with consensus line");
273       else if (status != eslOK)  goto ERROR;
274 
275       idx++;
276     } while (esl_memspn(afp->line, afp->n, " .:*") < afp->n); /* end loop over a block */
277 
278     if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
279 
280     /* skip blank lines until we find start of next block, or EOF */
281     do {
282       status = esl_msafile_GetLine(afp, &p, &n);
283       if      (status == eslEOF) break;
284       else if (status != eslOK)  goto ERROR;
285     } while (esl_memspn(p, n, " \t") == n);
286 
287     alen += block_seq_len;
288     nblocks++;
289   } while (status == eslOK);	/* normal end has status == EOF after last block. */
290 
291   msa->nseq = nseq;
292   msa->alen = alen;
293   if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
294   *ret_msa = msa;
295   return eslOK;
296 
297  ERROR:
298   if (msa) esl_msa_Destroy(msa);
299   *ret_msa = NULL;
300   return status;
301 }
302 
303 
304 /* Function:  esl_msafile_clustal_Write()
305  * Synopsis:  Write a CLUSTAL format alignment file to a stream.
306  *
307  * Purpose:   Write alignment <msa> to output stream <fp>, in
308  *            format <fmt>. If <fmt> is <eslMSAFILE_CLUSTAL>,
309  *            write strict CLUSTAL 2.1 format. If <fmt>
310  *            is <eslMSAFILE_CLUSTALLIKE>, put "EASEL (VERSION)"
311  *            in the header.
312  *
313  *            The alignment is written in blocks of 60 aligned
314  *            residues at a time.
315  *
316  *            Constructing the CLUSTAL consensus line properly
317  *            requires knowing the alphabet. If the <msa> is in text
318  *            mode, we don't know the alphabet, so then we use a
319  *            simplified consensus line, with '*' marking completely
320  *            conserved columns, ' ' on everything else. If the <msa>
321  *            is in digital mode and of type <eslAMINO>, then we also
322  *            use Clustal's "strong" and "weak" residue group
323  *            annotations, ':' and '.'.  Strong groups are STA, NEQK,
324  *            NHQK, NDEQ, QHRK, MILV, MILF, HY, and FYW. Weak groups
325  *            are CSA, ATV, SAG, STNK, STPA, SGND, SNDEQK, NDEQHK,
326  *            NEQHRK, FVLIM, and HFY.
327  *
328  * Args:      fp  - open output stream, writable
329  *            msa - alignment to write
330  *            fmt - eslMSAFILE_CLUSTAL or eslMSAFILE_CLUSTALLIKE
331  *
332  * Returns:   <eslOK> on success.
333  *
334  * Throws:    <eslEMEM> on allocation error.
335  *            <eslEWRITE> on any system write error, such as filled disk.
336  */
337 int
esl_msafile_clustal_Write(FILE * fp,const ESL_MSA * msa,int fmt)338 esl_msafile_clustal_Write(FILE *fp, const ESL_MSA *msa, int fmt)
339 {
340   int       cpl        = 60;
341   int       maxnamelen = 0;
342   int       namelen;
343   char     *consline   = NULL;
344   char     *buf        = NULL;
345   int64_t   apos;
346   int       i;
347   int       status;
348 
349   ESL_ALLOC(buf, sizeof(char) * (cpl+1));
350   buf[cpl] = '\0';
351   for (i = 0; i < msa->nseq; i++)
352     {
353       namelen = strlen(msa->sqname[i]);
354       maxnamelen = ESL_MAX(namelen, maxnamelen);
355     }
356 
357   /* Make a CLUSTAL-like consensus line */
358   if (  msa->abc && (status = make_digital_consensus_line(msa, &consline)) != eslOK) goto ERROR;
359   if (! msa->abc && (status = make_text_consensus_line   (msa, &consline)) != eslOK) goto ERROR;
360 
361   /* The magic header */
362   if      (fmt == eslMSAFILE_CLUSTAL)     { if (fprintf(fp, "CLUSTAL 2.1 multiple sequence alignment\n")               < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");  }
363   else if (fmt == eslMSAFILE_CLUSTALLIKE) { if (fprintf(fp, "EASEL (%s) multiple sequence alignment\n", EASEL_VERSION) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");  }
364 
365   /* The alignment */
366   for (apos = 0; apos < msa->alen; apos += cpl)
367     {
368       if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");
369       for (i = 0; i < msa->nseq; i++)
370 	{
371 	  if (msa->abc)   esl_abc_TextizeN(msa->abc, msa->ax[i]+apos+1, cpl, buf);
372 	  if (! msa->abc) strncpy(buf, msa->aseq[i]+apos, cpl);
373 	  if (fprintf(fp, "%-*s %s\n", maxnamelen, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");
374 	}
375       strncpy(buf, consline+apos, cpl);
376       if (fprintf(fp, "%-*s %s\n", maxnamelen, "", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");
377     }
378 
379   free(buf);
380   free(consline);
381   return eslOK;
382 
383  ERROR:
384   if (buf)      free(buf);
385   if (consline) free(consline);
386   return status;
387 }
388 /*---------------- end, Clustal API -----------------------------*/
389 
390 
391 
392 /*****************************************************************
393  * 2. Internal routines for Clustal formats
394  *****************************************************************/
395 
396 /* Clustal consensus lines.
397  *    '*' :  100% conserved positions
398  *    ':' :  all residues in column belong to a "strong group"
399  *    '.' :  all residues in column belong to a "weak group"
400  *    ' ' :  otherwise
401  *
402  * Gap characters count, and ambiguity codes are interpreted verbatim,
403  * so even a single gap or ambiguity code makes the column a ' '.
404  *
405  * From examining the source code for ClustalW (as it writes its
406  * "self explanatory format", ahem!):
407  *   strong groups = STA, NEQK, NHQK, NDEQ, QHRK, MILV, MILF,
408  *                   HY,  FYW
409  *   weak groups =   CSA, ATV,  SAG,  STNK, STPA, SGND, SNDEQK,
410  *	             NDEQHK, NEQHRK, FVLIM, HFY
411  *
412  * These groups only apply to protein data, and therefore only to
413  * digital alignments using an <eslAMINO> alphabet.
414 
415  * Calculating the consensus line can be compute-intensive, for large
416  * alignments. A naive implementation (for each column, collect
417  * residue counts, compare to each conservation group) was judged too
418  * slow: 16.2s to write the Pkinase full alignment, compared to 1.5s
419  * to write Stockholm format [SRE:J8/22]. Here we use a slightly less
420  * naive implementation, which collects a bit vector (one bit per
421  * residue) for each column, and traverses the alignment in stride
422  * (sequences, then columns). Writing Clustal format Pkinase now takes
423  * 2.3s, and most of the difference w.r.t. Stockholm is now assignable
424  * to the smaller width (thus greater number of blocks) written for
425  * Clustal (60 cpl vs 200) rather than to consensus construction.
426  *
427  * An oversophisticated approach could use a finite
428  * automaton to store all groups in one machine, then to use the FA to
429  * process each residue seen in a column; for most columns, we would
430  * quickly reach a rejection state (most columns don't belong to
431  * a conservation group, especially in large alignments). For a sketch
432  * of how to construct and use such an automaton, xref SRE:J8/22.
433  * I decided this was probably overkill, and didn't implement it.
434  */
435 
436 
437 /* make_text_consensus_line()
438  *
439  * Given a text mode <msa>, allocate and create a CLUSTAL-style
440  * consensus line; return it in <*ret_consline>. Caller is responsible
441  * for free'ing this string.
442  *
443  * In text mode, we don't know the alphabet; in particular, we can't
444  * know if the data are amino acids, so we don't know if it's
445  * appropriate to use the amino acid group codes. So we don't;
446  * in text mode, only '*' and ' ' appear in consensus lines.
447  *
448  * The consensus line is numbered 0..alen-1, and is NUL-terminated.
449  *
450  * Returns <eslOK> on success.
451  * No normal failure codes.
452  * Throws <eslEMEM> on allocation error.
453  */
454 static int
make_text_consensus_line(const ESL_MSA * msa,char ** ret_consline)455 make_text_consensus_line(const ESL_MSA *msa, char **ret_consline)
456 {
457   char     *consline = NULL;
458   uint32_t *v        = NULL;
459   uint32_t  tmpv, maxv;
460   int       n;
461   int       idx, apos, x;
462   int       status;
463 
464   ESL_ALLOC(consline, sizeof(char)     * (msa->alen+1));
465   ESL_ALLOC(v,        sizeof(uint32_t) * (msa->alen));
466   for (apos = 0; apos < msa->alen; apos++)
467     v[apos] = 0;
468 
469   for (idx = 0; idx < msa->nseq; idx++)
470     for (apos = 0; apos < msa->alen; apos++)
471       {
472 	x = toupper(msa->aseq[idx][apos]) - 'A';
473 	if (x >= 0 && x < 26) v[apos] |= (1 <<  x);
474 	else                  v[apos] |= (1 << 26);
475       }
476   maxv = (1 << 26) - 1;
477 
478   for (apos = 0; apos < msa->alen; apos++)
479     {
480       for (n = 0, tmpv = v[apos]; tmpv; n++) tmpv &= tmpv-1; /* Kernighan magic: count # of bits set in tmpv */
481       consline[apos] = ((n == 1 && v[apos] < maxv) ? '*' : ' ');
482     }
483   consline[msa->alen] = '\0';
484 
485   *ret_consline = consline;
486   free(v);
487   return eslOK;
488 
489  ERROR:
490   if (v)        free(v);
491   if (consline) free(consline);
492   *ret_consline = NULL;
493   return status;
494 }
495 
496 
497 /* make_digital_consensus_line()
498  *
499  * Exactly the same as make_text_consensus_line(), except for
500  * digital mode <msa>.
501  */
502 static int
matches_group_digital(ESL_ALPHABET * abc,uint32_t v,char * group)503 matches_group_digital(ESL_ALPHABET *abc, uint32_t v, char *group)
504 {
505   uint32_t gv  = 0;
506   ESL_DSQ  sym;
507   char    *c;
508 
509   for (c = group; *c; c++) {
510     sym = esl_abc_DigitizeSymbol(abc, *c);
511     gv |= (1 << sym);
512   }
513   return ( ((v & gv) == v) ? TRUE : FALSE);
514 }
515 
516 static int
make_digital_consensus_line(const ESL_MSA * msa,char ** ret_consline)517 make_digital_consensus_line(const ESL_MSA *msa, char **ret_consline)
518 {
519   char     *consline = NULL;
520   uint32_t *v        = NULL;
521   uint32_t  tmpv, maxv;
522   int       n;
523   int       idx, apos;
524   int       status;
525 
526   /* if this ever becomes a problem, easy enough to make v a uint64_t to get up to Kp<=64 */
527   if (msa->abc->Kp > 32) ESL_EXCEPTION(eslEINVAL, "Clustal format writer cannot handle digital alphabets of Kp>32 residues");
528 
529   ESL_ALLOC(v,        sizeof(uint32_t) * (msa->alen+1));
530   ESL_ALLOC(consline, sizeof(char)     * (msa->alen+1));
531   for (apos = 0; apos <= msa->alen; apos++)
532     v[apos] = 0;
533 
534   for (idx = 0; idx < msa->nseq; idx++)
535     for (apos = 1; apos <= msa->alen; apos++)
536       v[apos] |= (1 << msa->ax[idx][apos]);
537 
538   maxv = (1 << msa->abc->K) - 1; /* maxv: has all canonical residue bits set */
539 
540   for (apos = 1; apos <= msa->alen; apos++)
541     {
542       consline[apos-1] = ' ';
543 
544       for (n = 0, tmpv = v[apos]; tmpv; n++) tmpv &= tmpv-1; /* Kernighan magic: count # of bits set in tmpv */
545 
546       if      (n == 0 || n > 6)  continue;               /* n==0 shouldn't happen; n > 6 means too many different residues seen */
547       else if (v[apos] > maxv)   continue;	         /* gap or ambiguity chars seen; column must be left unannotated */
548       else if (n == 1)           consline[apos-1] = '*'; /* complete conservation of a canonical residue */
549       else if (msa->abc->type == eslAMINO)
550 	{
551 	  if      (matches_group_digital(msa->abc, v[apos], "STA"))  consline[apos-1] = ':';
552 	  else if (matches_group_digital(msa->abc, v[apos], "NEQK")) consline[apos-1] = ':';
553 	  else if (matches_group_digital(msa->abc, v[apos], "NHQK")) consline[apos-1] = ':';
554 	  else if (matches_group_digital(msa->abc, v[apos], "NDEQ")) consline[apos-1] = ':';
555 	  else if (matches_group_digital(msa->abc, v[apos], "QHRK")) consline[apos-1] = ':';
556 	  else if (matches_group_digital(msa->abc, v[apos], "MILV")) consline[apos-1] = ':';
557 	  else if (matches_group_digital(msa->abc, v[apos], "MILF")) consline[apos-1] = ':';
558 	  else if (matches_group_digital(msa->abc, v[apos], "HY"))   consline[apos-1] = ':';
559 	  else if (matches_group_digital(msa->abc, v[apos], "FYW"))  consline[apos-1] = ':';
560 
561 	  else if (matches_group_digital(msa->abc, v[apos], "CSA"))    consline[apos-1] = '.';
562 	  else if (matches_group_digital(msa->abc, v[apos], "ATV"))    consline[apos-1] = '.';
563 	  else if (matches_group_digital(msa->abc, v[apos], "SAG"))    consline[apos-1] = '.';
564 	  else if (matches_group_digital(msa->abc, v[apos], "STNK"))   consline[apos-1] = '.';
565 	  else if (matches_group_digital(msa->abc, v[apos], "STPA"))   consline[apos-1] = '.';
566 	  else if (matches_group_digital(msa->abc, v[apos], "SGND"))   consline[apos-1] = '.';
567 	  else if (matches_group_digital(msa->abc, v[apos], "SNDEQK")) consline[apos-1] = '.';
568 	  else if (matches_group_digital(msa->abc, v[apos], "NDEQHK")) consline[apos-1] = '.';
569 	  else if (matches_group_digital(msa->abc, v[apos], "NEQHRK")) consline[apos-1] = '.';
570 	  else if (matches_group_digital(msa->abc, v[apos], "FVLIM"))  consline[apos-1] = '.';
571 	  else if (matches_group_digital(msa->abc, v[apos], "HFY"))    consline[apos-1] = '.';
572 	}
573     }
574   consline[apos-1] = '\0';
575 
576   *ret_consline = consline;
577   free(v);
578   return eslOK;
579 
580  ERROR:
581   if (v)        free(v);
582   if (consline) free(consline);
583   *ret_consline = NULL;
584   return eslOK;
585 }
586 /*-------------- end, internal clustal routines -----------------*/
587 
588 
589 /*****************************************************************
590  * 3. Unit tests.
591  *****************************************************************/
592 #ifdef eslMSAFILE_CLUSTAL_TESTDRIVE
593 
594 static void
utest_write_good1(FILE * ofp,int * ret_format,int * ret_alphatype,int * ret_nseq,int * ret_alen)595 utest_write_good1(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
596 {
597   fputs("MUSCLE (3.7) multiple sequence alignment\n", ofp);
598   fputs("\n", ofp);
599   fputs("\n", ofp);
600   fputs("MYG_PHYCA       --------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\n", ofp);
601   fputs("GLB5_PETMA      PIVDTGSVAPLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
602   fputs("HBB_HUMAN       --------VHLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\n", ofp);
603   fputs("HBA_HUMAN       --------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\n", ofp);
604   fputs("                        . *:  :.  :   *. *       * : * .::   * :   *  *     \n", ofp);
605   fputs("\n", ofp);
606   fputs("MYG_PHYCA       EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEAELKPLAQSHATKHKIPIKYLEFIS\n", ofp);
607   fputs("GLB5_PETMA      ADQLKKSADVRWHAERIINAVNDAVASMDDTEKMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
608   fputs("HBB_HUMAN       PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\n", ofp);
609   fputs("HBA_HUMAN       -DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\n", ofp);
610   fputs("                                                                            \n", ofp); /* deliberately made blank */
611   fputs("\n", ofp);
612   fputs("MYG_PHYCA       EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
613   fputs("GLB5_PETMA      AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
614   fputs("HBB_HUMAN       NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
615   fputs("HBA_HUMAN       HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
616   fputs("                  :          :  .   .. :* :  .   :   *       \n", ofp);
617   fputs("\n", ofp);
618 
619   *ret_format    = eslMSAFILE_CLUSTALLIKE;
620   *ret_alphatype = eslAMINO;
621   *ret_nseq      = 4;
622   *ret_alen      = 165;
623 }
624 
625 static void
utest_write_good2(FILE * ofp,int * ret_format,int * ret_alphatype,int * ret_nseq,int * ret_alen)626 utest_write_good2(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
627 {
628   fputs("CLUSTAL W (1.81) multiple sequence alignment\n", ofp);
629   fputs("\n", ofp);
630   fputs("tRNA2           UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGG-AGACCGGGGUUCGACU\n", ofp);
631   fputs("tRNA3           UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCCAGAUCGGGGUUCAAUU\n", ofp);
632   fputs("tRNA5           GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGAGGUCAUCGGUUCGAUU\n", ofp);
633   fputs("tRNA1           GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUC\n", ofp);
634   fputs("tRNA4           GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUUGGUCCUUAGUUCGAUC\n", ofp);
635   fputs("                       * *   *   *  *           *            *      **** *  \n", ofp);
636   fputs("\n", ofp);
637   fputs("tRNA2           CCCCGUAUCGGAG\n", ofp);
638   fputs("tRNA3           CCCCGUCGCGGAG\n", ofp);
639   fputs("tRNA5           CCGGUUGCGUCCA\n", ofp);
640   fputs("tRNA1           CACAGAAUUCGCA\n", ofp);
641   fputs("tRNA4           CUGAGUGCGAGCU\n", ofp);
642   fputs("                *            \n", ofp);
643 
644   *ret_format    = eslMSAFILE_CLUSTAL;
645   *ret_alphatype = eslRNA;
646   *ret_nseq      = 5;
647   *ret_alen      = 73;
648 }
649 
650 /* An example of clustal format with optional sequence coords;
651  * a quickly-taken subset of a larger alignment reported as a bug.
652  */
653 static void
utest_write_good3(FILE * ofp,int * ret_format,int * ret_alphatype,int * ret_nseq,int * ret_alen)654 utest_write_good3(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
655 {
656   fputs("CLUSTAL 2.1 multiple sequence alignment\n", ofp);
657   fputs("\n", ofp);
658   fputs("gi|85091828|ref|XP_959093.1|        MSDFTSKVKVLRDGQKPEFPSN----ANTLEYAQSLDAQDELRHFRNEFI 46\n", ofp);
659   fputs("gi|70993990|ref|XP_751842.1|        ----MSTNGTLS---KPEFPAN----AASKEYAASLDAADPFAGFREKFI 39\n", ofp);
660   fputs("gi|71001376|ref|XP_755369.1|        ---MGSRLHVQVIHGGPPLPYKDDIRAFGKEYAEQLDAQDPLRRFRDEFI 47\n", ofp);
661   fputs("gi|71744026|ref|XP_803513.1|        -----------------------------------MDRNDPLQVHRDAFN 15\n", ofp);
662   fputs("                                                                                 :  : \n",    ofp);
663   fputs("\n", ofp);
664   fputs("gi|85091828|ref|XP_959093.1|        IPTRASLKKKALDGI--------------IPGTQANGTTTSTDADTPCIY 82\n", ofp);
665   fputs("gi|70993990|ref|XP_751842.1|        IPSKANIASTKLA----------------KPGLSSE----------PCIY 63\n", ofp);
666   fputs("gi|71001376|ref|XP_755369.1|        IPSKKDLKRKTLFPNDGMYSCGHPICFANTSCACVHAAETEETSDEKCIY 97\n", ofp);
667   fputs("gi|71744026|ref|XP_803513.1|        IPKRRDGS--------------------------------------DHVY 27\n", ofp);
668   fputs("                                                                                     *\n",    ofp);
669 
670   *ret_format    = eslMSAFILE_CLUSTAL;
671   *ret_alphatype = eslAMINO;
672   *ret_nseq      = 4;
673   *ret_alen      = 100;
674 }
675 
676 
677 static void
utest_goodfile(char * filename,int testnumber,int expected_format,int expected_alphatype,int expected_nseq,int expected_alen)678 utest_goodfile(char *filename, int testnumber, int expected_format, int expected_alphatype, int expected_nseq, int expected_alen)
679 {
680   ESL_ALPHABET        *abc          = NULL;
681   ESL_MSAFILE         *afp          = NULL;
682   ESL_MSA             *msa1         = NULL;
683   ESL_MSA             *msa2         = NULL;
684   char                 tmpfile1[32] = "esltmpXXXXXX";
685   char                 tmpfile2[32] = "esltmpXXXXXX";
686   FILE                *ofp          = NULL;
687   int                  status;
688 
689   /* guessing both the format and the alphabet should work: this is a digital open */
690   if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: digital open",           testnumber);
691   if (afp->format != expected_format)                                                              esl_fatal("clustal good file test %d failed: format autodetection",   testnumber);
692   if (abc->type   != expected_alphatype)                                                           esl_fatal("clustal good file test %d failed: alphabet autodetection", testnumber);
693 
694   /* This is a digital read, using <abc>. */
695   if ( (status = esl_msafile_clustal_Read(afp, &msa1))   != eslOK) esl_fatal("clustal good file test %d failed: msa read, digital", testnumber);
696   if (msa1->nseq != expected_nseq || msa1->alen != expected_alen)  esl_fatal("clustal good file test %d failed: nseq/alen",         testnumber);
697   if (esl_msa_Validate(msa1, NULL) != eslOK)                       esl_fatal("clustal good file test %d failed: msa1 invalid",      testnumber);
698   esl_msafile_Close(afp);
699 
700   /* write it back out to a new tmpfile (digital write) */
701   if ( (status = esl_tmpfile_named(tmpfile1, &ofp))                     != eslOK) esl_fatal("clustal good file test %d failed: tmpfile creation",   testnumber);
702   if ( (status = esl_msafile_clustal_Write(ofp, msa1, expected_format)) != eslOK) esl_fatal("clustal good file test %d failed: msa write, digital", testnumber);
703   fclose(ofp);
704 
705   /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
706   if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, expected_format, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: text mode open", testnumber);
707   if ( (status = esl_msafile_clustal_Read(afp, &msa2))                                != eslOK) esl_fatal("clustal good file test %d failed: msa read, text", testnumber);
708   if (msa2->nseq != expected_nseq || msa2->alen != expected_alen)                               esl_fatal("clustal good file test %d failed: nseq/alen",      testnumber);
709   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                    esl_fatal("clustal good file test %d failed: msa2 invalid",   testnumber);
710   esl_msafile_Close(afp);
711 
712   /* write it back out to a new tmpfile (text write) */
713   if ( (status = esl_tmpfile_named(tmpfile2, &ofp))                     != eslOK) esl_fatal("clustal good file test %d failed: tmpfile creation", testnumber);
714   if ( (status = esl_msafile_clustal_Write(ofp, msa2, expected_format)) != eslOK) esl_fatal("clustal good file test %d failed: msa write, text",  testnumber);
715   fclose(ofp);
716   esl_msa_Destroy(msa2);
717 
718   /* open and read it in digital mode */
719   if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, expected_format, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: 2nd digital mode open", testnumber);
720   if ( (status = esl_msafile_clustal_Read(afp, &msa2))                                != eslOK) esl_fatal("clustal good file test %d failed: 2nd digital msa read",  testnumber);
721   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                    esl_fatal("clustal good file test %d failed: msa2 invalid",          testnumber);
722   esl_msafile_Close(afp);
723 
724   /* this msa <msa2> should be identical to <msa1> */
725   if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("clustal good file test %d failed: msa compare", testnumber);
726 
727   remove(tmpfile1);
728   remove(tmpfile2);
729   esl_msa_Destroy(msa1);
730   esl_msa_Destroy(msa2);
731   esl_alphabet_Destroy(abc);
732 }
733 
734 static void
write_test_msas(FILE * ofp1,FILE * ofp2)735 write_test_msas(FILE *ofp1, FILE *ofp2)
736 {
737   fprintf(ofp1, "EASEL (X.x) multiple sequence alignment\n");
738   fprintf(ofp1, "\n");
739   fprintf(ofp1, "seq1 ..acdefghiklmnpqrstvwy\n");
740   fprintf(ofp1, "seq2 ..acdefghiklmnpqrstv--\n");
741   fprintf(ofp1, "seq3 aaacdefghiklmnpqrstv--\n");
742   fprintf(ofp1, "seq4 ..acdefghiklmnpqrstvwy\n");
743   fprintf(ofp1, "       ******************  \n");
744   fprintf(ofp1, "\n");
745   fprintf(ofp1, "seq1 ACDEFGHIKLMNPQRSTVWY\n");
746   fprintf(ofp1, "seq2 ACDEFGHIKLMNPQRSTVWY\n");
747   fprintf(ofp1, "seq3 ACDEFGHIKLMNPQRSTVWY\n");
748   fprintf(ofp1, "seq4 ACDEFGHIKLMNPQRSTVWY\n");
749   fprintf(ofp1, "     ********************\n");
750   fprintf(ofp1, "\n");
751   fprintf(ofp1, "seq1 ..\n");
752   fprintf(ofp1, "seq2 YY\n");
753   fprintf(ofp1, "seq3 ..\n");
754   fprintf(ofp1, "seq4 ..\n");
755   fprintf(ofp1, "\n");
756 
757   fprintf(ofp2, "# STOCKHOLM 1.0\n");
758   fprintf(ofp2, "\n");
759   fprintf(ofp2, "seq1    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
760   fprintf(ofp2, "seq2    ..acdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWYYY\n");
761   fprintf(ofp2, "seq3    aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
762   fprintf(ofp2, "seq4    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
763   fprintf(ofp2, "//\n");
764 }
765 
766 static void
read_test_msas_digital(char * alnfile,char * stkfile)767 read_test_msas_digital(char *alnfile, char *stkfile)
768 {
769   char msg[]         = "CLUSTAL msa digital read unit test failed";
770   ESL_ALPHABET *abc  = NULL;
771   ESL_MSAFILE  *afp1 = NULL;
772   ESL_MSAFILE  *afp2 = NULL;
773   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
774   FILE         *alnfp, *stkfp;
775   char          alnfile2[32] = "esltmpaln2XXXXXX";
776   char          stkfile2[32] = "esltmpstk2XXXXXX";
777 
778   if ( esl_msafile_Open(&abc, alnfile, NULL, eslMSAFILE_CLUSTALLIKE, NULL, &afp1) != eslOK)  esl_fatal(msg);
779   if ( !abc || abc->type != eslAMINO)                                                        esl_fatal(msg);
780   if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM,   NULL, &afp2) != eslOK)  esl_fatal(msg);
781   if ( esl_msafile_clustal_Read  (afp1, &msa1)                                    != eslOK)  esl_fatal(msg);
782   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                    != eslOK)  esl_fatal(msg);
783   if ( esl_msa_Compare(msa1, msa2)                                                != eslOK)  esl_fatal(msg);
784 
785   if ( esl_msafile_clustal_Read  (afp1, &msa3) != eslEOF) esl_fatal(msg);
786   if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
787 
788   esl_msafile_Close(afp2);
789   esl_msafile_Close(afp1);
790 
791   /* Now write stk to clustal file, and vice versa; then retest */
792   if ( esl_tmpfile_named(alnfile2, &alnfp)                                  != eslOK) esl_fatal(msg);
793   if ( esl_tmpfile_named(stkfile2, &stkfp)                                  != eslOK) esl_fatal(msg);
794   if ( esl_msafile_clustal_Write  (alnfp, msa2, eslMSAFILE_CLUSTAL)         != eslOK) esl_fatal(msg);
795   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)       != eslOK) esl_fatal(msg);
796   fclose(alnfp);
797   fclose(stkfp);
798   if ( esl_msafile_Open(&abc, alnfile2, NULL, eslMSAFILE_CLUSTAL,   NULL, &afp1) != eslOK) esl_fatal(msg);
799   if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
800   if ( esl_msafile_clustal_Read  (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
801   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
802   if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);
803 
804   remove(alnfile2);
805   remove(stkfile2);
806   esl_msafile_Close(afp2);
807   esl_msafile_Close(afp1);
808 
809   esl_msa_Destroy(msa1);
810   esl_msa_Destroy(msa2);
811   esl_msa_Destroy(msa3);
812   esl_msa_Destroy(msa4);
813   esl_alphabet_Destroy(abc);
814 }
815 
816 static void
read_test_msas_text(char * alnfile,char * stkfile)817 read_test_msas_text(char *alnfile, char *stkfile)
818 {
819   char msg[]         = "CLUSTAL msa text-mode read unit test failed";
820   ESL_MSAFILE  *afp1 = NULL;
821   ESL_MSAFILE  *afp2 = NULL;
822   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
823   FILE         *alnfp, *stkfp;
824   char          alnfile2[32] = "esltmpaln2XXXXXX";
825   char          stkfile2[32] = "esltmpstk2XXXXXX";
826 
827   /*                    vvvv-- everything's the same as the digital utest except these NULLs  */
828   if ( esl_msafile_Open(NULL, alnfile, NULL, eslMSAFILE_CLUSTALLIKE, NULL, &afp1) != eslOK)  esl_fatal(msg);
829   if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM,   NULL, &afp2) != eslOK)  esl_fatal(msg);
830   if ( esl_msafile_clustal_Read  (afp1, &msa1)                                    != eslOK)  esl_fatal(msg);
831   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                    != eslOK)  esl_fatal(msg);
832   if ( esl_msa_Compare(msa1, msa2)                                                != eslOK)  esl_fatal(msg);
833   if ( esl_msafile_clustal_Read  (afp1, &msa3)                                    != eslEOF) esl_fatal(msg);
834   if ( esl_msafile_stockholm_Read(afp2, &msa3)                                    != eslEOF) esl_fatal(msg);
835   esl_msafile_Close(afp2);
836   esl_msafile_Close(afp1);
837 
838   if ( esl_tmpfile_named(alnfile2, &alnfp)                                   != eslOK) esl_fatal(msg);
839   if ( esl_tmpfile_named(stkfile2, &stkfp)                                   != eslOK) esl_fatal(msg);
840   if ( esl_msafile_clustal_Write  (alnfp, msa2, eslMSAFILE_CLUSTAL)          != eslOK) esl_fatal(msg);
841   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)        != eslOK) esl_fatal(msg);
842   fclose(alnfp);
843   fclose(stkfp);
844   if ( esl_msafile_Open(NULL, alnfile2, NULL, eslMSAFILE_CLUSTAL,   NULL, &afp1)  != eslOK) esl_fatal(msg);
845   if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2)  != eslOK) esl_fatal(msg);
846   if ( esl_msafile_clustal_Read  (afp1, &msa3)                                    != eslOK) esl_fatal(msg);
847   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                    != eslOK) esl_fatal(msg);
848   if ( esl_msa_Compare(msa3, msa4)                                                != eslOK) esl_fatal(msg);
849 
850   remove(alnfile2);
851   remove(stkfile2);
852   esl_msafile_Close(afp2);
853   esl_msafile_Close(afp1);
854 
855   esl_msa_Destroy(msa1);
856   esl_msa_Destroy(msa2);
857   esl_msa_Destroy(msa3);
858   esl_msa_Destroy(msa4);
859 }
860 #endif /*eslMSAFILE_CLUSTAL_TESTDRIVE*/
861 /*---------------------- end, unit tests ------------------------*/
862 
863 
864 /*****************************************************************
865  * 4. Test driver.
866  *****************************************************************/
867 #ifdef eslMSAFILE_CLUSTAL_TESTDRIVE
868 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_clustal_utest -DeslMSAFILE_CLUSTAL_TESTDRIVE esl_msafile_clustal.c -leasel -lm
869  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_clustal_utest -DeslMSAFILE_CLUSTAL_TESTDRIVE esl_msafile_clustal.c -leasel -lm
870  * run:     ./esl_msafile_clustal_utest
871  */
872 #include "esl_config.h"
873 
874 #include <stdio.h>
875 
876 #include "easel.h"
877 #include "esl_getopts.h"
878 #include "esl_random.h"
879 #include "esl_msafile.h"
880 #include "esl_msafile_clustal.h"
881 
882 static ESL_OPTIONS options[] = {
883    /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
884   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
885   { 0,0,0,0,0,0,0,0,0,0},
886 };
887 static char usage[]  = "[-options]";
888 static char banner[] = "test driver for CLUSTAL MSA format module";
889 
890 int
main(int argc,char ** argv)891 main(int argc, char **argv)
892 {
893   char            msg[]        = "CLUSTAL MSA i/o module test driver failed";
894   ESL_GETOPTS    *go           = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
895   char            alnfile[32] = "esltmpalnXXXXXX";
896   char            stkfile[32] = "esltmpstkXXXXXX";
897   FILE           *alnfp, *stkfp;
898   int             testnumber;
899   int             ngoodtests = 3;
900   char            tmpfile[32];
901   FILE           *ofp;
902   int             expected_format;
903   int             expected_alphatype;
904   int             expected_nseq;
905   int             expected_alen;
906 
907   if ( esl_tmpfile_named(alnfile, &alnfp) != eslOK) esl_fatal(msg);
908   if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
909   write_test_msas(alnfp, stkfp);
910   fclose(alnfp);
911   fclose(stkfp);
912 
913   read_test_msas_digital(alnfile, stkfile);
914   read_test_msas_text   (alnfile, stkfile);
915 
916   /* Various "good" files that should be parsed correctly */
917   for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
918     {
919       strcpy(tmpfile, "esltmpXXXXXX");
920       if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
921       switch (testnumber) {
922       case  1:  utest_write_good1 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
923       case  2:  utest_write_good2 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
924       case  3:  utest_write_good3 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
925       }
926       fclose(ofp);
927       utest_goodfile(tmpfile, testnumber, expected_format, expected_alphatype, expected_nseq, expected_alen);
928       remove(tmpfile);
929     }
930 
931   remove(alnfile);
932   remove(stkfile);
933   esl_getopts_Destroy(go);
934   return 0;
935 }
936 #endif /*eslMSAFILE_CLUSTAL_TESTDRIVE*/
937 /*--------------------- end, test driver ------------------------*/
938 
939 
940 /*****************************************************************
941  * 5. Examples.
942  *****************************************************************/
943 
944 
945 #ifdef eslMSAFILE_CLUSTAL_EXAMPLE
946 /* A full-featured example of reading/writing an MSA in Clustal format(s).
947    gcc -g -Wall -o esl_msafile_clustal_example -I. -L. -DeslMSAFILE_CLUSTAL_EXAMPLE esl_msafile_clustal.c -leasel -lm
948    ./esl_msafile_clustal_example <msafile>
949  */
950 /*::cexcerpt::msafile_clustal_example::begin::*/
951 #include <stdio.h>
952 
953 #include "easel.h"
954 #include "esl_alphabet.h"
955 #include "esl_getopts.h"
956 #include "esl_msa.h"
957 #include "esl_msafile.h"
958 #include "esl_msafile_clustal.h"
959 
960 static ESL_OPTIONS options[] = {
961   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
962   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",        0 },
963   { "-1",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "no autodetection; use CLUSTAL format",        0 },
964   { "-2",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "no autodetection; use CLUSTALLIKE format",    0 },
965   { "-q",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "quieter: don't write msa back, just summary", 0 },
966   { "-t",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",          0 },
967   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is DNA",                0 },
968   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is RNA",                0 },
969   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is protein",            0 },
970   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
971 };
972 static char usage[]  = "[-options] <msafile>";
973 static char banner[] = "example of guessing, reading, writing Clustal formats";
974 
975 int
main(int argc,char ** argv)976 main(int argc, char **argv)
977 {
978   ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
979   char               *filename    = esl_opt_GetArg(go, 1);
980   int                 infmt       = eslMSAFILE_UNKNOWN;
981   ESL_ALPHABET       *abc         = NULL;
982   ESL_MSAFILE        *afp         = NULL;
983   ESL_MSA            *msa         = NULL;
984   int                 status;
985 
986   if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_CLUSTAL;
987   else if (esl_opt_GetBoolean(go, "-2"))      infmt = eslMSAFILE_CLUSTALLIKE;
988 
989   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
990   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
991   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
992 
993   /* Text mode: pass NULL for alphabet.
994    * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
995    */
996   if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
997   else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
998   if (status != eslOK) esl_msafile_OpenFailure(afp, status);
999 
1000   if ( (status = esl_msafile_clustal_Read(afp, &msa)) != eslOK)
1001     esl_msafile_ReadFailure(afp, status);
1002 
1003   printf("format variant: %s\n", esl_msafile_DecodeFormat(afp->format));
1004   printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
1005   printf("# of seqs:      %d\n", msa->nseq);
1006   printf("# of cols:      %d\n", (int) msa->alen);
1007   printf("\n");
1008 
1009   if (! esl_opt_GetBoolean(go, "-q"))
1010     esl_msafile_clustal_Write(stdout, msa, eslMSAFILE_CLUSTAL);
1011 
1012   esl_msa_Destroy(msa);
1013   esl_msafile_Close(afp);
1014   if (abc) esl_alphabet_Destroy(abc);
1015   esl_getopts_Destroy(go);
1016   exit(0);
1017 }
1018 /*::cexcerpt::msafile_clustal_example::end::*/
1019 #endif /*eslMSAFILE_CLUSTAL_EXAMPLE*/
1020 
1021 #ifdef eslMSAFILE_CLUSTAL_EXAMPLE2
1022 /* A minimal example. Read Clustal MSA, in text mode.
1023    gcc -g -Wall -o esl_msafile_clustal_example2 -I. -L. -DeslMSAFILE_CLUSTAL_EXAMPLE2 esl_msafile_clustal.c -leasel -lm
1024    ./esl_msafile_clustal_example2 <msafile>
1025  */
1026 
1027 /*::cexcerpt::msafile_clustal_example2::begin::*/
1028 #include <stdio.h>
1029 
1030 #include "easel.h"
1031 #include "esl_msa.h"
1032 #include "esl_msafile.h"
1033 #include "esl_msafile_clustal.h"
1034 
1035 int
main(int argc,char ** argv)1036 main(int argc, char **argv)
1037 {
1038   char         *filename = argv[1];
1039   int           fmt      = eslMSAFILE_CLUSTAL; /* or eslMSAFILE_CLUSTALLIKE */
1040   ESL_MSAFILE  *afp      = NULL;
1041   ESL_MSA      *msa      = NULL;
1042   int          status;
1043 
1044   if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK)  esl_msafile_OpenFailure(afp, status);
1045   if ( (status = esl_msafile_clustal_Read(afp, &msa))                     != eslOK)  esl_msafile_ReadFailure(afp, status);
1046 
1047   printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
1048 
1049   esl_msafile_clustal_Write(stdout, msa, eslMSAFILE_CLUSTAL);
1050 
1051   esl_msa_Destroy(msa);
1052   esl_msafile_Close(afp);
1053   exit(0);
1054 }
1055 /*::cexcerpt::msafile_clustal_example2::end::*/
1056 #endif /*eslMSAFILE_CLUSTAL_EXAMPLE2*/
1057 /*--------------------- end of example --------------------------*/
1058 
1059