1 /* i/o of multiple sequence alignment files in Clustal-like formats
2 *
3 * Contents:
4 * 1. API for reading/writing Clustal and Clustal-like formats
5 * 2. Internal routines for Clustal formats.
6 * 3. Unit tests.
7 * 4. Test driver.
8 * 5. Example.
9 *
10 * This module is responsible for i/o of both eslMSAFILE_CLUSTAL and
11 * eslMSAFILE_CLUSTALLIKE alignment formats.
12 */
13 #include "esl_config.h"
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <ctype.h>
19
20 #include "easel.h"
21 #include "esl_alphabet.h"
22 #include "esl_mem.h"
23 #include "esl_msa.h"
24 #include "esl_msafile.h"
25
26 #include "esl_msafile_clustal.h"
27
28 static int make_text_consensus_line(const ESL_MSA *msa, char **ret_consline);
29 static int make_digital_consensus_line(const ESL_MSA *msa, char **ret_consline);
30
31 /*****************************************************************
32 *# 1. API for reading/writing Clustal and Clustal-like formats
33 *****************************************************************/
34
35 /* Function: esl_msafile_clustal_SetInmap()
36 * Synopsis: Configure input map for CLUSTAL, CLUSTALLIKE formats.
37 *
38 * Purpose: Set the <afp->inmap> for Clustal-like formats.
39 *
40 * Text mode accepts any <isgraph()> character.
41 * Digital mode enforces the usual Easel alphabets.
42 */
43 int
esl_msafile_clustal_SetInmap(ESL_MSAFILE * afp)44 esl_msafile_clustal_SetInmap(ESL_MSAFILE *afp)
45 {
46 int sym;
47
48 if (afp->abc)
49 {
50 for (sym = 0; sym < 128; sym++)
51 afp->inmap[sym] = afp->abc->inmap[sym];
52 afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
53 }
54
55 if (! afp->abc)
56 {
57 for (sym = 1; sym < 128; sym++)
58 afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
59 afp->inmap[0] = '?';
60 }
61 return eslOK;
62 }
63
64
65 /* Function: esl_msafile_clustal_GuessAlphabet()
66 * Synopsis: Guess the alphabet of an open Clustal MSA input.
67 *
68 * Purpose: Guess the alpbabet of the sequences in open
69 * Clustal format MSA file <afp>.
70 *
71 * On a normal return, <*ret_type> is set to <eslDNA>,
72 * <eslRNA>, or <eslAMINO>, and <afp> is reset to its
73 * original position.
74 *
75 * Args: afp - open Clustal format MSA file
76 * ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
77 *
78 * Returns: <eslOK> on success.
79 * <eslENOALPHABET> if alphabet type can't be determined.
80 * In either case, <afp> is rewound to the position it
81 * started at.
82 *
83 * Throws: <eslEMEM> on allocation error.
84 * <eslESYS> on failures of fread() or other system calls
85 */
86 int
esl_msafile_clustal_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)87 esl_msafile_clustal_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
88 {
89 int alphatype = eslUNKNOWN;
90 esl_pos_t anchor = -1;
91 int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
92 int nsteps = 3;
93 int step = 0;
94 int nres = 0;
95 int x;
96 int64_t ct[26];
97 char *p, *tok;
98 esl_pos_t n, toklen, pos;
99 int status;
100
101 for (x = 0; x < 26; x++) ct[x] = 0;
102
103 anchor = esl_buffer_GetOffset(afp->bf);
104 if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
105
106 /* Ignore the first nonblank line, which says "CLUSTAL W (1.83) multiple sequence alignment" or some such */
107 while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
108 if (status == eslEOF) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "can't determine alphabet: no alignment data found");
109 else if (status != eslOK) goto ERROR;
110
111 while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
112 {
113 if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* ignore blank lines */
114 /* p now points to the rest of the sequence line, after a name */
115
116 /* count characters into ct[] array */
117 for (pos = 0; pos < n; pos++)
118 if (isalpha(p[pos])) {
119 x = toupper(p[pos]) - 'A';
120 ct[x]++;
121 nres++;
122 }
123 /* note that GuessAlphabet() is robust against the optional coord lines
124 * and the annotation lines -- it only counts ascii characters.
125 */
126
127 /* try to stop early, checking after 500, 5000, and 50000 residues: */
128 if (step < nsteps && nres > threshold[step]) {
129 if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
130 step++;
131 }
132 }
133 if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
134 status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
135
136 DONE:
137 esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
138 esl_buffer_RaiseAnchor(afp->bf, anchor);
139 *ret_type = alphatype;
140 return status;
141
142 ERROR:
143 if (anchor != -1) {
144 esl_buffer_SetOffset(afp->bf, anchor);
145 esl_buffer_RaiseAnchor(afp->bf, anchor);
146 }
147 *ret_type = eslUNKNOWN;
148 return status;
149 }
150
151
152 /* Function: esl_msafile_clustal_Read()
153 * Synopsis: Read in a CLUSTAL or CLUSTALLIKE alignment.
154 *
155 * Purpose: Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
156 * for Clustal or Clustal-like format, starting from the
157 * current point. (<afp->format> is expected to be
158 * <eslMSAFILE_CLUSTAL> or <eslMSAFILE_CLUSTALLIKE>.) Create a
159 * new multiple alignment, and return a ptr to that
160 * alignment in <*ret_msa>. Caller is responsible for
161 * free'ing this <ESL_MSA>.
162 *
163 * Args: afp - open <ESL_MSAFILE>
164 * ret_msa - RETURN: newly parsed <ESL_MSA>
165 *
166 * Returns: <eslOK> on success.
167 *
168 * <eslEOF> if no (more) alignment data are found in
169 * <afp>, and <afp> is returned at EOF.
170 *
171 * <eslEFORMAT> on a parse error. <*ret_msa> is set to
172 * <NULL>. <afp> contains information sufficient for
173 * constructing useful diagnostic output:
174 * | <afp->errmsg> | user-directed error message |
175 * | <afp->linenumber> | line # where error was detected |
176 * | <afp->line> | offending line (not NUL-term) |
177 * | <afp->n> | length of offending line |
178 * | <afp->bf->filename> | name of the file |
179 * and <afp> is poised at the start of the following line,
180 * so (in principle) the caller could try to resume
181 * parsing.
182 *
183 * Throws: <eslEMEM> - an allocation failed.
184 * <eslESYS> - a system call such as fread() failed
185 * <eslEINCONCEIVABLE> - "impossible" corruption
186 */
187 int
esl_msafile_clustal_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)188 esl_msafile_clustal_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
189 {
190 ESL_MSA *msa = NULL;
191 char *p = NULL;
192 esl_pos_t n = 0;
193 char *tok = NULL;
194 esl_pos_t ntok = 0;
195 int nblocks = 0;
196 int idx = 0;
197 int nseq = 0;
198 int64_t alen = 0;
199 int64_t cur_alen;
200 esl_pos_t pos;
201 esl_pos_t name_start, name_len;
202 esl_pos_t seq_start, seq_len;
203 esl_pos_t block_seq_start, block_seq_len;
204 int status;
205
206 ESL_DASSERT1( (afp->format == eslMSAFILE_CLUSTAL || afp->format == eslMSAFILE_CLUSTALLIKE) );
207
208 afp->errmsg[0] = '\0';
209
210 if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
211 if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
212
213 /* skip leading blank lines in file */
214 while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
215 if (status != eslOK) goto ERROR; /* includes normal EOF */
216
217 /* That first line says something like: "CLUSTAL W (1.83) multiple sequence alignment" */
218 if (esl_memtok(&p, &n, " \t", &tok, &ntok) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");
219 if (afp->format == eslMSAFILE_CLUSTAL && ! esl_memstrpfx(tok, ntok, "CLUSTAL")) ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");
220 if (! esl_memstrcontains(p, n, "multiple sequence alignment")) ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");
221
222 /* skip blank lines again */
223 do {
224 status = esl_msafile_GetLine(afp, &p, &n);
225 if (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no alignment data following header");
226 else if (status != eslOK) goto ERROR;
227 } while (esl_memspn(afp->line, afp->n, " \t") == afp->n); /* idiom for "blank line" */
228
229 /* Read the file a line at a time. */
230 do { /* afp->line, afp->n is now the first line of a block... */
231 idx = 0;
232 do {
233 for (pos = 0; pos < n; pos++) if (! isspace(p[pos])) break;
234 name_start = pos;
235 for (pos = pos+1; pos < n; pos++) if ( isspace(p[pos])) break;
236 name_len = pos - name_start;
237 for (pos = pos+1; pos < n; pos++) if (! isspace(p[pos])) break;
238 seq_start = pos;
239 if (pos >= n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
240 for (pos = pos+1; pos < n; pos++) if ( isspace(p[pos])) break;
241 seq_len = pos - seq_start; /* expect one block; ignore trailing stuff, inc. optional coords */
242
243 if (idx == 0) {
244 block_seq_start = seq_start;
245 block_seq_len = seq_len;
246 } else {
247 if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
248 if (seq_len != block_seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
249 }
250
251 /* Store the sequence name. */
252 if (nblocks == 0) {
253 /* make sure we have room for another sequence */
254 if (idx >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
255 if ( (status = esl_msa_SetSeqName(msa, idx, p+name_start, name_len)) != eslOK) goto ERROR;
256 nseq++;
257 } else {
258 if (! esl_memstrcmp(p+name_start, name_len, msa->sqname[idx]))
259 ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, p+name_start);
260 }
261
262 /* Append the sequence. */
263 cur_alen = alen;
264 if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &(cur_alen), p+seq_start, seq_len); }
265 if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), p+seq_start, seq_len); }
266 if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
267 else if (status != eslOK) goto ERROR;
268 if (cur_alen - alen != seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");
269
270 /* get next line. if it's a consensus line, we're done with the block */
271 status = esl_msafile_GetLine(afp, &p, &n);
272 if (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "alignment block did not end with consensus line");
273 else if (status != eslOK) goto ERROR;
274
275 idx++;
276 } while (esl_memspn(afp->line, afp->n, " .:*") < afp->n); /* end loop over a block */
277
278 if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
279
280 /* skip blank lines until we find start of next block, or EOF */
281 do {
282 status = esl_msafile_GetLine(afp, &p, &n);
283 if (status == eslEOF) break;
284 else if (status != eslOK) goto ERROR;
285 } while (esl_memspn(p, n, " \t") == n);
286
287 alen += block_seq_len;
288 nblocks++;
289 } while (status == eslOK); /* normal end has status == EOF after last block. */
290
291 msa->nseq = nseq;
292 msa->alen = alen;
293 if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
294 *ret_msa = msa;
295 return eslOK;
296
297 ERROR:
298 if (msa) esl_msa_Destroy(msa);
299 *ret_msa = NULL;
300 return status;
301 }
302
303
304 /* Function: esl_msafile_clustal_Write()
305 * Synopsis: Write a CLUSTAL format alignment file to a stream.
306 *
307 * Purpose: Write alignment <msa> to output stream <fp>, in
308 * format <fmt>. If <fmt> is <eslMSAFILE_CLUSTAL>,
309 * write strict CLUSTAL 2.1 format. If <fmt>
310 * is <eslMSAFILE_CLUSTALLIKE>, put "EASEL (VERSION)"
311 * in the header.
312 *
313 * The alignment is written in blocks of 60 aligned
314 * residues at a time.
315 *
316 * Constructing the CLUSTAL consensus line properly
317 * requires knowing the alphabet. If the <msa> is in text
318 * mode, we don't know the alphabet, so then we use a
319 * simplified consensus line, with '*' marking completely
320 * conserved columns, ' ' on everything else. If the <msa>
321 * is in digital mode and of type <eslAMINO>, then we also
322 * use Clustal's "strong" and "weak" residue group
323 * annotations, ':' and '.'. Strong groups are STA, NEQK,
324 * NHQK, NDEQ, QHRK, MILV, MILF, HY, and FYW. Weak groups
325 * are CSA, ATV, SAG, STNK, STPA, SGND, SNDEQK, NDEQHK,
326 * NEQHRK, FVLIM, and HFY.
327 *
328 * Args: fp - open output stream, writable
329 * msa - alignment to write
330 * fmt - eslMSAFILE_CLUSTAL or eslMSAFILE_CLUSTALLIKE
331 *
332 * Returns: <eslOK> on success.
333 *
334 * Throws: <eslEMEM> on allocation error.
335 * <eslEWRITE> on any system write error, such as filled disk.
336 */
337 int
esl_msafile_clustal_Write(FILE * fp,const ESL_MSA * msa,int fmt)338 esl_msafile_clustal_Write(FILE *fp, const ESL_MSA *msa, int fmt)
339 {
340 int cpl = 60;
341 int maxnamelen = 0;
342 int namelen;
343 char *consline = NULL;
344 char *buf = NULL;
345 int64_t apos;
346 int i;
347 int status;
348
349 ESL_ALLOC(buf, sizeof(char) * (cpl+1));
350 buf[cpl] = '\0';
351 for (i = 0; i < msa->nseq; i++)
352 {
353 namelen = strlen(msa->sqname[i]);
354 maxnamelen = ESL_MAX(namelen, maxnamelen);
355 }
356
357 /* Make a CLUSTAL-like consensus line */
358 if ( msa->abc && (status = make_digital_consensus_line(msa, &consline)) != eslOK) goto ERROR;
359 if (! msa->abc && (status = make_text_consensus_line (msa, &consline)) != eslOK) goto ERROR;
360
361 /* The magic header */
362 if (fmt == eslMSAFILE_CLUSTAL) { if (fprintf(fp, "CLUSTAL 2.1 multiple sequence alignment\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed"); }
363 else if (fmt == eslMSAFILE_CLUSTALLIKE) { if (fprintf(fp, "EASEL (%s) multiple sequence alignment\n", EASEL_VERSION) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed"); }
364
365 /* The alignment */
366 for (apos = 0; apos < msa->alen; apos += cpl)
367 {
368 if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");
369 for (i = 0; i < msa->nseq; i++)
370 {
371 if (msa->abc) esl_abc_TextizeN(msa->abc, msa->ax[i]+apos+1, cpl, buf);
372 if (! msa->abc) strncpy(buf, msa->aseq[i]+apos, cpl);
373 if (fprintf(fp, "%-*s %s\n", maxnamelen, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");
374 }
375 strncpy(buf, consline+apos, cpl);
376 if (fprintf(fp, "%-*s %s\n", maxnamelen, "", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");
377 }
378
379 free(buf);
380 free(consline);
381 return eslOK;
382
383 ERROR:
384 if (buf) free(buf);
385 if (consline) free(consline);
386 return status;
387 }
388 /*---------------- end, Clustal API -----------------------------*/
389
390
391
392 /*****************************************************************
393 * 2. Internal routines for Clustal formats
394 *****************************************************************/
395
396 /* Clustal consensus lines.
397 * '*' : 100% conserved positions
398 * ':' : all residues in column belong to a "strong group"
399 * '.' : all residues in column belong to a "weak group"
400 * ' ' : otherwise
401 *
402 * Gap characters count, and ambiguity codes are interpreted verbatim,
403 * so even a single gap or ambiguity code makes the column a ' '.
404 *
405 * From examining the source code for ClustalW (as it writes its
406 * "self explanatory format", ahem!):
407 * strong groups = STA, NEQK, NHQK, NDEQ, QHRK, MILV, MILF,
408 * HY, FYW
409 * weak groups = CSA, ATV, SAG, STNK, STPA, SGND, SNDEQK,
410 * NDEQHK, NEQHRK, FVLIM, HFY
411 *
412 * These groups only apply to protein data, and therefore only to
413 * digital alignments using an <eslAMINO> alphabet.
414
415 * Calculating the consensus line can be compute-intensive, for large
416 * alignments. A naive implementation (for each column, collect
417 * residue counts, compare to each conservation group) was judged too
418 * slow: 16.2s to write the Pkinase full alignment, compared to 1.5s
419 * to write Stockholm format [SRE:J8/22]. Here we use a slightly less
420 * naive implementation, which collects a bit vector (one bit per
421 * residue) for each column, and traverses the alignment in stride
422 * (sequences, then columns). Writing Clustal format Pkinase now takes
423 * 2.3s, and most of the difference w.r.t. Stockholm is now assignable
424 * to the smaller width (thus greater number of blocks) written for
425 * Clustal (60 cpl vs 200) rather than to consensus construction.
426 *
427 * An oversophisticated approach could use a finite
428 * automaton to store all groups in one machine, then to use the FA to
429 * process each residue seen in a column; for most columns, we would
430 * quickly reach a rejection state (most columns don't belong to
431 * a conservation group, especially in large alignments). For a sketch
432 * of how to construct and use such an automaton, xref SRE:J8/22.
433 * I decided this was probably overkill, and didn't implement it.
434 */
435
436
437 /* make_text_consensus_line()
438 *
439 * Given a text mode <msa>, allocate and create a CLUSTAL-style
440 * consensus line; return it in <*ret_consline>. Caller is responsible
441 * for free'ing this string.
442 *
443 * In text mode, we don't know the alphabet; in particular, we can't
444 * know if the data are amino acids, so we don't know if it's
445 * appropriate to use the amino acid group codes. So we don't;
446 * in text mode, only '*' and ' ' appear in consensus lines.
447 *
448 * The consensus line is numbered 0..alen-1, and is NUL-terminated.
449 *
450 * Returns <eslOK> on success.
451 * No normal failure codes.
452 * Throws <eslEMEM> on allocation error.
453 */
454 static int
make_text_consensus_line(const ESL_MSA * msa,char ** ret_consline)455 make_text_consensus_line(const ESL_MSA *msa, char **ret_consline)
456 {
457 char *consline = NULL;
458 uint32_t *v = NULL;
459 uint32_t tmpv, maxv;
460 int n;
461 int idx, apos, x;
462 int status;
463
464 ESL_ALLOC(consline, sizeof(char) * (msa->alen+1));
465 ESL_ALLOC(v, sizeof(uint32_t) * (msa->alen));
466 for (apos = 0; apos < msa->alen; apos++)
467 v[apos] = 0;
468
469 for (idx = 0; idx < msa->nseq; idx++)
470 for (apos = 0; apos < msa->alen; apos++)
471 {
472 x = toupper(msa->aseq[idx][apos]) - 'A';
473 if (x >= 0 && x < 26) v[apos] |= (1 << x);
474 else v[apos] |= (1 << 26);
475 }
476 maxv = (1 << 26) - 1;
477
478 for (apos = 0; apos < msa->alen; apos++)
479 {
480 for (n = 0, tmpv = v[apos]; tmpv; n++) tmpv &= tmpv-1; /* Kernighan magic: count # of bits set in tmpv */
481 consline[apos] = ((n == 1 && v[apos] < maxv) ? '*' : ' ');
482 }
483 consline[msa->alen] = '\0';
484
485 *ret_consline = consline;
486 free(v);
487 return eslOK;
488
489 ERROR:
490 if (v) free(v);
491 if (consline) free(consline);
492 *ret_consline = NULL;
493 return status;
494 }
495
496
497 /* make_digital_consensus_line()
498 *
499 * Exactly the same as make_text_consensus_line(), except for
500 * digital mode <msa>.
501 */
502 static int
matches_group_digital(ESL_ALPHABET * abc,uint32_t v,char * group)503 matches_group_digital(ESL_ALPHABET *abc, uint32_t v, char *group)
504 {
505 uint32_t gv = 0;
506 ESL_DSQ sym;
507 char *c;
508
509 for (c = group; *c; c++) {
510 sym = esl_abc_DigitizeSymbol(abc, *c);
511 gv |= (1 << sym);
512 }
513 return ( ((v & gv) == v) ? TRUE : FALSE);
514 }
515
516 static int
make_digital_consensus_line(const ESL_MSA * msa,char ** ret_consline)517 make_digital_consensus_line(const ESL_MSA *msa, char **ret_consline)
518 {
519 char *consline = NULL;
520 uint32_t *v = NULL;
521 uint32_t tmpv, maxv;
522 int n;
523 int idx, apos;
524 int status;
525
526 /* if this ever becomes a problem, easy enough to make v a uint64_t to get up to Kp<=64 */
527 if (msa->abc->Kp > 32) ESL_EXCEPTION(eslEINVAL, "Clustal format writer cannot handle digital alphabets of Kp>32 residues");
528
529 ESL_ALLOC(v, sizeof(uint32_t) * (msa->alen+1));
530 ESL_ALLOC(consline, sizeof(char) * (msa->alen+1));
531 for (apos = 0; apos <= msa->alen; apos++)
532 v[apos] = 0;
533
534 for (idx = 0; idx < msa->nseq; idx++)
535 for (apos = 1; apos <= msa->alen; apos++)
536 v[apos] |= (1 << msa->ax[idx][apos]);
537
538 maxv = (1 << msa->abc->K) - 1; /* maxv: has all canonical residue bits set */
539
540 for (apos = 1; apos <= msa->alen; apos++)
541 {
542 consline[apos-1] = ' ';
543
544 for (n = 0, tmpv = v[apos]; tmpv; n++) tmpv &= tmpv-1; /* Kernighan magic: count # of bits set in tmpv */
545
546 if (n == 0 || n > 6) continue; /* n==0 shouldn't happen; n > 6 means too many different residues seen */
547 else if (v[apos] > maxv) continue; /* gap or ambiguity chars seen; column must be left unannotated */
548 else if (n == 1) consline[apos-1] = '*'; /* complete conservation of a canonical residue */
549 else if (msa->abc->type == eslAMINO)
550 {
551 if (matches_group_digital(msa->abc, v[apos], "STA")) consline[apos-1] = ':';
552 else if (matches_group_digital(msa->abc, v[apos], "NEQK")) consline[apos-1] = ':';
553 else if (matches_group_digital(msa->abc, v[apos], "NHQK")) consline[apos-1] = ':';
554 else if (matches_group_digital(msa->abc, v[apos], "NDEQ")) consline[apos-1] = ':';
555 else if (matches_group_digital(msa->abc, v[apos], "QHRK")) consline[apos-1] = ':';
556 else if (matches_group_digital(msa->abc, v[apos], "MILV")) consline[apos-1] = ':';
557 else if (matches_group_digital(msa->abc, v[apos], "MILF")) consline[apos-1] = ':';
558 else if (matches_group_digital(msa->abc, v[apos], "HY")) consline[apos-1] = ':';
559 else if (matches_group_digital(msa->abc, v[apos], "FYW")) consline[apos-1] = ':';
560
561 else if (matches_group_digital(msa->abc, v[apos], "CSA")) consline[apos-1] = '.';
562 else if (matches_group_digital(msa->abc, v[apos], "ATV")) consline[apos-1] = '.';
563 else if (matches_group_digital(msa->abc, v[apos], "SAG")) consline[apos-1] = '.';
564 else if (matches_group_digital(msa->abc, v[apos], "STNK")) consline[apos-1] = '.';
565 else if (matches_group_digital(msa->abc, v[apos], "STPA")) consline[apos-1] = '.';
566 else if (matches_group_digital(msa->abc, v[apos], "SGND")) consline[apos-1] = '.';
567 else if (matches_group_digital(msa->abc, v[apos], "SNDEQK")) consline[apos-1] = '.';
568 else if (matches_group_digital(msa->abc, v[apos], "NDEQHK")) consline[apos-1] = '.';
569 else if (matches_group_digital(msa->abc, v[apos], "NEQHRK")) consline[apos-1] = '.';
570 else if (matches_group_digital(msa->abc, v[apos], "FVLIM")) consline[apos-1] = '.';
571 else if (matches_group_digital(msa->abc, v[apos], "HFY")) consline[apos-1] = '.';
572 }
573 }
574 consline[apos-1] = '\0';
575
576 *ret_consline = consline;
577 free(v);
578 return eslOK;
579
580 ERROR:
581 if (v) free(v);
582 if (consline) free(consline);
583 *ret_consline = NULL;
584 return eslOK;
585 }
586 /*-------------- end, internal clustal routines -----------------*/
587
588
589 /*****************************************************************
590 * 3. Unit tests.
591 *****************************************************************/
592 #ifdef eslMSAFILE_CLUSTAL_TESTDRIVE
593
594 static void
utest_write_good1(FILE * ofp,int * ret_format,int * ret_alphatype,int * ret_nseq,int * ret_alen)595 utest_write_good1(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
596 {
597 fputs("MUSCLE (3.7) multiple sequence alignment\n", ofp);
598 fputs("\n", ofp);
599 fputs("\n", ofp);
600 fputs("MYG_PHYCA --------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\n", ofp);
601 fputs("GLB5_PETMA PIVDTGSVAPLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
602 fputs("HBB_HUMAN --------VHLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\n", ofp);
603 fputs("HBA_HUMAN --------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\n", ofp);
604 fputs(" . *: :. : *. * * : * .:: * : * * \n", ofp);
605 fputs("\n", ofp);
606 fputs("MYG_PHYCA EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEAELKPLAQSHATKHKIPIKYLEFIS\n", ofp);
607 fputs("GLB5_PETMA ADQLKKSADVRWHAERIINAVNDAVASMDDTEKMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
608 fputs("HBB_HUMAN PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\n", ofp);
609 fputs("HBA_HUMAN -DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\n", ofp);
610 fputs(" \n", ofp); /* deliberately made blank */
611 fputs("\n", ofp);
612 fputs("MYG_PHYCA EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
613 fputs("GLB5_PETMA AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
614 fputs("HBB_HUMAN NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
615 fputs("HBA_HUMAN HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
616 fputs(" : : . .. :* : . : * \n", ofp);
617 fputs("\n", ofp);
618
619 *ret_format = eslMSAFILE_CLUSTALLIKE;
620 *ret_alphatype = eslAMINO;
621 *ret_nseq = 4;
622 *ret_alen = 165;
623 }
624
625 static void
utest_write_good2(FILE * ofp,int * ret_format,int * ret_alphatype,int * ret_nseq,int * ret_alen)626 utest_write_good2(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
627 {
628 fputs("CLUSTAL W (1.81) multiple sequence alignment\n", ofp);
629 fputs("\n", ofp);
630 fputs("tRNA2 UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGG-AGACCGGGGUUCGACU\n", ofp);
631 fputs("tRNA3 UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCCAGAUCGGGGUUCAAUU\n", ofp);
632 fputs("tRNA5 GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGAGGUCAUCGGUUCGAUU\n", ofp);
633 fputs("tRNA1 GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUC\n", ofp);
634 fputs("tRNA4 GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUUGGUCCUUAGUUCGAUC\n", ofp);
635 fputs(" * * * * * * * **** * \n", ofp);
636 fputs("\n", ofp);
637 fputs("tRNA2 CCCCGUAUCGGAG\n", ofp);
638 fputs("tRNA3 CCCCGUCGCGGAG\n", ofp);
639 fputs("tRNA5 CCGGUUGCGUCCA\n", ofp);
640 fputs("tRNA1 CACAGAAUUCGCA\n", ofp);
641 fputs("tRNA4 CUGAGUGCGAGCU\n", ofp);
642 fputs(" * \n", ofp);
643
644 *ret_format = eslMSAFILE_CLUSTAL;
645 *ret_alphatype = eslRNA;
646 *ret_nseq = 5;
647 *ret_alen = 73;
648 }
649
650 /* An example of clustal format with optional sequence coords;
651 * a quickly-taken subset of a larger alignment reported as a bug.
652 */
653 static void
utest_write_good3(FILE * ofp,int * ret_format,int * ret_alphatype,int * ret_nseq,int * ret_alen)654 utest_write_good3(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
655 {
656 fputs("CLUSTAL 2.1 multiple sequence alignment\n", ofp);
657 fputs("\n", ofp);
658 fputs("gi|85091828|ref|XP_959093.1| MSDFTSKVKVLRDGQKPEFPSN----ANTLEYAQSLDAQDELRHFRNEFI 46\n", ofp);
659 fputs("gi|70993990|ref|XP_751842.1| ----MSTNGTLS---KPEFPAN----AASKEYAASLDAADPFAGFREKFI 39\n", ofp);
660 fputs("gi|71001376|ref|XP_755369.1| ---MGSRLHVQVIHGGPPLPYKDDIRAFGKEYAEQLDAQDPLRRFRDEFI 47\n", ofp);
661 fputs("gi|71744026|ref|XP_803513.1| -----------------------------------MDRNDPLQVHRDAFN 15\n", ofp);
662 fputs(" : : \n", ofp);
663 fputs("\n", ofp);
664 fputs("gi|85091828|ref|XP_959093.1| IPTRASLKKKALDGI--------------IPGTQANGTTTSTDADTPCIY 82\n", ofp);
665 fputs("gi|70993990|ref|XP_751842.1| IPSKANIASTKLA----------------KPGLSSE----------PCIY 63\n", ofp);
666 fputs("gi|71001376|ref|XP_755369.1| IPSKKDLKRKTLFPNDGMYSCGHPICFANTSCACVHAAETEETSDEKCIY 97\n", ofp);
667 fputs("gi|71744026|ref|XP_803513.1| IPKRRDGS--------------------------------------DHVY 27\n", ofp);
668 fputs(" *\n", ofp);
669
670 *ret_format = eslMSAFILE_CLUSTAL;
671 *ret_alphatype = eslAMINO;
672 *ret_nseq = 4;
673 *ret_alen = 100;
674 }
675
676
677 static void
utest_goodfile(char * filename,int testnumber,int expected_format,int expected_alphatype,int expected_nseq,int expected_alen)678 utest_goodfile(char *filename, int testnumber, int expected_format, int expected_alphatype, int expected_nseq, int expected_alen)
679 {
680 ESL_ALPHABET *abc = NULL;
681 ESL_MSAFILE *afp = NULL;
682 ESL_MSA *msa1 = NULL;
683 ESL_MSA *msa2 = NULL;
684 char tmpfile1[32] = "esltmpXXXXXX";
685 char tmpfile2[32] = "esltmpXXXXXX";
686 FILE *ofp = NULL;
687 int status;
688
689 /* guessing both the format and the alphabet should work: this is a digital open */
690 if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: digital open", testnumber);
691 if (afp->format != expected_format) esl_fatal("clustal good file test %d failed: format autodetection", testnumber);
692 if (abc->type != expected_alphatype) esl_fatal("clustal good file test %d failed: alphabet autodetection", testnumber);
693
694 /* This is a digital read, using <abc>. */
695 if ( (status = esl_msafile_clustal_Read(afp, &msa1)) != eslOK) esl_fatal("clustal good file test %d failed: msa read, digital", testnumber);
696 if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("clustal good file test %d failed: nseq/alen", testnumber);
697 if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("clustal good file test %d failed: msa1 invalid", testnumber);
698 esl_msafile_Close(afp);
699
700 /* write it back out to a new tmpfile (digital write) */
701 if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("clustal good file test %d failed: tmpfile creation", testnumber);
702 if ( (status = esl_msafile_clustal_Write(ofp, msa1, expected_format)) != eslOK) esl_fatal("clustal good file test %d failed: msa write, digital", testnumber);
703 fclose(ofp);
704
705 /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
706 if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, expected_format, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: text mode open", testnumber);
707 if ( (status = esl_msafile_clustal_Read(afp, &msa2)) != eslOK) esl_fatal("clustal good file test %d failed: msa read, text", testnumber);
708 if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("clustal good file test %d failed: nseq/alen", testnumber);
709 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("clustal good file test %d failed: msa2 invalid", testnumber);
710 esl_msafile_Close(afp);
711
712 /* write it back out to a new tmpfile (text write) */
713 if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("clustal good file test %d failed: tmpfile creation", testnumber);
714 if ( (status = esl_msafile_clustal_Write(ofp, msa2, expected_format)) != eslOK) esl_fatal("clustal good file test %d failed: msa write, text", testnumber);
715 fclose(ofp);
716 esl_msa_Destroy(msa2);
717
718 /* open and read it in digital mode */
719 if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, expected_format, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: 2nd digital mode open", testnumber);
720 if ( (status = esl_msafile_clustal_Read(afp, &msa2)) != eslOK) esl_fatal("clustal good file test %d failed: 2nd digital msa read", testnumber);
721 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("clustal good file test %d failed: msa2 invalid", testnumber);
722 esl_msafile_Close(afp);
723
724 /* this msa <msa2> should be identical to <msa1> */
725 if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("clustal good file test %d failed: msa compare", testnumber);
726
727 remove(tmpfile1);
728 remove(tmpfile2);
729 esl_msa_Destroy(msa1);
730 esl_msa_Destroy(msa2);
731 esl_alphabet_Destroy(abc);
732 }
733
734 static void
write_test_msas(FILE * ofp1,FILE * ofp2)735 write_test_msas(FILE *ofp1, FILE *ofp2)
736 {
737 fprintf(ofp1, "EASEL (X.x) multiple sequence alignment\n");
738 fprintf(ofp1, "\n");
739 fprintf(ofp1, "seq1 ..acdefghiklmnpqrstvwy\n");
740 fprintf(ofp1, "seq2 ..acdefghiklmnpqrstv--\n");
741 fprintf(ofp1, "seq3 aaacdefghiklmnpqrstv--\n");
742 fprintf(ofp1, "seq4 ..acdefghiklmnpqrstvwy\n");
743 fprintf(ofp1, " ****************** \n");
744 fprintf(ofp1, "\n");
745 fprintf(ofp1, "seq1 ACDEFGHIKLMNPQRSTVWY\n");
746 fprintf(ofp1, "seq2 ACDEFGHIKLMNPQRSTVWY\n");
747 fprintf(ofp1, "seq3 ACDEFGHIKLMNPQRSTVWY\n");
748 fprintf(ofp1, "seq4 ACDEFGHIKLMNPQRSTVWY\n");
749 fprintf(ofp1, " ********************\n");
750 fprintf(ofp1, "\n");
751 fprintf(ofp1, "seq1 ..\n");
752 fprintf(ofp1, "seq2 YY\n");
753 fprintf(ofp1, "seq3 ..\n");
754 fprintf(ofp1, "seq4 ..\n");
755 fprintf(ofp1, "\n");
756
757 fprintf(ofp2, "# STOCKHOLM 1.0\n");
758 fprintf(ofp2, "\n");
759 fprintf(ofp2, "seq1 ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
760 fprintf(ofp2, "seq2 ..acdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWYYY\n");
761 fprintf(ofp2, "seq3 aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
762 fprintf(ofp2, "seq4 ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
763 fprintf(ofp2, "//\n");
764 }
765
766 static void
read_test_msas_digital(char * alnfile,char * stkfile)767 read_test_msas_digital(char *alnfile, char *stkfile)
768 {
769 char msg[] = "CLUSTAL msa digital read unit test failed";
770 ESL_ALPHABET *abc = NULL;
771 ESL_MSAFILE *afp1 = NULL;
772 ESL_MSAFILE *afp2 = NULL;
773 ESL_MSA *msa1, *msa2, *msa3, *msa4;
774 FILE *alnfp, *stkfp;
775 char alnfile2[32] = "esltmpaln2XXXXXX";
776 char stkfile2[32] = "esltmpstk2XXXXXX";
777
778 if ( esl_msafile_Open(&abc, alnfile, NULL, eslMSAFILE_CLUSTALLIKE, NULL, &afp1) != eslOK) esl_fatal(msg);
779 if ( !abc || abc->type != eslAMINO) esl_fatal(msg);
780 if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
781 if ( esl_msafile_clustal_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
782 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
783 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
784
785 if ( esl_msafile_clustal_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
786 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
787
788 esl_msafile_Close(afp2);
789 esl_msafile_Close(afp1);
790
791 /* Now write stk to clustal file, and vice versa; then retest */
792 if ( esl_tmpfile_named(alnfile2, &alnfp) != eslOK) esl_fatal(msg);
793 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
794 if ( esl_msafile_clustal_Write (alnfp, msa2, eslMSAFILE_CLUSTAL) != eslOK) esl_fatal(msg);
795 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
796 fclose(alnfp);
797 fclose(stkfp);
798 if ( esl_msafile_Open(&abc, alnfile2, NULL, eslMSAFILE_CLUSTAL, NULL, &afp1) != eslOK) esl_fatal(msg);
799 if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
800 if ( esl_msafile_clustal_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
801 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
802 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
803
804 remove(alnfile2);
805 remove(stkfile2);
806 esl_msafile_Close(afp2);
807 esl_msafile_Close(afp1);
808
809 esl_msa_Destroy(msa1);
810 esl_msa_Destroy(msa2);
811 esl_msa_Destroy(msa3);
812 esl_msa_Destroy(msa4);
813 esl_alphabet_Destroy(abc);
814 }
815
816 static void
read_test_msas_text(char * alnfile,char * stkfile)817 read_test_msas_text(char *alnfile, char *stkfile)
818 {
819 char msg[] = "CLUSTAL msa text-mode read unit test failed";
820 ESL_MSAFILE *afp1 = NULL;
821 ESL_MSAFILE *afp2 = NULL;
822 ESL_MSA *msa1, *msa2, *msa3, *msa4;
823 FILE *alnfp, *stkfp;
824 char alnfile2[32] = "esltmpaln2XXXXXX";
825 char stkfile2[32] = "esltmpstk2XXXXXX";
826
827 /* vvvv-- everything's the same as the digital utest except these NULLs */
828 if ( esl_msafile_Open(NULL, alnfile, NULL, eslMSAFILE_CLUSTALLIKE, NULL, &afp1) != eslOK) esl_fatal(msg);
829 if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
830 if ( esl_msafile_clustal_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
831 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
832 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
833 if ( esl_msafile_clustal_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
834 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
835 esl_msafile_Close(afp2);
836 esl_msafile_Close(afp1);
837
838 if ( esl_tmpfile_named(alnfile2, &alnfp) != eslOK) esl_fatal(msg);
839 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
840 if ( esl_msafile_clustal_Write (alnfp, msa2, eslMSAFILE_CLUSTAL) != eslOK) esl_fatal(msg);
841 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
842 fclose(alnfp);
843 fclose(stkfp);
844 if ( esl_msafile_Open(NULL, alnfile2, NULL, eslMSAFILE_CLUSTAL, NULL, &afp1) != eslOK) esl_fatal(msg);
845 if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
846 if ( esl_msafile_clustal_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
847 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
848 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
849
850 remove(alnfile2);
851 remove(stkfile2);
852 esl_msafile_Close(afp2);
853 esl_msafile_Close(afp1);
854
855 esl_msa_Destroy(msa1);
856 esl_msa_Destroy(msa2);
857 esl_msa_Destroy(msa3);
858 esl_msa_Destroy(msa4);
859 }
860 #endif /*eslMSAFILE_CLUSTAL_TESTDRIVE*/
861 /*---------------------- end, unit tests ------------------------*/
862
863
864 /*****************************************************************
865 * 4. Test driver.
866 *****************************************************************/
867 #ifdef eslMSAFILE_CLUSTAL_TESTDRIVE
868 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_clustal_utest -DeslMSAFILE_CLUSTAL_TESTDRIVE esl_msafile_clustal.c -leasel -lm
869 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_clustal_utest -DeslMSAFILE_CLUSTAL_TESTDRIVE esl_msafile_clustal.c -leasel -lm
870 * run: ./esl_msafile_clustal_utest
871 */
872 #include "esl_config.h"
873
874 #include <stdio.h>
875
876 #include "easel.h"
877 #include "esl_getopts.h"
878 #include "esl_random.h"
879 #include "esl_msafile.h"
880 #include "esl_msafile_clustal.h"
881
882 static ESL_OPTIONS options[] = {
883 /* name type default env range togs reqs incomp help docgrp */
884 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
885 { 0,0,0,0,0,0,0,0,0,0},
886 };
887 static char usage[] = "[-options]";
888 static char banner[] = "test driver for CLUSTAL MSA format module";
889
890 int
main(int argc,char ** argv)891 main(int argc, char **argv)
892 {
893 char msg[] = "CLUSTAL MSA i/o module test driver failed";
894 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
895 char alnfile[32] = "esltmpalnXXXXXX";
896 char stkfile[32] = "esltmpstkXXXXXX";
897 FILE *alnfp, *stkfp;
898 int testnumber;
899 int ngoodtests = 3;
900 char tmpfile[32];
901 FILE *ofp;
902 int expected_format;
903 int expected_alphatype;
904 int expected_nseq;
905 int expected_alen;
906
907 if ( esl_tmpfile_named(alnfile, &alnfp) != eslOK) esl_fatal(msg);
908 if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
909 write_test_msas(alnfp, stkfp);
910 fclose(alnfp);
911 fclose(stkfp);
912
913 read_test_msas_digital(alnfile, stkfile);
914 read_test_msas_text (alnfile, stkfile);
915
916 /* Various "good" files that should be parsed correctly */
917 for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
918 {
919 strcpy(tmpfile, "esltmpXXXXXX");
920 if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
921 switch (testnumber) {
922 case 1: utest_write_good1 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
923 case 2: utest_write_good2 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
924 case 3: utest_write_good3 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
925 }
926 fclose(ofp);
927 utest_goodfile(tmpfile, testnumber, expected_format, expected_alphatype, expected_nseq, expected_alen);
928 remove(tmpfile);
929 }
930
931 remove(alnfile);
932 remove(stkfile);
933 esl_getopts_Destroy(go);
934 return 0;
935 }
936 #endif /*eslMSAFILE_CLUSTAL_TESTDRIVE*/
937 /*--------------------- end, test driver ------------------------*/
938
939
940 /*****************************************************************
941 * 5. Examples.
942 *****************************************************************/
943
944
945 #ifdef eslMSAFILE_CLUSTAL_EXAMPLE
946 /* A full-featured example of reading/writing an MSA in Clustal format(s).
947 gcc -g -Wall -o esl_msafile_clustal_example -I. -L. -DeslMSAFILE_CLUSTAL_EXAMPLE esl_msafile_clustal.c -leasel -lm
948 ./esl_msafile_clustal_example <msafile>
949 */
950 /*::cexcerpt::msafile_clustal_example::begin::*/
951 #include <stdio.h>
952
953 #include "easel.h"
954 #include "esl_alphabet.h"
955 #include "esl_getopts.h"
956 #include "esl_msa.h"
957 #include "esl_msafile.h"
958 #include "esl_msafile_clustal.h"
959
960 static ESL_OPTIONS options[] = {
961 /* name type default env range toggles reqs incomp help docgroup*/
962 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
963 { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "no autodetection; use CLUSTAL format", 0 },
964 { "-2", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "no autodetection; use CLUSTALLIKE format", 0 },
965 { "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
966 { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
967 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
968 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
969 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
970 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
971 };
972 static char usage[] = "[-options] <msafile>";
973 static char banner[] = "example of guessing, reading, writing Clustal formats";
974
975 int
main(int argc,char ** argv)976 main(int argc, char **argv)
977 {
978 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
979 char *filename = esl_opt_GetArg(go, 1);
980 int infmt = eslMSAFILE_UNKNOWN;
981 ESL_ALPHABET *abc = NULL;
982 ESL_MSAFILE *afp = NULL;
983 ESL_MSA *msa = NULL;
984 int status;
985
986 if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_CLUSTAL;
987 else if (esl_opt_GetBoolean(go, "-2")) infmt = eslMSAFILE_CLUSTALLIKE;
988
989 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
990 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
991 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
992
993 /* Text mode: pass NULL for alphabet.
994 * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
995 */
996 if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
997 else status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
998 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
999
1000 if ( (status = esl_msafile_clustal_Read(afp, &msa)) != eslOK)
1001 esl_msafile_ReadFailure(afp, status);
1002
1003 printf("format variant: %s\n", esl_msafile_DecodeFormat(afp->format));
1004 printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
1005 printf("# of seqs: %d\n", msa->nseq);
1006 printf("# of cols: %d\n", (int) msa->alen);
1007 printf("\n");
1008
1009 if (! esl_opt_GetBoolean(go, "-q"))
1010 esl_msafile_clustal_Write(stdout, msa, eslMSAFILE_CLUSTAL);
1011
1012 esl_msa_Destroy(msa);
1013 esl_msafile_Close(afp);
1014 if (abc) esl_alphabet_Destroy(abc);
1015 esl_getopts_Destroy(go);
1016 exit(0);
1017 }
1018 /*::cexcerpt::msafile_clustal_example::end::*/
1019 #endif /*eslMSAFILE_CLUSTAL_EXAMPLE*/
1020
1021 #ifdef eslMSAFILE_CLUSTAL_EXAMPLE2
1022 /* A minimal example. Read Clustal MSA, in text mode.
1023 gcc -g -Wall -o esl_msafile_clustal_example2 -I. -L. -DeslMSAFILE_CLUSTAL_EXAMPLE2 esl_msafile_clustal.c -leasel -lm
1024 ./esl_msafile_clustal_example2 <msafile>
1025 */
1026
1027 /*::cexcerpt::msafile_clustal_example2::begin::*/
1028 #include <stdio.h>
1029
1030 #include "easel.h"
1031 #include "esl_msa.h"
1032 #include "esl_msafile.h"
1033 #include "esl_msafile_clustal.h"
1034
1035 int
main(int argc,char ** argv)1036 main(int argc, char **argv)
1037 {
1038 char *filename = argv[1];
1039 int fmt = eslMSAFILE_CLUSTAL; /* or eslMSAFILE_CLUSTALLIKE */
1040 ESL_MSAFILE *afp = NULL;
1041 ESL_MSA *msa = NULL;
1042 int status;
1043
1044 if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
1045 if ( (status = esl_msafile_clustal_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status);
1046
1047 printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
1048
1049 esl_msafile_clustal_Write(stdout, msa, eslMSAFILE_CLUSTAL);
1050
1051 esl_msa_Destroy(msa);
1052 esl_msafile_Close(afp);
1053 exit(0);
1054 }
1055 /*::cexcerpt::msafile_clustal_example2::end::*/
1056 #endif /*eslMSAFILE_CLUSTAL_EXAMPLE2*/
1057 /*--------------------- end of example --------------------------*/
1058
1059