1 /* i/o of multiple sequence alignment files in "aligned FASTA" format
2  *
3  * Contents:
4  *   1. API for reading/writing AFA format
5  *   2. Unit tests.
6  *   3. Test driver.
7  *   4. Examples.
8  */
9 #include "esl_config.h"
10 
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <ctype.h>
15 
16 #include "easel.h"
17 #include "esl_alphabet.h"
18 #include "esl_mem.h"
19 #include "esl_msa.h"
20 #include "esl_msafile.h"
21 
22 #include "esl_msafile_afa.h"
23 
24 /*****************************************************************
25  *# 1. API for reading/writing AFA format
26  *****************************************************************/
27 
28 /* Function:  esl_msafile_afa_SetInmap()
29  * Synopsis:  Set input map for aligned FASTA format.
30  *
31  * Purpose:   Set the <afp->inmap> for aligned FASTA format.
32  *
33  *            Text mode accepts any <isgraph()> character.
34  *            Digital mode enforces the usual Easel alphabets.
35  *
36  *            We skip spaces in input lines of aligned FASTA format;
37  *            map ' ' to <eslDSQ_IGNORED>.
38  */
39 int
esl_msafile_afa_SetInmap(ESL_MSAFILE * afp)40 esl_msafile_afa_SetInmap(ESL_MSAFILE *afp)
41 {
42   int sym;
43 
44   if (afp->abc)
45     {
46       for (sym = 0; sym < 128; sym++)
47 	afp->inmap[sym] = afp->abc->inmap[sym];
48       afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
49     }
50 
51   if (! afp->abc)
52     {
53       for (sym = 1; sym < 128; sym++)
54 	afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
55       afp->inmap[0]   = '?';
56     }
57 
58   afp->inmap[' '] = eslDSQ_IGNORED;
59   return eslOK;
60 }
61 
62 
63 /* Function:  esl_msafile_afa_GuessAlphabet()
64  * Synopsis:  Guess the alphabet of an open AFA MSA file.
65  *
66  * Purpose:   Guess the alpbabet of the sequences in open
67  *            AFA format MSA file <afp>.
68  *
69  *            On a normal return, <*ret_type> is set to <eslDNA>,
70  *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
71  *            original position.
72  *
73  * Args:      afp      - open AFA format MSA file
74  *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
75  *
76  * Returns:   <eslOK> on success.
77  *            <eslENOALPHABET> if alphabet type can't be determined.
78  *            In either case, <afp> is rewound to the position it
79  *            started at.
80  *
81  * Throws:    <eslEMEM> on allocation error.
82  *            <eslESYS> on failures of fread() or other system calls
83  *
84  * Note:      Essentially identical to <esl_msafile_a2m_GuessAlphabet()>,
85  *            but we provide both versions because design calls for
86  *            modularity/separability of parsers.
87  */
88 int
esl_msafile_afa_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)89 esl_msafile_afa_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
90 {
91   int       alphatype     = eslUNKNOWN;
92   esl_pos_t anchor        = -1;
93   int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
94   int       nsteps        = 3;
95   int       step          = 0;
96   int       nres          = 0;
97   int       x;
98   int64_t   ct[26];
99   char     *p;
100   esl_pos_t n, pos;
101   int       status;
102 
103   for (x = 0; x < 26; x++) ct[x] = 0;
104 
105   anchor = esl_buffer_GetOffset(afp->bf);
106   if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
107 
108   while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
109     {
110       while (n && isspace(*p)) { p++; n--; }
111       if    (!n || *p == '>') continue;
112 
113       for (pos = 0; pos < n; pos++)
114 	if (isalpha(p[pos])) {
115 	  x = toupper(p[pos]) - 'A';
116 	  ct[x]++;
117 	  nres++;
118 	}
119 
120       /* try to stop early, checking after 500, 5000, and 50000 residues: */
121       if (step < nsteps && nres > threshold[step]) {
122 	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
123 	step++;
124       }
125     }
126   if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
127   status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
128 
129   /* deliberate flowthrough...*/
130  DONE:
131   esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
132   esl_buffer_RaiseAnchor(afp->bf, anchor);
133   *ret_type = alphatype;
134   return status;
135 
136  ERROR:
137   if (anchor != -1) {
138     esl_buffer_SetOffset(afp->bf, anchor);
139     esl_buffer_RaiseAnchor(afp->bf, anchor);
140   }
141   *ret_type = eslUNKNOWN;
142   return status;
143 }
144 
145 /* Function:  esl_msafile_afa_Read()
146  * Synopsis:  Read in an aligned FASTA format alignment.
147  *
148  * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>,
149  *            parsing for aligned FASTA format. Create
150  *            a new MSA, and return a ptr to that alignment
151  *            in <*ret_msa>. Caller is responsible for free'ing
152  *            this <ESL_MSA>.
153  *
154  * Args:      afp     - open <ESL_MSAFILE>
155  *            ret_msa - RETURN: newly parsed <ESL_MSA>
156  *
157  * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
158  *            allocated MSA, and <afp> is at EOF.
159  *
160  *            <eslEOF> if no (more) alignment data are found in
161  *            <afp>;, and <afp> is returned at EOF.
162  *
163  *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
164  *            <NULL>. <afp> contains information sufficient for
165  *            constructing useful diagnostic output:
166  *            | <afp->errmsg>       | user-directed error message     |
167  *            | <afp->linenumber>   | line # where error was detected |
168  *            | <afp->line>         | offending line (not NUL-term)   |
169  *            | <afp->n>            | length of offending line        |
170  *            | <afp->bf->filename> | name of the file                |
171  *            and <afp> is poised at the start of the following line,
172  *            so (in principle) the caller could try to resume
173  *            parsing.
174  *
175  * Throws:    <eslEMEM> - an allocation failed.
176  *            <eslESYS> - a system call such as fread() failed
177  *            <eslEINCONCEIVABLE> - "impossible" corruption
178  *            On these, <*ret_msa> is returned <NULL>, and the state of
179  *            <afp> is undefined.
180  */
181 int
esl_msafile_afa_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)182 esl_msafile_afa_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
183 {
184   ESL_MSA  *msa  = NULL;
185   int       idx  = 0;
186   int64_t   alen = 0;
187   int64_t   this_alen = 0;
188   char     *p, *tok;
189   esl_pos_t n, ntok;
190   int       status;
191 
192   ESL_DASSERT1( (afp->format == eslMSAFILE_AFA) );
193 
194   afp->errmsg[0] = '\0';	                                  /* Blank the error message. */
195 
196   if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
197   if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
198 
199   /* skip leading blank lines in file */
200   while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
201   if      (status != eslOK)  goto ERROR; /* includes normal EOF */
202 
203   /* tolerate sloppy space at start of line */
204   while (n && isspace(*p)) { p++; n--; }
205   if (*p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected aligned FASTA name/desc line starting with >");
206 
207   do {
208     if (n <= 1 || *p != '>' ) ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected aligned FASTA name/desc line starting with >");
209     p++; n--;			/* advance past > */
210 
211     if ( (status = esl_memtok(&p, &n, " \t", &tok, &ntok)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for aligned FASTA record");
212     if (idx >= msa->sqalloc && (status = esl_msa_Expand(msa))    != eslOK) goto ERROR;
213 
214     if (     (status = esl_msa_SetSeqName       (msa, idx, tok, ntok)) != eslOK) goto ERROR;
215     if (n && (status = esl_msa_SetSeqDescription(msa, idx, p,   n))    != eslOK) goto ERROR;
216 
217     /* The code below will do a realloc on every line. Possible optimization: once you know
218      * alen (from first sequence), allocate subsequent seqs once, use noalloc versions of
219      * esl_strmapcat/esl_abc_dsqcat(). Requires implementing protection against overrun, if
220      * input is bad and a sequence is too long. Could gain ~25% or so that way (quickie
221      * test on PF00005 Full)
222      */
223     this_alen = 0;
224     while ((status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
225     {
226       while (n && isspace(*p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
227       if (n  == 0)   continue;	       /* tolerate and skip blank lines */
228       if (*p == '>') break;
229 
230       if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]),   &this_alen, p, n); }
231       if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &this_alen, p, n); }
232       if (status == eslEINVAL)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
233       else if (status != eslOK)  goto ERROR;
234     }
235     if (this_alen == 0)            ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence %s has alen %" PRId64 , msa->sqname[idx], this_alen);
236     if (alen && alen != this_alen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence %s has alen %" PRId64 "; expected %" PRId64, msa->sqname[idx], this_alen, alen);
237 
238     alen = this_alen;
239     idx++;
240   } while (status == eslOK);	/* normally ends on eslEOF. */
241 
242   msa->nseq = idx;
243   msa->alen = alen;
244   if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
245 
246   *ret_msa = msa;
247   return eslOK;
248 
249  ERROR:
250   if (msa) esl_msa_Destroy(msa);
251   *ret_msa = NULL;
252   return status;
253 
254 }
255 
256 /* Function:  esl_msafile_afa_Write()
257  * Synopsis:  Write an aligned FASTA format alignment file to a stream.
258  *
259  * Purpose:   Write alignment <msa> in aligned FASTA format to a stream
260  *            <fp>.
261  *
262  *            If <msa> is in text mode, residues and gaps are written
263  *            exactly as they appear in the data structure. If <msa>
264  *            is digital, residues are in uppercase and all gaps are
265  *            dots (.). Dots are preferred to dashes because it
266  *            minimizes confusion with A2M format.
267  *
268  * Args:      fp  - open stream to write to
269  *            msa - MSA to write
270  *
271  * Returns:   <eslOK> on success.
272  *
273  * Throws:    <eslEWRITE> on any system write failure, such as filled disk.
274  */
275 int
esl_msafile_afa_Write(FILE * fp,const ESL_MSA * msa)276 esl_msafile_afa_Write(FILE *fp, const ESL_MSA *msa)
277 {
278   int     i;
279   int64_t pos;
280   char    buf[61];
281   int     acpl;       /* actual number of characters per line */
282 
283   for (i = 0; i < msa->nseq; i++)
284     {
285       if (fprintf(fp, ">%s", msa->sqname[i])                                                      < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed");
286       if (msa->sqacc  != NULL && msa->sqacc[i]  != NULL) { if (fprintf(fp, " %s", msa->sqacc[i])  < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed"); }
287       if (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) { if (fprintf(fp, " %s", msa->sqdesc[i]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed"); }
288       if (fputc('\n', fp)                                                                         < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed");
289 
290       pos = 0;
291       while (pos < msa->alen)
292 	{
293 	  acpl = (msa->alen - pos > 60)? 60 : msa->alen - pos;
294 
295 	  if (msa->abc)   esl_abc_TextizeN(msa->abc, msa->ax[i] + pos + 1, acpl, buf);
296 	  if (! msa->abc) strncpy(buf, msa->aseq[i] + pos, acpl);
297 
298 	  buf[acpl] = '\0';
299 	  if (fprintf(fp, "%s\n", buf) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed");
300 	  pos += 60;
301 	}
302     }
303   return eslOK;
304 }
305 
306 /*****************************************************************
307  * 2. Unit tests.
308  *****************************************************************/
309 #ifdef eslMSAFILE_AFA_TESTDRIVE
310 /* a standard globin example, dusted with evil:
311  *  1. \r\n DOS line terminators;
312  *  2. extra blank lines and whitespace
313  *  3. unusual but legal residues
314  */
315 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)316 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
317 {
318   fputs("   \r\n", ofp);
319   fputs(">   MYG_PHYCA   description   \r\n", ofp);
320   fputs("--------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\r\n", ofp);
321   fputs("EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEABJZOUX*SHATKHKIPIKYLEFIS  \r\n", ofp);
322   fputs("EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\r\n", ofp);
323   fputs("   \r\n", ofp);
324   fputs(">GLB5_PETMA\r\n", ofp);
325   fputs("PIVDTGSVAPLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\r\n", ofp);
326   fputs("ADQLKKSADVRWHAERIINAVNDAVASMDDTEKMSMKLRDLSGKHAKSFQVDPQYFKVLA\r\n", ofp);
327   fputs("AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\r\n", ofp);
328   fputs(">HBB_HUMAN\r\n", ofp);
329   fputs("--------VHLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\r\n", ofp);
330   fputs("PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\r\n", ofp);
331   fputs("NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\r\n", ofp);
332   fputs(">HBA_HUMAN\r\n", ofp);
333   fputs("--------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\r\n", ofp);
334   fputs("-DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\r\n", ofp);
335   fputs("HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\r\n", ofp);
336   fputs("   \r\n", ofp);
337 
338   *ret_alphatype = eslAMINO;
339   *ret_nseq      = 4;
340   *ret_alen      = 165;
341 }
342 
343 
344 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)345 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
346 {
347   ESL_ALPHABET        *abc          = NULL;
348   ESL_MSAFILE         *afp          = NULL;
349   ESL_MSA             *msa1         = NULL;
350   ESL_MSA             *msa2         = NULL;
351   char                 tmpfile1[32] = "esltmpXXXXXX";
352   char                 tmpfile2[32] = "esltmpXXXXXX";
353   FILE                *ofp          = NULL;
354   int                  status;
355 
356   /* guessing both the format and the alphabet should work: this is a digital open */
357   if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("afa good file test %d failed: digital open",           testnumber);
358   if (afp->format != eslMSAFILE_AFA)                                                               esl_fatal("afa good file test %d failed: format autodetection",   testnumber);
359   if (abc->type   != expected_alphatype)                                                           esl_fatal("afa good file test %d failed: alphabet autodetection", testnumber);
360 
361   /* This is a digital read, using <abc>. */
362   if ( (status = esl_msafile_afa_Read(afp, &msa1))   != eslOK)     esl_fatal("afa good file test %d failed: msa read, digital", testnumber);
363   if (msa1->nseq != expected_nseq || msa1->alen != expected_alen)  esl_fatal("afa good file test %d failed: nseq/alen",         testnumber);
364   if (esl_msa_Validate(msa1, NULL) != eslOK)                       esl_fatal("afa good file test %d failed: msa invalid",       testnumber);
365 
366   esl_msafile_Close(afp);
367 
368   /* write it back out to a new tmpfile (digital write) */
369   if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("afa good file test %d failed: tmpfile creation",   testnumber);
370   if ( (status = esl_msafile_afa_Write(ofp, msa1))  != eslOK) esl_fatal("afa good file test %d failed: msa write, digital", testnumber);
371   fclose(ofp);
372 
373   /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
374   if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_AFA, NULL, &afp)) != eslOK) esl_fatal("afa good file test %d failed: text mode open", testnumber);
375   if ( (status = esl_msafile_afa_Read(afp, &msa2))                                   != eslOK) esl_fatal("afa good file test %d failed: msa read, text", testnumber);
376   if (msa2->nseq != expected_nseq || msa2->alen != expected_alen)                              esl_fatal("afa good file test %d failed: nseq/alen",      testnumber);
377   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                   esl_fatal("afa good file test %d failed: msa invalid",    testnumber);
378   esl_msafile_Close(afp);
379 
380   /* write it back out to a new tmpfile (text write) */
381   if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("afa good file test %d failed: tmpfile creation", testnumber);
382   if ( (status = esl_msafile_afa_Write(ofp, msa2))  != eslOK) esl_fatal("afa good file test %d failed: msa write, text",  testnumber);
383   fclose(ofp);
384   esl_msa_Destroy(msa2);
385 
386   /* open and read it in digital mode */
387   if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_AFA, NULL, &afp)) != eslOK) esl_fatal("afa good file test %d failed: 2nd digital mode open", testnumber);
388   if ( (status = esl_msafile_afa_Read(afp, &msa2))                                   != eslOK) esl_fatal("afa good file test %d failed: 2nd digital msa read",  testnumber);
389   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                   esl_fatal("afa good file test %d failed: msa invalid",           testnumber);
390   esl_msafile_Close(afp);
391 
392   /* this msa <msa2> should be identical to <msa1> */
393   if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("afa good file test %d failed: msa compare", testnumber);
394 
395   remove(tmpfile1);
396   remove(tmpfile2);
397   esl_msa_Destroy(msa1);
398   esl_msa_Destroy(msa2);
399   esl_alphabet_Destroy(abc);
400 }
401 
402 
403 static void
write_test_msas(FILE * ofp1,FILE * ofp2)404 write_test_msas(FILE *ofp1, FILE *ofp2)
405 {
406   fprintf(ofp1, "\n");
407   fprintf(ofp1, ">seq1    description line for seq1\n");
408   fprintf(ofp1, "..acdefghiklmnpqrstvwy\n");
409   fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY..\n");
410   fprintf(ofp1, "\n");
411   fprintf(ofp1, ">seq2 description line for seq2\n");
412   fprintf(ofp1, "..acdefghiklmnpqrstv--\n");
413   fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWYyy\n");
414   fprintf(ofp1, "  >seq3\n");
415   fprintf(ofp1, "aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
416   fprintf(ofp1, ">seq4\n");
417   fprintf(ofp1, "..acdefghiklm\n");
418   fprintf(ofp1, "npqrstvwyACDE\n");
419   fprintf(ofp1, "FGHIKLMNPQRSTVWY..\n");
420 
421   fprintf(ofp2, "# STOCKHOLM 1.0\n");
422   fprintf(ofp2, "\n");
423   fprintf(ofp2, "#=GS seq1 DE description line for seq1\n");
424   fprintf(ofp2, "#=GS seq2 DE description line for seq2\n");
425   fprintf(ofp2, "\n");
426   fprintf(ofp2, "seq1    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
427   fprintf(ofp2, "seq2    ..acdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWYyy\n");
428   fprintf(ofp2, "seq3    aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
429   fprintf(ofp2, "seq4    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
430   fprintf(ofp2, "//\n");
431 }
432 
433 static void
read_test_msas_digital(char * afafile,char * stkfile)434 read_test_msas_digital(char *afafile, char *stkfile)
435 {
436   char msg[]         = "aligned FASTA msa digital read unit test failed";
437   ESL_ALPHABET *abc  = NULL;
438   ESL_MSAFILE  *afp1 = NULL;
439   ESL_MSAFILE  *afp2 = NULL;
440   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
441   FILE         *afafp, *stkfp;
442   char          afafile2[32] = "esltmpafa2XXXXXX";
443   char          stkfile2[32] = "esltmpstk2XXXXXX";
444 
445   if ( esl_msafile_Open(&abc, afafile, NULL, eslMSAFILE_AFA,       NULL, &afp1) != eslOK)  esl_fatal(msg);
446   if ( !abc || abc->type != eslAMINO)                                                      esl_fatal(msg);
447   if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK)  esl_fatal(msg);
448   if ( esl_msafile_afa_Read      (afp1, &msa1)                                  != eslOK)  esl_fatal(msg);
449   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                  != eslOK)  esl_fatal(msg);
450   if ( esl_msa_Compare(msa1, msa2)                                              != eslOK)  esl_fatal(msg);
451 
452   if ( esl_msafile_afa_Read      (afp1, &msa3) != eslEOF) esl_fatal(msg);
453   if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
454 
455   esl_msafile_Close(afp2);
456   esl_msafile_Close(afp1);
457 
458   /* Now write stk to afa file, and vice versa; then retest */
459   if ( esl_tmpfile_named(afafile2, &afafp)                             != eslOK) esl_fatal(msg);
460   if ( esl_tmpfile_named(stkfile2, &stkfp)                             != eslOK) esl_fatal(msg);
461   if ( esl_msafile_afa_Write      (afafp, msa2)                        != eslOK) esl_fatal(msg);
462   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)  != eslOK) esl_fatal(msg);
463   fclose(afafp);
464   fclose(stkfp);
465   if ( esl_msafile_Open(&abc, afafile2, NULL, eslMSAFILE_AFA,       NULL, &afp1) != eslOK) esl_fatal(msg);
466   if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
467   if ( esl_msafile_afa_Read      (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
468   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
469   if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);
470 
471   remove(afafile2);
472   remove(stkfile2);
473   esl_msafile_Close(afp2);
474   esl_msafile_Close(afp1);
475 
476   esl_msa_Destroy(msa1);
477   esl_msa_Destroy(msa2);
478   esl_msa_Destroy(msa3);
479   esl_msa_Destroy(msa4);
480   esl_alphabet_Destroy(abc);
481 }
482 
483 static void
read_test_msas_text(char * afafile,char * stkfile)484 read_test_msas_text(char *afafile, char *stkfile)
485 {
486   char msg[]         = "aligned FASTA msa text-mode read unit test failed";
487   ESL_MSAFILE  *afp1 = NULL;
488   ESL_MSAFILE  *afp2 = NULL;
489   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
490   FILE         *afafp, *stkfp;
491   char          afafile2[32] = "esltmpafa2XXXXXX";
492   char          stkfile2[32] = "esltmpstk2XXXXXX";
493 
494   /*                    vvvv-- everything's the same as the digital utest except these NULLs  */
495   if ( esl_msafile_Open(NULL, afafile, NULL, eslMSAFILE_AFA,       NULL, &afp1) != eslOK)  esl_fatal(msg);
496   if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK)  esl_fatal(msg);
497   if ( esl_msafile_afa_Read      (afp1, &msa1)                                  != eslOK)  esl_fatal(msg);
498   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                  != eslOK)  esl_fatal(msg);
499   if ( esl_msa_Compare(msa1, msa2)                                              != eslOK)  esl_fatal(msg);
500   if ( esl_msafile_afa_Read      (afp1, &msa3)                                  != eslEOF) esl_fatal(msg);
501   if ( esl_msafile_stockholm_Read(afp2, &msa3)                                  != eslEOF) esl_fatal(msg);
502   esl_msafile_Close(afp2);
503   esl_msafile_Close(afp1);
504 
505   if ( esl_tmpfile_named(afafile2, &afafp)                             != eslOK) esl_fatal(msg);
506   if ( esl_tmpfile_named(stkfile2, &stkfp)                             != eslOK) esl_fatal(msg);
507   if ( esl_msafile_afa_Write      (afafp, msa2)                        != eslOK) esl_fatal(msg);
508   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)  != eslOK) esl_fatal(msg);
509   fclose(afafp);
510   fclose(stkfp);
511   if ( esl_msafile_Open(NULL, afafile2, NULL, eslMSAFILE_AFA,       NULL, &afp1) != eslOK) esl_fatal(msg);
512   if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
513   if ( esl_msafile_afa_Read      (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
514   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
515   if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);
516 
517   remove(afafile2);
518   remove(stkfile2);
519   esl_msafile_Close(afp2);
520   esl_msafile_Close(afp1);
521 
522   esl_msa_Destroy(msa1);
523   esl_msa_Destroy(msa2);
524   esl_msa_Destroy(msa3);
525   esl_msa_Destroy(msa4);
526 }
527 #endif /*eslMSAFILE_AFA_TESTDRIVE*/
528 /*---------------------- end, unit tests ------------------------*/
529 
530 
531 /*****************************************************************
532  * 3. Test driver.
533  *****************************************************************/
534 #ifdef eslMSAFILE_AFA_TESTDRIVE
535 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_afa_utest -DeslMSAFILE_AFA_TESTDRIVE esl_msafile_afa.c -leasel -lm
536  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_afa_utest -DeslMSAFILE_AFA_TESTDRIVE esl_msafile_afa.c -leasel -lm
537  * run:     ./esl_msafile_afa_utest
538  */
539 #include "esl_config.h"
540 
541 #include <stdio.h>
542 
543 #include "easel.h"
544 #include "esl_getopts.h"
545 #include "esl_random.h"
546 #include "esl_msafile.h"
547 #include "esl_msafile_afa.h"
548 
549 static ESL_OPTIONS options[] = {
550    /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
551   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
552   { 0,0,0,0,0,0,0,0,0,0},
553 };
554 static char usage[]  = "[-options]";
555 static char banner[] = "test driver for AFA MSA format module";
556 
557 int
main(int argc,char ** argv)558 main(int argc, char **argv)
559 {
560   char            msg[]        = "aligned FASTA MSA i/o module test driver failed";
561   ESL_GETOPTS    *go           = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
562   char            afafile[32] = "esltmpafaXXXXXX";
563   char            stkfile[32] = "esltmpstkXXXXXX";
564   FILE           *afafp, *stkfp;
565   int             testnumber;
566   int             ngoodtests  = 1;
567   char            tmpfile[32];
568   FILE           *ofp;
569   int             expected_alphatype;
570   int             expected_nseq;
571   int             expected_alen;
572 
573 
574   if ( esl_tmpfile_named(afafile, &afafp) != eslOK) esl_fatal(msg);
575   if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
576   write_test_msas(afafp, stkfp);
577   fclose(afafp);
578   fclose(stkfp);
579 
580   read_test_msas_digital(afafile, stkfile);
581   read_test_msas_text   (afafile, stkfile);
582 
583   /* Various "good" files that should be parsed correctly */
584   for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
585     {
586       strcpy(tmpfile, "esltmpXXXXXX");
587       if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
588       switch (testnumber) {
589       case  1:  utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
590       }
591       fclose(ofp);
592       utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
593       remove(tmpfile);
594     }
595 
596   remove(afafile);
597   remove(stkfile);
598   esl_getopts_Destroy(go);
599   return 0;
600 }
601 #endif /*eslMSAFILE_AFA_TESTDRIVE*/
602 /*--------------------- end, test driver ------------------------*/
603 
604 
605 
606 /*****************************************************************
607  * 4. Examples.
608  *****************************************************************/
609 
610 #ifdef eslMSAFILE_AFA_EXAMPLE
611 /* A full-featured example of reading/writing an MSA in aligned FASTA (AFA) format.
612    gcc -g -Wall -o esl_msafile_afa_example -I. -L. -DeslMSAFILE_afa_EXAMPLE esl_msafile_afa.c -leasel -lm
613    ./esl_msafile_afa_example <msafile>
614  */
615 /*::cexcerpt::msafile_afa_example::begin::*/
616 #include <stdio.h>
617 
618 #include "easel.h"
619 #include "esl_alphabet.h"
620 #include "esl_getopts.h"
621 #include "esl_msa.h"
622 #include "esl_msafile.h"
623 #include "esl_msafile_afa.h"
624 
625 static ESL_OPTIONS options[] = {
626   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
627   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",            0 },
628   { "-1",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "override autodetection; force AFA format",        0 },
629   { "-q",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "quieter: don't write msa back, just summary",     0 },
630   { "-t",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",              0 },
631   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is DNA",                    0 },
632   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is RNA",                    0 },
633   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is protein",                0 },
634   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
635 };
636 static char usage[]  = "[-options] <msafile>";
637 static char banner[] = "example of guessing, reading, writing AFA format";
638 
639 int
main(int argc,char ** argv)640 main(int argc, char **argv)
641 {
642   ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
643   char               *filename    = esl_opt_GetArg(go, 1);
644   int                 infmt       = eslMSAFILE_UNKNOWN;
645   ESL_ALPHABET       *abc         = NULL;
646   ESL_MSAFILE        *afp         = NULL;
647   ESL_MSA            *msa         = NULL;
648   int                 status;
649 
650   if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_AFA;  /* override format autodetection */
651 
652   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
653   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
654   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
655 
656   /* Text mode: pass NULL for alphabet.
657    * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
658    */
659   if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
660   else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
661   if (status != eslOK) esl_msafile_OpenFailure(afp, status);
662 
663   if ((status = esl_msafile_afa_Read(afp, &msa)) != eslOK)
664     esl_msafile_ReadFailure(afp, status);
665 
666   printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
667   printf("# of seqs:      %d\n", msa->nseq);
668   printf("# of cols:      %d\n", (int) msa->alen);
669   printf("\n");
670 
671   if (! esl_opt_GetBoolean(go, "-q"))
672     esl_msafile_afa_Write(stdout, msa);
673 
674   esl_msa_Destroy(msa);
675   esl_msafile_Close(afp);
676   if (abc) esl_alphabet_Destroy(abc);
677   esl_getopts_Destroy(go);
678   exit(0);
679 }
680 /*::cexcerpt::msafile_afa_example::end::*/
681 #endif /*eslMSAFILE_AFA_EXAMPLE*/
682 
683 #ifdef eslMSAFILE_AFA_EXAMPLE2
684 /* A minimal example. Read AFA format MSA, in text mode.
685    gcc -g -Wall -o esl_msafile_afa_example2 -I. -L. -DeslMSAFILE_AFA_EXAMPLE2 esl_msafile_afa.c -leasel -lm
686    ./esl_msafile_afa_example2 <msafile>
687 */
688 /*::cexcerpt::msafile_afa_example2::begin::*/
689 #include <stdio.h>
690 
691 #include "easel.h"
692 #include "esl_msa.h"
693 #include "esl_msafile.h"
694 #include "esl_msafile_afa.h"
695 
696 int
main(int argc,char ** argv)697 main(int argc, char **argv)
698 {
699   char         *filename = argv[1];
700   int           fmt      = eslMSAFILE_AFA;
701   ESL_MSAFILE  *afp      = NULL;
702   ESL_MSA      *msa      = NULL;
703   int           status;
704 
705   if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
706   if ( (status = esl_msafile_afa_Read(afp, &msa))                         != eslOK) esl_msafile_ReadFailure(afp, status);
707 
708   printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
709 
710   esl_msafile_afa_Write(stdout, msa);
711   esl_msa_Destroy(msa);
712   esl_msafile_Close(afp);
713   exit(0);
714 }
715 /*::cexcerpt::msafile_afa_example2::end::*/
716 #endif /*eslMSAFILE_AFA_EXAMPLE2*/
717 /*--------------------- end of examples -------------------------*/
718