1 /* i/o of multiple sequence alignment files in "aligned FASTA" format
2 *
3 * Contents:
4 * 1. API for reading/writing AFA format
5 * 2. Unit tests.
6 * 3. Test driver.
7 * 4. Examples.
8 */
9 #include "esl_config.h"
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <ctype.h>
15
16 #include "easel.h"
17 #include "esl_alphabet.h"
18 #include "esl_mem.h"
19 #include "esl_msa.h"
20 #include "esl_msafile.h"
21
22 #include "esl_msafile_afa.h"
23
24 /*****************************************************************
25 *# 1. API for reading/writing AFA format
26 *****************************************************************/
27
28 /* Function: esl_msafile_afa_SetInmap()
29 * Synopsis: Set input map for aligned FASTA format.
30 *
31 * Purpose: Set the <afp->inmap> for aligned FASTA format.
32 *
33 * Text mode accepts any <isgraph()> character.
34 * Digital mode enforces the usual Easel alphabets.
35 *
36 * We skip spaces in input lines of aligned FASTA format;
37 * map ' ' to <eslDSQ_IGNORED>.
38 */
39 int
esl_msafile_afa_SetInmap(ESL_MSAFILE * afp)40 esl_msafile_afa_SetInmap(ESL_MSAFILE *afp)
41 {
42 int sym;
43
44 if (afp->abc)
45 {
46 for (sym = 0; sym < 128; sym++)
47 afp->inmap[sym] = afp->abc->inmap[sym];
48 afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
49 }
50
51 if (! afp->abc)
52 {
53 for (sym = 1; sym < 128; sym++)
54 afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
55 afp->inmap[0] = '?';
56 }
57
58 afp->inmap[' '] = eslDSQ_IGNORED;
59 return eslOK;
60 }
61
62
63 /* Function: esl_msafile_afa_GuessAlphabet()
64 * Synopsis: Guess the alphabet of an open AFA MSA file.
65 *
66 * Purpose: Guess the alpbabet of the sequences in open
67 * AFA format MSA file <afp>.
68 *
69 * On a normal return, <*ret_type> is set to <eslDNA>,
70 * <eslRNA>, or <eslAMINO>, and <afp> is reset to its
71 * original position.
72 *
73 * Args: afp - open AFA format MSA file
74 * ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
75 *
76 * Returns: <eslOK> on success.
77 * <eslENOALPHABET> if alphabet type can't be determined.
78 * In either case, <afp> is rewound to the position it
79 * started at.
80 *
81 * Throws: <eslEMEM> on allocation error.
82 * <eslESYS> on failures of fread() or other system calls
83 *
84 * Note: Essentially identical to <esl_msafile_a2m_GuessAlphabet()>,
85 * but we provide both versions because design calls for
86 * modularity/separability of parsers.
87 */
88 int
esl_msafile_afa_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)89 esl_msafile_afa_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
90 {
91 int alphatype = eslUNKNOWN;
92 esl_pos_t anchor = -1;
93 int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
94 int nsteps = 3;
95 int step = 0;
96 int nres = 0;
97 int x;
98 int64_t ct[26];
99 char *p;
100 esl_pos_t n, pos;
101 int status;
102
103 for (x = 0; x < 26; x++) ct[x] = 0;
104
105 anchor = esl_buffer_GetOffset(afp->bf);
106 if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
107
108 while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
109 {
110 while (n && isspace(*p)) { p++; n--; }
111 if (!n || *p == '>') continue;
112
113 for (pos = 0; pos < n; pos++)
114 if (isalpha(p[pos])) {
115 x = toupper(p[pos]) - 'A';
116 ct[x]++;
117 nres++;
118 }
119
120 /* try to stop early, checking after 500, 5000, and 50000 residues: */
121 if (step < nsteps && nres > threshold[step]) {
122 if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
123 step++;
124 }
125 }
126 if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
127 status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
128
129 /* deliberate flowthrough...*/
130 DONE:
131 esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
132 esl_buffer_RaiseAnchor(afp->bf, anchor);
133 *ret_type = alphatype;
134 return status;
135
136 ERROR:
137 if (anchor != -1) {
138 esl_buffer_SetOffset(afp->bf, anchor);
139 esl_buffer_RaiseAnchor(afp->bf, anchor);
140 }
141 *ret_type = eslUNKNOWN;
142 return status;
143 }
144
145 /* Function: esl_msafile_afa_Read()
146 * Synopsis: Read in an aligned FASTA format alignment.
147 *
148 * Purpose: Read an MSA from an open <ESL_MSAFILE> <afp>,
149 * parsing for aligned FASTA format. Create
150 * a new MSA, and return a ptr to that alignment
151 * in <*ret_msa>. Caller is responsible for free'ing
152 * this <ESL_MSA>.
153 *
154 * Args: afp - open <ESL_MSAFILE>
155 * ret_msa - RETURN: newly parsed <ESL_MSA>
156 *
157 * Returns: <eslOK> on success. <*ret_msa> is set to the newly
158 * allocated MSA, and <afp> is at EOF.
159 *
160 * <eslEOF> if no (more) alignment data are found in
161 * <afp>;, and <afp> is returned at EOF.
162 *
163 * <eslEFORMAT> on a parse error. <*ret_msa> is set to
164 * <NULL>. <afp> contains information sufficient for
165 * constructing useful diagnostic output:
166 * | <afp->errmsg> | user-directed error message |
167 * | <afp->linenumber> | line # where error was detected |
168 * | <afp->line> | offending line (not NUL-term) |
169 * | <afp->n> | length of offending line |
170 * | <afp->bf->filename> | name of the file |
171 * and <afp> is poised at the start of the following line,
172 * so (in principle) the caller could try to resume
173 * parsing.
174 *
175 * Throws: <eslEMEM> - an allocation failed.
176 * <eslESYS> - a system call such as fread() failed
177 * <eslEINCONCEIVABLE> - "impossible" corruption
178 * On these, <*ret_msa> is returned <NULL>, and the state of
179 * <afp> is undefined.
180 */
181 int
esl_msafile_afa_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)182 esl_msafile_afa_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
183 {
184 ESL_MSA *msa = NULL;
185 int idx = 0;
186 int64_t alen = 0;
187 int64_t this_alen = 0;
188 char *p, *tok;
189 esl_pos_t n, ntok;
190 int status;
191
192 ESL_DASSERT1( (afp->format == eslMSAFILE_AFA) );
193
194 afp->errmsg[0] = '\0'; /* Blank the error message. */
195
196 if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
197 if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
198
199 /* skip leading blank lines in file */
200 while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
201 if (status != eslOK) goto ERROR; /* includes normal EOF */
202
203 /* tolerate sloppy space at start of line */
204 while (n && isspace(*p)) { p++; n--; }
205 if (*p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected aligned FASTA name/desc line starting with >");
206
207 do {
208 if (n <= 1 || *p != '>' ) ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected aligned FASTA name/desc line starting with >");
209 p++; n--; /* advance past > */
210
211 if ( (status = esl_memtok(&p, &n, " \t", &tok, &ntok)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for aligned FASTA record");
212 if (idx >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
213
214 if ( (status = esl_msa_SetSeqName (msa, idx, tok, ntok)) != eslOK) goto ERROR;
215 if (n && (status = esl_msa_SetSeqDescription(msa, idx, p, n)) != eslOK) goto ERROR;
216
217 /* The code below will do a realloc on every line. Possible optimization: once you know
218 * alen (from first sequence), allocate subsequent seqs once, use noalloc versions of
219 * esl_strmapcat/esl_abc_dsqcat(). Requires implementing protection against overrun, if
220 * input is bad and a sequence is too long. Could gain ~25% or so that way (quickie
221 * test on PF00005 Full)
222 */
223 this_alen = 0;
224 while ((status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
225 {
226 while (n && isspace(*p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
227 if (n == 0) continue; /* tolerate and skip blank lines */
228 if (*p == '>') break;
229
230 if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &this_alen, p, n); }
231 if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &this_alen, p, n); }
232 if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
233 else if (status != eslOK) goto ERROR;
234 }
235 if (this_alen == 0) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence %s has alen %" PRId64 , msa->sqname[idx], this_alen);
236 if (alen && alen != this_alen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence %s has alen %" PRId64 "; expected %" PRId64, msa->sqname[idx], this_alen, alen);
237
238 alen = this_alen;
239 idx++;
240 } while (status == eslOK); /* normally ends on eslEOF. */
241
242 msa->nseq = idx;
243 msa->alen = alen;
244 if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
245
246 *ret_msa = msa;
247 return eslOK;
248
249 ERROR:
250 if (msa) esl_msa_Destroy(msa);
251 *ret_msa = NULL;
252 return status;
253
254 }
255
256 /* Function: esl_msafile_afa_Write()
257 * Synopsis: Write an aligned FASTA format alignment file to a stream.
258 *
259 * Purpose: Write alignment <msa> in aligned FASTA format to a stream
260 * <fp>.
261 *
262 * If <msa> is in text mode, residues and gaps are written
263 * exactly as they appear in the data structure. If <msa>
264 * is digital, residues are in uppercase and all gaps are
265 * dots (.). Dots are preferred to dashes because it
266 * minimizes confusion with A2M format.
267 *
268 * Args: fp - open stream to write to
269 * msa - MSA to write
270 *
271 * Returns: <eslOK> on success.
272 *
273 * Throws: <eslEWRITE> on any system write failure, such as filled disk.
274 */
275 int
esl_msafile_afa_Write(FILE * fp,const ESL_MSA * msa)276 esl_msafile_afa_Write(FILE *fp, const ESL_MSA *msa)
277 {
278 int i;
279 int64_t pos;
280 char buf[61];
281 int acpl; /* actual number of characters per line */
282
283 for (i = 0; i < msa->nseq; i++)
284 {
285 if (fprintf(fp, ">%s", msa->sqname[i]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed");
286 if (msa->sqacc != NULL && msa->sqacc[i] != NULL) { if (fprintf(fp, " %s", msa->sqacc[i]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed"); }
287 if (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) { if (fprintf(fp, " %s", msa->sqdesc[i]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed"); }
288 if (fputc('\n', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed");
289
290 pos = 0;
291 while (pos < msa->alen)
292 {
293 acpl = (msa->alen - pos > 60)? 60 : msa->alen - pos;
294
295 if (msa->abc) esl_abc_TextizeN(msa->abc, msa->ax[i] + pos + 1, acpl, buf);
296 if (! msa->abc) strncpy(buf, msa->aseq[i] + pos, acpl);
297
298 buf[acpl] = '\0';
299 if (fprintf(fp, "%s\n", buf) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "afa msa file write failed");
300 pos += 60;
301 }
302 }
303 return eslOK;
304 }
305
306 /*****************************************************************
307 * 2. Unit tests.
308 *****************************************************************/
309 #ifdef eslMSAFILE_AFA_TESTDRIVE
310 /* a standard globin example, dusted with evil:
311 * 1. \r\n DOS line terminators;
312 * 2. extra blank lines and whitespace
313 * 3. unusual but legal residues
314 */
315 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)316 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
317 {
318 fputs(" \r\n", ofp);
319 fputs("> MYG_PHYCA description \r\n", ofp);
320 fputs("--------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\r\n", ofp);
321 fputs("EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEABJZOUX*SHATKHKIPIKYLEFIS \r\n", ofp);
322 fputs("EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\r\n", ofp);
323 fputs(" \r\n", ofp);
324 fputs(">GLB5_PETMA\r\n", ofp);
325 fputs("PIVDTGSVAPLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\r\n", ofp);
326 fputs("ADQLKKSADVRWHAERIINAVNDAVASMDDTEKMSMKLRDLSGKHAKSFQVDPQYFKVLA\r\n", ofp);
327 fputs("AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\r\n", ofp);
328 fputs(">HBB_HUMAN\r\n", ofp);
329 fputs("--------VHLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\r\n", ofp);
330 fputs("PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\r\n", ofp);
331 fputs("NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\r\n", ofp);
332 fputs(">HBA_HUMAN\r\n", ofp);
333 fputs("--------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\r\n", ofp);
334 fputs("-DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\r\n", ofp);
335 fputs("HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\r\n", ofp);
336 fputs(" \r\n", ofp);
337
338 *ret_alphatype = eslAMINO;
339 *ret_nseq = 4;
340 *ret_alen = 165;
341 }
342
343
344 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)345 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
346 {
347 ESL_ALPHABET *abc = NULL;
348 ESL_MSAFILE *afp = NULL;
349 ESL_MSA *msa1 = NULL;
350 ESL_MSA *msa2 = NULL;
351 char tmpfile1[32] = "esltmpXXXXXX";
352 char tmpfile2[32] = "esltmpXXXXXX";
353 FILE *ofp = NULL;
354 int status;
355
356 /* guessing both the format and the alphabet should work: this is a digital open */
357 if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("afa good file test %d failed: digital open", testnumber);
358 if (afp->format != eslMSAFILE_AFA) esl_fatal("afa good file test %d failed: format autodetection", testnumber);
359 if (abc->type != expected_alphatype) esl_fatal("afa good file test %d failed: alphabet autodetection", testnumber);
360
361 /* This is a digital read, using <abc>. */
362 if ( (status = esl_msafile_afa_Read(afp, &msa1)) != eslOK) esl_fatal("afa good file test %d failed: msa read, digital", testnumber);
363 if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("afa good file test %d failed: nseq/alen", testnumber);
364 if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("afa good file test %d failed: msa invalid", testnumber);
365
366 esl_msafile_Close(afp);
367
368 /* write it back out to a new tmpfile (digital write) */
369 if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("afa good file test %d failed: tmpfile creation", testnumber);
370 if ( (status = esl_msafile_afa_Write(ofp, msa1)) != eslOK) esl_fatal("afa good file test %d failed: msa write, digital", testnumber);
371 fclose(ofp);
372
373 /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
374 if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_AFA, NULL, &afp)) != eslOK) esl_fatal("afa good file test %d failed: text mode open", testnumber);
375 if ( (status = esl_msafile_afa_Read(afp, &msa2)) != eslOK) esl_fatal("afa good file test %d failed: msa read, text", testnumber);
376 if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("afa good file test %d failed: nseq/alen", testnumber);
377 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("afa good file test %d failed: msa invalid", testnumber);
378 esl_msafile_Close(afp);
379
380 /* write it back out to a new tmpfile (text write) */
381 if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("afa good file test %d failed: tmpfile creation", testnumber);
382 if ( (status = esl_msafile_afa_Write(ofp, msa2)) != eslOK) esl_fatal("afa good file test %d failed: msa write, text", testnumber);
383 fclose(ofp);
384 esl_msa_Destroy(msa2);
385
386 /* open and read it in digital mode */
387 if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_AFA, NULL, &afp)) != eslOK) esl_fatal("afa good file test %d failed: 2nd digital mode open", testnumber);
388 if ( (status = esl_msafile_afa_Read(afp, &msa2)) != eslOK) esl_fatal("afa good file test %d failed: 2nd digital msa read", testnumber);
389 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("afa good file test %d failed: msa invalid", testnumber);
390 esl_msafile_Close(afp);
391
392 /* this msa <msa2> should be identical to <msa1> */
393 if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("afa good file test %d failed: msa compare", testnumber);
394
395 remove(tmpfile1);
396 remove(tmpfile2);
397 esl_msa_Destroy(msa1);
398 esl_msa_Destroy(msa2);
399 esl_alphabet_Destroy(abc);
400 }
401
402
403 static void
write_test_msas(FILE * ofp1,FILE * ofp2)404 write_test_msas(FILE *ofp1, FILE *ofp2)
405 {
406 fprintf(ofp1, "\n");
407 fprintf(ofp1, ">seq1 description line for seq1\n");
408 fprintf(ofp1, "..acdefghiklmnpqrstvwy\n");
409 fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY..\n");
410 fprintf(ofp1, "\n");
411 fprintf(ofp1, ">seq2 description line for seq2\n");
412 fprintf(ofp1, "..acdefghiklmnpqrstv--\n");
413 fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWYyy\n");
414 fprintf(ofp1, " >seq3\n");
415 fprintf(ofp1, "aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
416 fprintf(ofp1, ">seq4\n");
417 fprintf(ofp1, "..acdefghiklm\n");
418 fprintf(ofp1, "npqrstvwyACDE\n");
419 fprintf(ofp1, "FGHIKLMNPQRSTVWY..\n");
420
421 fprintf(ofp2, "# STOCKHOLM 1.0\n");
422 fprintf(ofp2, "\n");
423 fprintf(ofp2, "#=GS seq1 DE description line for seq1\n");
424 fprintf(ofp2, "#=GS seq2 DE description line for seq2\n");
425 fprintf(ofp2, "\n");
426 fprintf(ofp2, "seq1 ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
427 fprintf(ofp2, "seq2 ..acdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWYyy\n");
428 fprintf(ofp2, "seq3 aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
429 fprintf(ofp2, "seq4 ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
430 fprintf(ofp2, "//\n");
431 }
432
433 static void
read_test_msas_digital(char * afafile,char * stkfile)434 read_test_msas_digital(char *afafile, char *stkfile)
435 {
436 char msg[] = "aligned FASTA msa digital read unit test failed";
437 ESL_ALPHABET *abc = NULL;
438 ESL_MSAFILE *afp1 = NULL;
439 ESL_MSAFILE *afp2 = NULL;
440 ESL_MSA *msa1, *msa2, *msa3, *msa4;
441 FILE *afafp, *stkfp;
442 char afafile2[32] = "esltmpafa2XXXXXX";
443 char stkfile2[32] = "esltmpstk2XXXXXX";
444
445 if ( esl_msafile_Open(&abc, afafile, NULL, eslMSAFILE_AFA, NULL, &afp1) != eslOK) esl_fatal(msg);
446 if ( !abc || abc->type != eslAMINO) esl_fatal(msg);
447 if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
448 if ( esl_msafile_afa_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
449 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
450 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
451
452 if ( esl_msafile_afa_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
453 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
454
455 esl_msafile_Close(afp2);
456 esl_msafile_Close(afp1);
457
458 /* Now write stk to afa file, and vice versa; then retest */
459 if ( esl_tmpfile_named(afafile2, &afafp) != eslOK) esl_fatal(msg);
460 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
461 if ( esl_msafile_afa_Write (afafp, msa2) != eslOK) esl_fatal(msg);
462 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
463 fclose(afafp);
464 fclose(stkfp);
465 if ( esl_msafile_Open(&abc, afafile2, NULL, eslMSAFILE_AFA, NULL, &afp1) != eslOK) esl_fatal(msg);
466 if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
467 if ( esl_msafile_afa_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
468 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
469 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
470
471 remove(afafile2);
472 remove(stkfile2);
473 esl_msafile_Close(afp2);
474 esl_msafile_Close(afp1);
475
476 esl_msa_Destroy(msa1);
477 esl_msa_Destroy(msa2);
478 esl_msa_Destroy(msa3);
479 esl_msa_Destroy(msa4);
480 esl_alphabet_Destroy(abc);
481 }
482
483 static void
read_test_msas_text(char * afafile,char * stkfile)484 read_test_msas_text(char *afafile, char *stkfile)
485 {
486 char msg[] = "aligned FASTA msa text-mode read unit test failed";
487 ESL_MSAFILE *afp1 = NULL;
488 ESL_MSAFILE *afp2 = NULL;
489 ESL_MSA *msa1, *msa2, *msa3, *msa4;
490 FILE *afafp, *stkfp;
491 char afafile2[32] = "esltmpafa2XXXXXX";
492 char stkfile2[32] = "esltmpstk2XXXXXX";
493
494 /* vvvv-- everything's the same as the digital utest except these NULLs */
495 if ( esl_msafile_Open(NULL, afafile, NULL, eslMSAFILE_AFA, NULL, &afp1) != eslOK) esl_fatal(msg);
496 if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
497 if ( esl_msafile_afa_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
498 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
499 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
500 if ( esl_msafile_afa_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
501 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
502 esl_msafile_Close(afp2);
503 esl_msafile_Close(afp1);
504
505 if ( esl_tmpfile_named(afafile2, &afafp) != eslOK) esl_fatal(msg);
506 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
507 if ( esl_msafile_afa_Write (afafp, msa2) != eslOK) esl_fatal(msg);
508 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
509 fclose(afafp);
510 fclose(stkfp);
511 if ( esl_msafile_Open(NULL, afafile2, NULL, eslMSAFILE_AFA, NULL, &afp1) != eslOK) esl_fatal(msg);
512 if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
513 if ( esl_msafile_afa_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
514 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
515 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
516
517 remove(afafile2);
518 remove(stkfile2);
519 esl_msafile_Close(afp2);
520 esl_msafile_Close(afp1);
521
522 esl_msa_Destroy(msa1);
523 esl_msa_Destroy(msa2);
524 esl_msa_Destroy(msa3);
525 esl_msa_Destroy(msa4);
526 }
527 #endif /*eslMSAFILE_AFA_TESTDRIVE*/
528 /*---------------------- end, unit tests ------------------------*/
529
530
531 /*****************************************************************
532 * 3. Test driver.
533 *****************************************************************/
534 #ifdef eslMSAFILE_AFA_TESTDRIVE
535 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_afa_utest -DeslMSAFILE_AFA_TESTDRIVE esl_msafile_afa.c -leasel -lm
536 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_afa_utest -DeslMSAFILE_AFA_TESTDRIVE esl_msafile_afa.c -leasel -lm
537 * run: ./esl_msafile_afa_utest
538 */
539 #include "esl_config.h"
540
541 #include <stdio.h>
542
543 #include "easel.h"
544 #include "esl_getopts.h"
545 #include "esl_random.h"
546 #include "esl_msafile.h"
547 #include "esl_msafile_afa.h"
548
549 static ESL_OPTIONS options[] = {
550 /* name type default env range togs reqs incomp help docgrp */
551 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
552 { 0,0,0,0,0,0,0,0,0,0},
553 };
554 static char usage[] = "[-options]";
555 static char banner[] = "test driver for AFA MSA format module";
556
557 int
main(int argc,char ** argv)558 main(int argc, char **argv)
559 {
560 char msg[] = "aligned FASTA MSA i/o module test driver failed";
561 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
562 char afafile[32] = "esltmpafaXXXXXX";
563 char stkfile[32] = "esltmpstkXXXXXX";
564 FILE *afafp, *stkfp;
565 int testnumber;
566 int ngoodtests = 1;
567 char tmpfile[32];
568 FILE *ofp;
569 int expected_alphatype;
570 int expected_nseq;
571 int expected_alen;
572
573
574 if ( esl_tmpfile_named(afafile, &afafp) != eslOK) esl_fatal(msg);
575 if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
576 write_test_msas(afafp, stkfp);
577 fclose(afafp);
578 fclose(stkfp);
579
580 read_test_msas_digital(afafile, stkfile);
581 read_test_msas_text (afafile, stkfile);
582
583 /* Various "good" files that should be parsed correctly */
584 for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
585 {
586 strcpy(tmpfile, "esltmpXXXXXX");
587 if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
588 switch (testnumber) {
589 case 1: utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
590 }
591 fclose(ofp);
592 utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
593 remove(tmpfile);
594 }
595
596 remove(afafile);
597 remove(stkfile);
598 esl_getopts_Destroy(go);
599 return 0;
600 }
601 #endif /*eslMSAFILE_AFA_TESTDRIVE*/
602 /*--------------------- end, test driver ------------------------*/
603
604
605
606 /*****************************************************************
607 * 4. Examples.
608 *****************************************************************/
609
610 #ifdef eslMSAFILE_AFA_EXAMPLE
611 /* A full-featured example of reading/writing an MSA in aligned FASTA (AFA) format.
612 gcc -g -Wall -o esl_msafile_afa_example -I. -L. -DeslMSAFILE_afa_EXAMPLE esl_msafile_afa.c -leasel -lm
613 ./esl_msafile_afa_example <msafile>
614 */
615 /*::cexcerpt::msafile_afa_example::begin::*/
616 #include <stdio.h>
617
618 #include "easel.h"
619 #include "esl_alphabet.h"
620 #include "esl_getopts.h"
621 #include "esl_msa.h"
622 #include "esl_msafile.h"
623 #include "esl_msafile_afa.h"
624
625 static ESL_OPTIONS options[] = {
626 /* name type default env range toggles reqs incomp help docgroup*/
627 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
628 { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "override autodetection; force AFA format", 0 },
629 { "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
630 { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
631 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
632 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
633 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
634 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
635 };
636 static char usage[] = "[-options] <msafile>";
637 static char banner[] = "example of guessing, reading, writing AFA format";
638
639 int
main(int argc,char ** argv)640 main(int argc, char **argv)
641 {
642 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
643 char *filename = esl_opt_GetArg(go, 1);
644 int infmt = eslMSAFILE_UNKNOWN;
645 ESL_ALPHABET *abc = NULL;
646 ESL_MSAFILE *afp = NULL;
647 ESL_MSA *msa = NULL;
648 int status;
649
650 if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_AFA; /* override format autodetection */
651
652 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
653 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
654 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
655
656 /* Text mode: pass NULL for alphabet.
657 * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
658 */
659 if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
660 else status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
661 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
662
663 if ((status = esl_msafile_afa_Read(afp, &msa)) != eslOK)
664 esl_msafile_ReadFailure(afp, status);
665
666 printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
667 printf("# of seqs: %d\n", msa->nseq);
668 printf("# of cols: %d\n", (int) msa->alen);
669 printf("\n");
670
671 if (! esl_opt_GetBoolean(go, "-q"))
672 esl_msafile_afa_Write(stdout, msa);
673
674 esl_msa_Destroy(msa);
675 esl_msafile_Close(afp);
676 if (abc) esl_alphabet_Destroy(abc);
677 esl_getopts_Destroy(go);
678 exit(0);
679 }
680 /*::cexcerpt::msafile_afa_example::end::*/
681 #endif /*eslMSAFILE_AFA_EXAMPLE*/
682
683 #ifdef eslMSAFILE_AFA_EXAMPLE2
684 /* A minimal example. Read AFA format MSA, in text mode.
685 gcc -g -Wall -o esl_msafile_afa_example2 -I. -L. -DeslMSAFILE_AFA_EXAMPLE2 esl_msafile_afa.c -leasel -lm
686 ./esl_msafile_afa_example2 <msafile>
687 */
688 /*::cexcerpt::msafile_afa_example2::begin::*/
689 #include <stdio.h>
690
691 #include "easel.h"
692 #include "esl_msa.h"
693 #include "esl_msafile.h"
694 #include "esl_msafile_afa.h"
695
696 int
main(int argc,char ** argv)697 main(int argc, char **argv)
698 {
699 char *filename = argv[1];
700 int fmt = eslMSAFILE_AFA;
701 ESL_MSAFILE *afp = NULL;
702 ESL_MSA *msa = NULL;
703 int status;
704
705 if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
706 if ( (status = esl_msafile_afa_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status);
707
708 printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
709
710 esl_msafile_afa_Write(stdout, msa);
711 esl_msa_Destroy(msa);
712 esl_msafile_Close(afp);
713 exit(0);
714 }
715 /*::cexcerpt::msafile_afa_example2::end::*/
716 #endif /*eslMSAFILE_AFA_EXAMPLE2*/
717 /*--------------------- end of examples -------------------------*/
718