1 /* I/O of multiple sequence alignments in A2M format (UCSC SAM)
2 *
3 * Contents:
4 * 1. API for reading/writing A2M format
5 * 2. Internal functions used by the A2M parser
6 * 3. Unit tests.
7 * 4. Test driver.
8 * 5. Examples.
9 *
10 * Reference:
11 * http://compbio.soe.ucsc.edu/a2m-desc.html
12 */
13 #include "esl_config.h"
14
15 #include <stdio.h>
16 #include <string.h>
17
18 #include "easel.h"
19 #include "esl_alphabet.h"
20 #include "esl_mem.h"
21 #include "esl_msa.h"
22 #include "esl_msafile.h"
23
24 #include "esl_msafile_a2m.h"
25
26 static int a2m_padding_digital(ESL_MSA *msa, char **csflag, int *nins, int ncons);
27 static int a2m_padding_text (ESL_MSA *msa, char **csflag, int *nins, int ncons);
28
29 /*****************************************************************
30 *# 1. API for reading/writing A2M format
31 *****************************************************************/
32
33
34 /* Function: esl_msafile_a2m_SetInmap()
35 * Synopsis: Set input map specific for A2M format
36 *
37 * Purpose: Set the <afp->inmap> for A2M format.
38 *
39 * A2M ignores whitespace and periods (and ignoring
40 * periods makes us agnostic whether the input is
41 * "dotless" format or not). Make ' ', '\t', and
42 * '.' ignored.
43 *
44 * A2M format only allows - for a gap, so make
45 * all other Easel gap characters illegal on input.
46 *
47 * A2M format handles an 'O' specially: this indicates
48 * a FIM (free insertion module) to the SAM software.
49 * We ignore it.
50
51 * A2M allows ACDEFGHIKLMNPQRSTVWY for aa, plus XBZ.
52 * Unknown letters (including other ambig codes) are mapped
53 * to X. A2M allows ACGTU for nucleic, plus YRN. Unknown
54 * letters (including other ambig codes) are mapped to N.
55 * However, Easel enforces its normal input restrictions on
56 * residues: digital bioalphabets allow only valid residue
57 * symbols, and text mode allows any isalpha() character
58 * verbatim.
59 */
60 int
esl_msafile_a2m_SetInmap(ESL_MSAFILE * afp)61 esl_msafile_a2m_SetInmap(ESL_MSAFILE *afp)
62 {
63 int sym;
64
65 if (afp->abc)
66 {
67 for (sym = 0; sym < 128; sym++)
68 afp->inmap[sym] = afp->abc->inmap[sym];
69 afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
70 afp->inmap['_'] = eslDSQ_ILLEGAL;
71 afp->inmap['*'] = eslDSQ_ILLEGAL;
72 afp->inmap['~'] = eslDSQ_ILLEGAL;
73 }
74
75 if (! afp->abc)
76 {
77 for (sym = 1; sym < 128; sym++)
78 afp->inmap[sym] = (isalpha(sym) ? sym : eslDSQ_ILLEGAL);
79 afp->inmap[0] = '?';
80 afp->inmap['-'] = '-';
81 }
82
83 afp->inmap[' '] = eslDSQ_IGNORED;
84 afp->inmap['\t'] = eslDSQ_IGNORED;
85 afp->inmap['.'] = eslDSQ_IGNORED;
86 afp->inmap['O'] = eslDSQ_IGNORED;
87 afp->inmap['o'] = eslDSQ_IGNORED;
88 return eslOK;
89 }
90
91 /* Function: esl_msafile_a2m_GuessAlphabet()
92 * Synopsis: Guess the alphabet of an open A2M MSA file.
93 *
94 * Purpose: Guess the alpbabet of the sequences in open
95 * A2M format MSA file <afp>.
96 *
97 * On a normal return, <*ret_type> is set to <eslDNA>,
98 * <eslRNA>, or <eslAMINO>, and <afp> is reset to its
99 * original position.
100 *
101 * Args: afp - open A2M format MSA file
102 * ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
103 *
104 * Returns: <eslOK> on success.
105 * <eslENOALPHABET> if alphabet type can't be determined.
106 * In either case, <afp> is rewound to the position it
107 * started at.
108 *
109 * Throws: <eslEMEM> on allocation error.
110 * <eslESYS> on failures of fread() or other system calls
111 */
112 int
esl_msafile_a2m_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)113 esl_msafile_a2m_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
114 {
115 int alphatype = eslUNKNOWN;
116 esl_pos_t anchor = -1;
117 int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
118 int nsteps = 3;
119 int step = 0;
120 int nres = 0;
121 int x;
122 int64_t ct[26];
123 char *p;
124 esl_pos_t n, pos;
125 int status;
126
127 for (x = 0; x < 26; x++) ct[x] = 0;
128
129 anchor = esl_buffer_GetOffset(afp->bf);
130 if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
131
132 while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
133 {
134 while (n && isspace(*p)) { p++; n--; }
135 if (!n || *p == '>') continue;
136
137 for (pos = 0; pos < n; pos++)
138 if (isalpha(p[pos])) {
139 x = toupper(p[pos]) - 'A';
140 ct[x]++;
141 nres++;
142 }
143
144 /* try to stop early, checking after 500, 5000, and 50000 residues: */
145 if (step < nsteps && nres > threshold[step]) {
146 if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
147 step++;
148 }
149 }
150 if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
151 status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
152
153 /* deliberate flowthrough...*/
154 DONE:
155 esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
156 esl_buffer_RaiseAnchor(afp->bf, anchor);
157 *ret_type = alphatype;
158 return status;
159
160 ERROR:
161 if (anchor != -1) {
162 esl_buffer_SetOffset(afp->bf, anchor);
163 esl_buffer_RaiseAnchor(afp->bf, anchor);
164 }
165 *ret_type = eslUNKNOWN;
166 return status;
167 }
168
169
170 /* Function: esl_msafile_a2m_Read()
171 * Synopsis: Read a UCSC A2M format alignment.
172 *
173 * Purpose: Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
174 * for UCSC A2M (SAM) format. Create a new MSA,
175 * and return a ptr to it in <*ret_msa>. Caller is responsible
176 * for freeing this <ESL_MSA>.
177 *
178 * The <msa> has a reference line (<msa->rf[]>) that
179 * corresponds to the uppercase/lowercase columns in the
180 * alignment: consensus (uppercase) columns are marked 'X',
181 * and insert (lowercase) columns are marked '.' in the RF
182 * annotation line.
183 *
184 * This input parser can deal both with "dotless" A2M, and
185 * full A2M format with dots.
186 *
187 * Args: afp - open <ESL_MSAFILE>
188 * ret_msa - RETURN: newly parsed <ESL_MSA>
189 *
190 * Returns: <eslOK> on success. <*ret_msa> is set to the newly
191 * allocated MSA, and <afp> is at EOF.
192 *
193 * <eslEOF> if no (more) alignment data are found in
194 * <afp>, and <afp> is returned at EOF.
195 *
196 * <eslEFORMAT> on a parse error. <*ret_msa> is set to
197 * <NULL>. <afp> contains information sufficient for
198 * constructing useful diagnostic output:
199 * | <afp->errmsg> | user-directed error message |
200 * | <afp->linenumber> | line # where error was detected |
201 * | <afp->line> | offending line (not NUL-term) |
202 * | <afp->n> | length of offending line |
203 * | <afp->bf->filename> | name of the file |
204 * and <afp> is poised at the start of the following line,
205 * so (in principle) the caller could try to resume
206 * parsing.
207 *
208 * Throws: <eslEMEM> - an allocation failed.
209 * <eslESYS> - a system call such as fread() failed
210 * <eslEINCONCEIVABLE> - "impossible" corruption
211 * On these, <*ret_msa> is returned <NULL>, and the state of
212 * <afp> is undefined.
213 */
214 int
esl_msafile_a2m_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)215 esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
216 {
217 ESL_MSA *msa = NULL;
218 char **csflag = NULL; /* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus */
219 int *nins = NULL; /* # of inserted residues before each consensus col [0..ncons-1] */
220 int *this_nins = NULL; /* # of inserted residues before each consensus residue in this seq */
221 int nseq = 0;
222 int ncons = 0;
223 int idx;
224 int64_t thislen;
225 int64_t spos;
226 int this_ncons;
227 int cpos, bpos;
228 char *p, *tok;
229 esl_pos_t n, toklen;
230 int status;
231
232 ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );
233
234 afp->errmsg[0] = '\0';
235
236 if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
237 if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
238 ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
239 for (idx = 0; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
240
241 /* skip leading blank lines in file */
242 while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
243 if (status != eslOK) goto ERROR; /* includes normal EOF */
244
245 /* tolerate sloppy space at start of name/desc line */
246 while (n && isspace(*p)) { p++; n--; }
247 if (*p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");
248
249 do { /* for each record starting in '>': */
250 p++; n--; /* advance past > */
251
252 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");
253 if (nseq >= msa->sqalloc) {
254 int old_sqalloc = msa->sqalloc;
255 if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
256 ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
257 for (idx = old_sqalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
258 }
259
260 if ( (status = esl_msa_SetSeqName (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
261 if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p, n)) != eslOK) goto ERROR;
262
263 /* now for each sequence line... */
264 thislen = 0; /* count of lowercase, uppercase, and '-': w/o dots, on first pass */
265 this_ncons = 0; /* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
266 if (! this_nins) // Starting w/ first sequence, we need a wee initial alloc. Also nseq == 0, ncons == 0 ...
267 ESL_ALLOC(this_nins, sizeof(int) * 1);
268 for (cpos = 0; cpos <= ncons; cpos++) // A little tricksy. On the first sequence, ncons == 0, so this initializes just [0].
269 this_nins[cpos] = 0;
270
271 while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
272 {
273 while (n && isspace(*p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
274 if (n == 0) continue; /* tolerate and skip blank lines */
275 if (*p == '>') break;
276
277 ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
278 if (nseq == 0) {
279 ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
280 for (cpos = this_ncons+1; cpos <= this_ncons+n; cpos++) // DO NOT zero [this_ncons]; we may still be in an insert from prev line.
281 this_nins[cpos] = 0;
282 }
283
284 for (spos = thislen, bpos = 0; bpos < n; bpos++)
285 {
286 if (p[bpos] == 'O') continue;
287 else if (isupper(p[bpos])) { csflag[nseq][spos++] = TRUE; this_ncons++; }
288 else if (islower(p[bpos])) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
289 else if (p[bpos] == '-') { csflag[nseq][spos++] = TRUE; this_ncons++; }
290 if (ncons && this_ncons > ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
291 }
292 csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */
293
294 if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]), &thislen, p, n); }
295 if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
296 if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
297 else if (status != eslOK) goto ERROR;
298 ESL_DASSERT1( (spos == thislen) );
299 }
300 if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
301 /* status == OK: then *p == '>'. status == eslEOF: we're eof. status == anything else: error */
302 /* Finished reading a sequence record. */
303
304 /* Edge case: it's possible to have a sequence of length 0 residues in the alignment,
305 * and that causes the loop structure above to exit before allocating csflag[] or ax[]
306 */
307 if (thislen == 0)
308 {
309 ESL_ALLOC(csflag[nseq], sizeof(char) * 1);
310 csflag[nseq][0] = TRUE; // csflag[] needs a sentinel even if there's no seq; make one.
311 ESL_ALLOC(msa->ax[nseq], sizeof(ESL_DSQ) * 2);
312 msa->ax[nseq][0] = msa->ax[nseq][1] = eslDSQ_SENTINEL;
313 }
314
315 if (nseq == 0)
316 {
317 ncons = this_ncons;
318 ESL_ALLOC(nins, sizeof(int) * (ncons+1));
319 for (cpos = 0; cpos <= ncons; cpos++)
320 nins[cpos] = this_nins[cpos];
321 }
322 else
323 {
324 if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s). (Do you have an O residue?)");
325 for (cpos = 0; cpos <= ncons; cpos++)
326 nins[cpos] = ESL_MAX(nins[cpos], this_nins[cpos]);
327 }
328 nseq++;
329 } while (status == eslOK);
330
331 /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
332 * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
333 * nins[0..ncons] tells us the max number of inserted residues before each consensus column
334 * This is sufficient information to reconstruct each aligned sequence.
335 */
336 msa->nseq = nseq;
337 if (msa->abc) { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
338 if (!msa->abc) { if ((status = a2m_padding_text (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
339
340 if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
341
342 *ret_msa = msa;
343 free(nins);
344 free(this_nins);
345 for (idx = 0; idx < msa->nseq; idx++) free(csflag[idx]);
346 free(csflag);
347 return eslOK;
348
349 ERROR:
350 if (nins) free(nins);
351 if (this_nins) free(this_nins);
352 if (csflag) {
353 for (idx = 0; idx < msa->nseq; idx++)
354 if (csflag[idx]) free(csflag[idx]);
355 free(csflag);
356 }
357 if (msa) esl_msa_Destroy(msa);
358 return status;
359 }
360
361
362 /* Function: esl_msafile_a2m_Write()
363 * Synopsis: Write an A2M (UCSC SAM) dotless format alignment to a stream.
364 *
365 * Purpose: Write alignment <msa> in dotless UCSC A2M format to a
366 * stream <fp>.
367 *
368 * The <msa> should have a valid reference line <msa->rf>,
369 * with alphanumeric characters marking consensus (match)
370 * columns, and non-alphanumeric characters marking
371 * nonconsensus (insert) columns. If it does not,
372 * then as a fallback, the first sequence in the alignment is
373 * considered to be the consensus.
374 *
375 * In "dotless" A2M format, gap characters (.) in insert
376 * columns are omitted; therefore sequences can be of
377 * different lengths, but each sequence has the same number
378 * of consensus columns (residue or -).
379 *
380 * A2M format cannot represent missing data symbols
381 * (Easel's ~). Any missing data symbols are converted to
382 * gaps.
383 *
384 * A2M format cannot represent pyrrolysine residues in
385 * amino acid sequences, because it treats 'O' symbols
386 * specially, as indicating a position at which a
387 * free-insertion module (FIM) should be created. Any 'O'
388 * in the <msa> is written instead as an unknown
389 * residue ('X', in protein sequences).
390 *
391 * Args: fp - open output stream
392 * msa - MSA to write
393 *
394 * Returns: <eslOK> on success.
395 *
396 * Throws: <eslEMEM> on allocation failure.
397 * <eslEWRITE> on any system write error, such as filled disk.
398 */
399 int
esl_msafile_a2m_Write(FILE * fp,const ESL_MSA * msa)400 esl_msafile_a2m_Write(FILE *fp, const ESL_MSA *msa)
401 {
402 char *buf = NULL;
403 int cpl = 60;
404 int bpos;
405 int pos;
406 int is_consensus;
407 int is_residue;
408 int do_dotless = TRUE; /* just changing this to FALSE makes it write dots too */
409 int i;
410 int sym;
411 int status;
412
413 ESL_ALLOC(buf, sizeof(char) * (cpl+1));
414
415 for (i = 0; i < msa->nseq; i++)
416 {
417 /* Construct the name/description line */
418 if (fprintf(fp, ">%s", msa->sqname[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed");
419 if (msa->sqacc != NULL && msa->sqacc[i] != NULL) { if (fprintf(fp, " %s", msa->sqacc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); }
420 if (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) { if (fprintf(fp, " %s", msa->sqdesc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); }
421 if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed");
422
423 if (msa->abc)
424 {
425 pos = 0;
426 while (pos < msa->alen)
427 {
428 for (bpos = 0; pos < msa->alen && bpos < cpl; pos++)
429 {
430 sym = msa->abc->sym[msa->ax[i][pos+1]]; /* note off-by-one in digitized aseq: 1..alen */
431 is_residue = esl_abc_XIsResidue(msa->abc, msa->ax[i][pos+1]);
432 if (msa->rf) is_consensus = (isalnum(msa->rf[pos]) ? TRUE : FALSE);
433 else is_consensus = (esl_abc_XIsResidue(msa->abc, msa->ax[0][pos+1]) ? TRUE : FALSE);
434
435 if (sym == 'O') sym = esl_abc_CGetUnknown(msa->abc); /* watch out: O means "insert a FIM" in a2m format, not pyrrolysine */
436
437 if (is_consensus) { buf[bpos++] = (is_residue ? toupper(sym) : '-'); }
438 else if (is_residue) { buf[bpos++] = tolower(sym); }
439 else if (! do_dotless) { buf[bpos++] = '.'; }
440 }
441 buf[bpos] = '\0';
442 if (bpos) { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed");}
443 }
444 }
445
446 if (! msa->abc)
447 {
448 pos = 0;
449 while (pos < msa->alen)
450 {
451 for (bpos = 0; pos < msa->alen && bpos < cpl; pos++)
452 {
453 sym = msa->aseq[i][pos];
454 is_residue = isalpha(msa->aseq[i][pos]);
455 if (msa->rf) is_consensus = (isalnum(msa->rf[pos]) ? TRUE : FALSE);
456 else is_consensus = (isalnum(msa->aseq[0][pos]) ? TRUE : FALSE);
457
458 if (sym == 'O') sym = 'X';
459
460 if (is_consensus) { buf[bpos++] = ( is_residue ? toupper(sym) : '-'); }
461 else if (is_residue) { buf[bpos++] = tolower(sym); }
462 else if (! do_dotless) { buf[bpos++] = '.'; }
463
464 }
465 buf[bpos] = '\0';
466 if (bpos) { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); }
467 }
468 }
469 } /* end, loop over sequences in the MSA */
470
471 free(buf);
472 return eslOK;
473
474 ERROR:
475 if (buf) free(buf);
476 return status;
477 }
478 /*------------- end, API for i/o of a2m format ------------------*/
479
480
481 /*****************************************************************
482 * 2. Internal functions used by the A2M parser
483 *****************************************************************/
484
485 /* A2M parser has an input phase, followed by an alignment padding phase.
486 * The a2m_padding_{digital,text} functions do the padding phase.
487 *
488 * Upon call:
489 * msa->nseq is set;
490 * msa->ax[0..nseq-1][1..slen] are unaligned seqs (consensus cols +
491 * inserted residues); or msa->aseq[0..nseq-1][0..slen-1], for text mode
492 * csflag[0..nseq-1][0..slen-1] is TRUE/FALSE for whether each pos
493 * in msa->ax[][1..slen]/msa->aseq[][0..slen-1] is consensus or insert
494 * nins[0..ncons] is the number of insert columns preceding each consensus column
495 *
496 * watch out, ax[] is a digital sequence, 1..alen not 0..alen-1: hence
497 * the [apos+1], [spos+1] indexing
498 *
499 * these functions may not fail with any normal (eslEFORMAT) error, because
500 * we wouldn't be able to tell the line/linenumber of the error
501 *
502 * Upon successful return:
503 * msa->alen is set
504 * all msa->ax[]/msa->aseq are now aligned digital sequences
505 * msa->rf is set
506 */
507 static int
a2m_padding_digital(ESL_MSA * msa,char ** csflag,int * nins,int ncons)508 a2m_padding_digital(ESL_MSA *msa, char **csflag, int *nins, int ncons)
509 {
510 ESL_DSQ *ax = NULL; /* new aligned sequence - will be swapped into msa->ax[] */
511 ESL_DSQ gapsym = esl_abc_XGetGap(msa->abc);
512 int apos, cpos, spos; /* position counters for alignment 0..alen, consensus cols 0..cpos-1, sequence position 0..slen-1 */
513 int alen;
514 int icount;
515 int idx;
516 int status;
517
518 alen = ncons;
519 for (cpos = 0; cpos <= ncons; cpos++)
520 alen += nins[cpos];
521
522 ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
523 for (apos = 0, cpos = 0; cpos <= ncons; cpos++)
524 {
525 for (icount = 0; icount < nins[cpos]; icount++) msa->rf[apos++] = '.';
526 if (cpos < ncons) msa->rf[apos++] = 'x';
527 }
528 msa->rf[apos] = '\0';
529
530 for (idx = 0; idx < msa->nseq; idx++)
531 {
532 ESL_ALLOC(ax, sizeof(ESL_DSQ) * (alen + 2));
533 ax[0] = eslDSQ_SENTINEL;
534 apos = spos = 0;
535 for (cpos = 0; cpos <= ncons; cpos++)
536 {
537 icount = 0;
538 while (csflag[idx][spos] == FALSE) { ax[apos+1] = msa->ax[idx][spos+1]; apos++; spos++; icount++; }
539 while (icount < nins[cpos]) { ax[apos+1] = gapsym; apos++; icount++; }
540 if (cpos < ncons) { ax[apos+1] = msa->ax[idx][spos+1]; apos++; spos++; }
541 }
542 ESL_DASSERT1( (msa->ax[idx][spos+1] == eslDSQ_SENTINEL) );
543 ESL_DASSERT1( (apos == alen) );
544 ax[alen+1] = eslDSQ_SENTINEL;
545 free(msa->ax[idx]);
546 msa->ax[idx] = ax;
547 ax = NULL;
548 }
549 msa->alen = alen;
550
551
552
553 return eslOK;
554
555 ERROR:
556 if (ax) free(ax);
557 return status;
558 }
559
560
561 static int
a2m_padding_text(ESL_MSA * msa,char ** csflag,int * nins,int ncons)562 a2m_padding_text(ESL_MSA *msa, char **csflag, int *nins, int ncons)
563 {
564 char *aseq = NULL; /* new aligned sequence - will be swapped into msa->aseq[] */
565 int apos, cpos, spos; /* position counters for alignment 0..alen, consensus cols 0..cpos-1, sequence position 0..slen-1 */
566 int alen;
567 int icount;
568 int idx;
569 int status;
570
571 alen = ncons;
572 for (cpos = 0; cpos <= ncons; cpos++)
573 alen += nins[cpos];
574
575 ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
576 for (apos = 0, cpos = 0; cpos <= ncons; cpos++)
577 {
578 for (icount = 0; icount < nins[cpos]; icount++) msa->rf[apos++] = '.';
579 if (cpos < ncons) msa->rf[apos++] = 'x';
580 }
581 msa->rf[apos] = '\0';
582
583 for (idx = 0; idx < msa->nseq; idx++)
584 {
585 ESL_ALLOC(aseq, sizeof(char) * (alen + 1));
586 apos = spos = 0;
587 for (cpos = 0; cpos <= ncons; cpos++)
588 {
589 icount = 0;
590 while (csflag[idx][spos] == FALSE) { aseq[apos] = msa->aseq[idx][spos]; apos++; spos++; icount++; }
591 while (icount < nins[cpos]) { aseq[apos] = '.'; apos++; icount++; }
592 if (cpos < ncons) { aseq[apos] = msa->aseq[idx][spos]; apos++; spos++; }
593 }
594 ESL_DASSERT1( (msa->aseq[idx][spos] == '\0') );
595 ESL_DASSERT1( (apos == alen) );
596 aseq[alen] = '\0';
597 free(msa->aseq[idx]);
598 msa->aseq[idx] = aseq;
599 aseq = NULL;
600 }
601 msa->alen = alen;
602 return eslOK;
603
604 ERROR:
605 if (aseq) free(aseq);
606 return status;
607 }
608 /*---------- end, internal functions for the parser -------------*/
609
610
611 /*****************************************************************
612 * 3. Unit tests.
613 *****************************************************************/
614 #ifdef eslMSAFILE_A2M_TESTDRIVE
615
616 #include "esl_random.h"
617
618 /* A2M is unable to write O (pyrrolysine) residues, because it uses O
619 * to mean a free insertion module. For unit tests that generate
620 * dirty/sampled MSAs, we have to avoid O's in those alignments.
621 */
622 static void
a2m_no_O(ESL_MSA * msa)623 a2m_no_O(ESL_MSA *msa)
624 {
625 int idx;
626 int apos;
627 int symO;
628
629 ESL_DASSERT1(( msa->flags & eslMSA_DIGITAL ));
630 ESL_DASSERT1(( msa->abc != NULL ));
631
632 symO = esl_abc_DigitizeSymbol(msa->abc, 'O');
633
634 for (idx = 0; idx < msa->nseq; idx++)
635 for (apos = 1; apos <= msa->alen; apos++)
636 if (msa->ax[idx][apos] == symO) msa->ax[idx][apos] = esl_abc_XGetUnknown(msa->abc);
637 }
638
639 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)640 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
641 {
642 fputs(">MYG_PHYCA\n", ofp);
643 fputs("VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTEAEMKASED\n", ofp);
644 fputs("LKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHP\n", ofp);
645 fputs("GDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
646 fputs(">GLB5_PETMA\n", ofp);
647 fputs("pivdtgsvApLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
648 fputs("ADQLKKSADVRWHAERIINAVNDAVASMDDtekMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
649 fputs("AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
650 fputs(">HBB_HUMAN\n", ofp);
651 fputs("VhLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNP\n", ofp);
652 fputs("KVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHF\n", ofp);
653 fputs("GKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
654 fputs(">HBA_HUMAN\n", ofp);
655 fputs("VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF------DLSHGSAQ\n", ofp);
656 fputs("VKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLP\n", ofp);
657 fputs("AEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
658
659 *ret_alphatype = eslAMINO;
660 *ret_nseq = 4;
661 *ret_alen = 165;
662 }
663
664 static void
utest_write_good2(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)665 utest_write_good2(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
666 {
667 fputs(">tRNA2\n", ofp);
668 fputs("UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGGAGACCGGGGUUCGACUC\n", ofp);
669 fputs("CCCGUAUCGGAG\n", ofp);
670 fputs(">tRNA3\n", ofp);
671 fputs("UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCcAGAUCGGGGUUCAAUU\n", ofp);
672 fputs("CCCCGUCGCGGAG\n", ofp);
673 fputs(">tRNA5\n", ofp);
674 fputs("GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGaGGUCAUCGGUUCGAUU\n", ofp);
675 fputs("CCGGUUGCGUCCA\n", ofp);
676 fputs(">tRNA1\n", ofp);
677 fputs("GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGaGGUCCUGUGUUCGAUC\n", ofp);
678 fputs("CACAGAAUUCGCA\n", ofp);
679 fputs(">tRNA4\n", ofp);
680 fputs("GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUuGGUCCUUAGUUCGAUC\n", ofp);
681 fputs("CUGAGUGCGAGCU\n", ofp);
682
683 *ret_alphatype = eslRNA;
684 *ret_nseq = 5;
685 *ret_alen = 73;
686 }
687
688 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)689 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
690 {
691 ESL_ALPHABET *abc = NULL;
692 ESL_MSAFILE *afp = NULL;
693 ESL_MSA *msa1 = NULL;
694 ESL_MSA *msa2 = NULL;
695 char tmpfile1[32] = "esltmpXXXXXX";
696 char tmpfile2[32] = "esltmpXXXXXX";
697 FILE *ofp = NULL;
698 int status;
699
700 /* A2M must be specified (no format guessing, unless we use .a2m suffix) but guessing the alphabet should work: this is a digital open */
701 if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_A2M, NULL, &afp)) != eslOK) esl_fatal("a2m good file test %d failed: digital open", testnumber);
702 if (afp->format != eslMSAFILE_A2M) esl_fatal("a2m good file test %d failed: bad format", testnumber);
703 if (abc->type != expected_alphatype) esl_fatal("a2m good file test %d failed: alphabet autodetection", testnumber);
704
705 /* This is a digital read, using <abc>. */
706 if ( (status = esl_msafile_a2m_Read(afp, &msa1)) != eslOK) esl_fatal("a2m good file test %d failed: msa read, digital", testnumber);
707 if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("a2m good file test %d failed: nseq/alen", testnumber);
708 if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("a2m good file test %d failed: msa invalid", testnumber);
709 esl_msafile_Close(afp);
710
711 /* write it back out to a new tmpfile (digital write) */
712 if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("a2m good file test %d failed: tmpfile creation", testnumber);
713 if ( (status = esl_msafile_a2m_Write(ofp, msa1)) != eslOK) esl_fatal("a2m good file test %d failed: msa write, digital", testnumber);
714 fclose(ofp);
715
716 /* now open and read it as text mode, in known format. */
717 if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_A2M, NULL, &afp)) != eslOK) esl_fatal("a2m good file test %d failed: text mode open", testnumber);
718 if ( (status = esl_msafile_a2m_Read(afp, &msa2)) != eslOK) esl_fatal("a2m good file test %d failed: msa read, text", testnumber);
719 if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("a2m good file test %d failed: nseq/alen", testnumber);
720 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("a2m good file test %d failed: msa invalid", testnumber);
721 esl_msafile_Close(afp);
722
723 /* write it back out to a new tmpfile (text write) */
724 if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("a2m good file test %d failed: tmpfile creation", testnumber);
725 if ( (status = esl_msafile_a2m_Write(ofp, msa2)) != eslOK) esl_fatal("a2m good file test %d failed: msa write, text", testnumber);
726 fclose(ofp);
727 esl_msa_Destroy(msa2);
728
729 /* open and read it in digital mode */
730 if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_A2M, NULL, &afp)) != eslOK) esl_fatal("a2m good file test %d failed: 2nd digital mode open", testnumber);
731 if ( (status = esl_msafile_a2m_Read(afp, &msa2)) != eslOK) esl_fatal("a2m good file test %d failed: 2nd digital msa read", testnumber);
732 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("a2m good file test %d failed: msa invalid", testnumber);
733 esl_msafile_Close(afp);
734
735 /* this msa <msa2> should be identical to <msa1> */
736 if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("a2m good file test %d failed: msa compare", testnumber);
737
738 remove(tmpfile1);
739 remove(tmpfile2);
740 esl_msa_Destroy(msa1);
741 esl_msa_Destroy(msa2);
742 esl_alphabet_Destroy(abc);
743 }
744
745
746 static void
write_test_msas(FILE * ofp1,FILE * ofp2)747 write_test_msas(FILE *ofp1, FILE *ofp2)
748 {
749 fprintf(ofp1, ">seq1 description line for seq1\n");
750 fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY\n");
751 fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY\n");
752 fprintf(ofp1, ">seq2 description line for seq2\n");
753 fprintf(ofp1, "ACDEFGHIKLMNPQRSTV--\n");
754 fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY\n");
755 fprintf(ofp1, "yy\n");
756 fprintf(ofp1, ">seq3\n");
757 fprintf(ofp1, "aaACDEFGHIKLMNPQRSTV\n");
758 fprintf(ofp1, "--ACDEFGHIKLMNPQRSTVWY\n");
759 fprintf(ofp1, ">seq4 \n");
760 fprintf(ofp1, "ACDEFGHIKLMNPQR\n");
761 fprintf(ofp1, "STVWYACDEFGHIKL\n");
762 fprintf(ofp1, "MNPQRSTVWY\n");
763
764 fprintf(ofp2, "# STOCKHOLM 1.0\n");
765 fprintf(ofp2, "\n");
766 fprintf(ofp2, "#=GS seq1 DE description line for seq1\n");
767 fprintf(ofp2, "#=GS seq2 DE description line for seq2\n");
768 fprintf(ofp2, "\n");
769 fprintf(ofp2, "#=GC RF ..xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..\n");
770 fprintf(ofp2, "seq1 ..ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY..\n");
771 fprintf(ofp2, "seq2 ..ACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWYyy\n");
772 fprintf(ofp2, "seq3 aaACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWY..\n");
773 fprintf(ofp2, "seq4 ..ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY..\n");
774 fprintf(ofp2, "//\n");
775 }
776
777 static void
read_test_msas_digital(char * a2mfile,char * stkfile)778 read_test_msas_digital(char *a2mfile, char *stkfile)
779 {
780 char msg[] = "A2M msa digital read unit test failed";
781 ESL_ALPHABET *abc = NULL;
782 ESL_MSAFILE *afp1 = NULL;
783 ESL_MSAFILE *afp2 = NULL;
784 ESL_MSA *msa1, *msa2, *msa3, *msa4;
785 FILE *a2mfp, *stkfp;
786 char a2mfile2[32] = "esltmpa2m2XXXXXX";
787 char stkfile2[32] = "esltmpstk2XXXXXX";
788
789 if ( esl_msafile_Open(&abc, a2mfile, NULL, eslMSAFILE_A2M, NULL, &afp1) != eslOK) esl_fatal(msg);
790 if ( !abc || abc->type != eslAMINO) esl_fatal(msg);
791 if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
792 if ( esl_msafile_a2m_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
793 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
794 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
795
796 if ( esl_msafile_a2m_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
797 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
798
799 esl_msafile_Close(afp2);
800 esl_msafile_Close(afp1);
801
802 /* Now write stk to a2m file, and vice versa; then retest */
803 if ( esl_tmpfile_named(a2mfile2, &a2mfp) != eslOK) esl_fatal(msg);
804 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
805 if ( esl_msafile_a2m_Write (a2mfp, msa2) != eslOK) esl_fatal(msg);
806 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_PFAM) != eslOK) esl_fatal(msg);
807 fclose(a2mfp);
808 fclose(stkfp);
809 if ( esl_msafile_Open(&abc, a2mfile2, NULL, eslMSAFILE_A2M, NULL, &afp1) != eslOK) esl_fatal(msg);
810 if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
811 if ( esl_msafile_a2m_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
812 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
813 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
814
815 remove(a2mfile2);
816 remove(stkfile2);
817 esl_msafile_Close(afp2);
818 esl_msafile_Close(afp1);
819
820 esl_msa_Destroy(msa1);
821 esl_msa_Destroy(msa2);
822 esl_msa_Destroy(msa3);
823 esl_msa_Destroy(msa4);
824 esl_alphabet_Destroy(abc);
825 }
826
827 static void
read_test_msas_text(char * a2mfile,char * stkfile)828 read_test_msas_text(char *a2mfile, char *stkfile)
829 {
830 char msg[] = "A2M msa text-mode read unit test failed";
831 ESL_MSAFILE *afp1 = NULL;
832 ESL_MSAFILE *afp2 = NULL;
833 ESL_MSA *msa1, *msa2, *msa3, *msa4;
834 FILE *a2mfp, *stkfp;
835 char a2mfile2[32] = "esltmpa2m2XXXXXX";
836 char stkfile2[32] = "esltmpstk2XXXXXX";
837
838 /* vvvv-- everything's the same as the digital utest except these NULLs */
839 if ( esl_msafile_Open(NULL, a2mfile, NULL, eslMSAFILE_A2M, NULL, &afp1) != eslOK) esl_fatal(msg);
840 if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
841 if ( esl_msafile_a2m_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
842 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
843 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
844 if ( esl_msafile_a2m_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
845 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
846 esl_msafile_Close(afp2);
847 esl_msafile_Close(afp1);
848
849 if ( esl_tmpfile_named(a2mfile2, &a2mfp) != eslOK) esl_fatal(msg);
850 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
851 if ( esl_msafile_a2m_Write (a2mfp, msa2) != eslOK) esl_fatal(msg);
852 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_PFAM) != eslOK) esl_fatal(msg);
853 fclose(a2mfp);
854 fclose(stkfp);
855 if ( esl_msafile_Open(NULL, a2mfile2, NULL, eslMSAFILE_A2M, NULL, &afp1) != eslOK) esl_fatal(msg);
856 if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
857 if ( esl_msafile_a2m_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
858 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
859 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
860
861 remove(a2mfile2);
862 remove(stkfile2);
863 esl_msafile_Close(afp2);
864 esl_msafile_Close(afp1);
865
866 esl_msa_Destroy(msa1);
867 esl_msa_Destroy(msa2);
868 esl_msa_Destroy(msa3);
869 esl_msa_Destroy(msa4);
870 }
871
872 static void
utest_gibberish(ESL_RANDOMNESS * rng)873 utest_gibberish(ESL_RANDOMNESS *rng)
874 {
875 char msg[] = "esl_msafile_a2m gibberish test failed";
876 char a2mfile[32] = "esltmpXXXXXX";
877 FILE *fp = NULL;
878 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
879 ESL_MSA *msa = NULL;
880 ESL_MSAFILE *msafp = NULL;
881 ESL_MSA *msa2 = NULL;
882
883 if ( esl_msa_Sample(rng, abc, 100, 100, &msa) != eslOK) esl_fatal(msg);
884 if ( esl_msa_Validate(msa, NULL) != eslOK) esl_fatal(msg);
885 if ( esl_msa_FlushLeftInserts(msa) != eslOK) esl_fatal(msg); // Reading A2M back in will flush inserts left.
886 if ( esl_msa_MinimGaps(msa, NULL, NULL, TRUE) != eslOK) esl_fatal(msg); // Reading A2M back in will minimize gaps.
887 a2m_no_O(msa); // A2M doesn't allow O residues.
888 ESL_DASSERT1(( !(msa->flags & eslMSA_HASWGTS) )); // A2M can't store weights.
889 ESL_DASSERT1(( msa->rf != NULL )); // A2M always implies consensus annotation.
890
891 if ( esl_tmpfile_named(a2mfile, &fp) != eslOK) esl_fatal(msg);
892 if ( esl_msafile_a2m_Write(fp, msa) != eslOK) esl_fatal(msg);
893 fclose(fp);
894
895 if ( esl_msafile_Open(&abc, a2mfile, NULL, eslMSAFILE_A2M, NULL, &msafp) != eslOK) esl_fatal(msg);
896 if ( esl_msafile_a2m_Read(msafp, &msa2) != eslOK) esl_fatal(msg);
897 if ( esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal(msg);
898 if ( esl_msa_Compare(msa, msa2) != eslOK) esl_fatal(msg);
899 esl_msafile_Close(msafp);
900
901 remove(a2mfile);
902 esl_msa_Destroy(msa);
903 esl_msa_Destroy(msa2);
904 esl_alphabet_Destroy(abc);
905 }
906 #endif /*eslMSAFILE_A2M_TESTDRIVE*/
907 /*---------------------- end, unit tests ------------------------*/
908
909
910
911 /*****************************************************************
912 * 4. Test driver.
913 *****************************************************************/
914 #ifdef eslMSAFILE_A2M_TESTDRIVE
915 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_a2m_utest -DeslMSAFILE_A2M_TESTDRIVE esl_msafile_a2m.c -leasel -lm
916 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_a2m_utest -DeslMSAFILE_A2M_TESTDRIVE esl_msafile_a2m.c -leasel -lm
917 * run: ./esl_msafile_a2m_utest
918 */
919 #include "esl_config.h"
920
921 #include <stdio.h>
922
923 #include "easel.h"
924 #include "esl_getopts.h"
925 #include "esl_random.h"
926 #include "esl_msafile.h"
927 #include "esl_msafile_a2m.h"
928
929 static ESL_OPTIONS options[] = {
930 /* name type default env range togs reqs incomp help docgrp */
931 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
932 {"-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0},
933 { 0,0,0,0,0,0,0,0,0,0},
934 };
935 static char usage[] = "[-options]";
936 static char banner[] = "test driver for A2M MSA format module";
937
938 int
main(int argc,char ** argv)939 main(int argc, char **argv)
940 {
941 char msg[] = "a2m MSA i/o module test driver failed";
942 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
943 ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
944 char a2mfile[32] = "esltmpa2mXXXXXX";
945 char stkfile[32] = "esltmpstkXXXXXX";
946 FILE *a2mfp, *stkfp;
947 int testnumber;
948 int ngoodtests = 2;
949 char tmpfile[32];
950 FILE *ofp;
951 int expected_alphatype;
952 int expected_nseq;
953 int expected_alen;
954
955 fprintf(stderr, "## %s\n", argv[0]);
956 fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
957
958 if ( esl_tmpfile_named(a2mfile, &a2mfp) != eslOK) esl_fatal(msg);
959 if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
960 write_test_msas(a2mfp, stkfp);
961 fclose(a2mfp);
962 fclose(stkfp);
963
964 read_test_msas_digital(a2mfile, stkfile);
965 read_test_msas_text (a2mfile, stkfile);
966
967 /* Various "good" files that should be parsed correctly */
968 for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
969 {
970 strcpy(tmpfile, "esltmpXXXXXX");
971 if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
972 switch (testnumber) {
973 case 1: utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
974 case 2: utest_write_good2 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
975 }
976 fclose(ofp);
977 utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
978 remove(tmpfile);
979 }
980
981 utest_gibberish(rng);
982
983 fprintf(stderr, "# status = ok\n");
984
985 remove(a2mfile);
986 remove(stkfile);
987 esl_randomness_Destroy(rng);
988 esl_getopts_Destroy(go);
989 return 0;
990 }
991 #endif /*eslMSAFILE_A2M_TESTDRIVE*/
992 /*--------------------- end, test driver ------------------------*/
993
994
995
996 /*****************************************************************
997 * 5. Examples.
998 *****************************************************************/
999
1000 #ifdef eslMSAFILE_A2M_EXAMPLE
1001 /* A full-featured example of reading/writing an MSA in A2M format.
1002 gcc -g -Wall -o esl_msafile_a2m_example -I. -L. -DeslMSAFILE_A2M_EXAMPLE esl_msafile_a2m.c -leasel -lm
1003 ./esl_msafile_a2m_example <msafile>
1004 */
1005 /*::cexcerpt::msafile_a2m_example::begin::*/
1006 #include <stdio.h>
1007
1008 #include "easel.h"
1009 #include "esl_alphabet.h"
1010 #include "esl_getopts.h"
1011 #include "esl_msa.h"
1012 #include "esl_msafile.h"
1013 #include "esl_msafile_a2m.h"
1014
1015 static ESL_OPTIONS options[] = {
1016 /* name type default env range toggles reqs incomp help docgroup*/
1017 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1018 { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "override autodetection; force A2M format", 0 },
1019 { "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
1020 { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
1021 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
1022 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
1023 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
1024 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1025 };
1026 static char usage[] = "[-options] <msafile>";
1027 static char banner[] = "example of guessing, reading, writing A2M format";
1028
1029 int
main(int argc,char ** argv)1030 main(int argc, char **argv)
1031 {
1032 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1033 char *filename = esl_opt_GetArg(go, 1);
1034 int infmt = eslMSAFILE_UNKNOWN;
1035 ESL_ALPHABET *abc = NULL;
1036 ESL_MSAFILE *afp = NULL;
1037 ESL_MSA *msa = NULL;
1038 int status;
1039
1040 if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_A2M; /* override format autodetection */
1041
1042 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
1043 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
1044 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1045
1046 /* Text mode: pass NULL for alphabet.
1047 * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
1048 */
1049 if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
1050 else status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
1051 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
1052
1053 if ((status = esl_msafile_a2m_Read(afp, &msa)) != eslOK)
1054 esl_msafile_ReadFailure(afp, status);
1055
1056 printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
1057 printf("# of seqs: %d\n", msa->nseq);
1058 printf("# of cols: %d\n", (int) msa->alen);
1059 printf("\n");
1060
1061 if (! esl_opt_GetBoolean(go, "-q"))
1062 esl_msafile_a2m_Write(stdout, msa);
1063
1064 esl_msa_Destroy(msa);
1065 esl_msafile_Close(afp);
1066 if (abc) esl_alphabet_Destroy(abc);
1067 esl_getopts_Destroy(go);
1068 exit(0);
1069 }
1070 /*::cexcerpt::msafile_a2m_example::end::*/
1071 #endif /*eslMSAFILE_A2M_EXAMPLE*/
1072
1073
1074 #ifdef eslMSAFILE_A2M_EXAMPLE2
1075 /* A minimal example. Read A2M MSA, in text mode
1076 gcc -g -Wall -o esl_msafile_a2m_example2 -I. -L. -DeslMSAFILE_A2M_EXAMPLE2 esl_msafile_a2m.c -leasel -lm
1077 ./esl_msafile_a2m_example <msafile>
1078 */
1079 /*::cexcerpt::msafile_a2m_example2::begin::*/
1080 #include <stdio.h>
1081
1082 #include "easel.h"
1083 #include "esl_msa.h"
1084 #include "esl_msafile.h"
1085 #include "esl_msafile_a2m.h"
1086
1087 int
main(int argc,char ** argv)1088 main(int argc, char **argv)
1089 {
1090 char *filename = argv[1];
1091 int fmt = eslMSAFILE_A2M;
1092 ESL_MSAFILE *afp = NULL;
1093 ESL_MSA *msa = NULL;
1094 int status;
1095
1096 if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
1097 if ( (status = esl_msafile_a2m_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status);
1098
1099 printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
1100
1101 esl_msafile_a2m_Write(stdout, msa);
1102 esl_msa_Destroy(msa);
1103 esl_msafile_Close(afp);
1104 exit(0);
1105 }
1106 /*::cexcerpt::msafile_a2m_example2::end::*/
1107 #endif /*eslMSAFILE_A2M_EXAMPLE2*/
1108 /*--------------------- end of examples -------------------------*/
1109
1110