1 /* I/O of multiple sequence alignment files in SELEX format
2 *
3 * Contents:
4 * 1. API for reading/writing SELEX input.
5 * 2. Internal functions for a block of input lines.
6 * 3. Internal functions for parsing SELEX input.
7 * 4. Unit tests.
8 * 5. Test driver.
9 * 6. Examples.
10 *
11 * Notes:
12 * In SELEX, a tricky and unusual issue is that spaces are allowed
13 * as gaps, and can even overlap names. Alignments like this are
14 * legitimate:
15 * seq1_longname ACCCGGT
16 * seq2 AAAAACCCGGTT
17 *
18 * You can't determine the aligned length of any sequence in the
19 * block without seeing the whole block. We define an internal
20 * object (an ESL_SELEX_BLOCK) and some local functions to handle
21 * reading a block of input lines from an input buffer.
22 *
23 * Even though spaces are allowed as gaps in input files, Easel
24 * disallows them internally, even in text-mode alignments. Any
25 * spaces are mapped to '.'.
26 */
27 #include "esl_config.h"
28
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33
34 #include "easel.h"
35 #include "esl_alphabet.h"
36 #include "esl_mem.h"
37 #include "esl_msa.h"
38 #include "esl_msafile.h"
39
40 #include "esl_msafile_selex.h"
41
42 #define eslSELEX_LINE_SQ 1
43 #define eslSELEX_LINE_RF 2
44 #define eslSELEX_LINE_CS 3
45 #define eslSELEX_LINE_SS 4
46 #define eslSELEX_LINE_SA 5
47 #define eslSELEX_LINE_MM 6
48
49 typedef struct {
50 char **line; /* line[0..nlines-1][0..llen-1]: memory lines in input buffer */
51 esl_pos_t *llen; /* length of line[] in bytes */
52 esl_pos_t *offsets; /* offset of start of each line in input buffer */
53 int64_t *linenum; /* line number of each line[] in input */
54 int *ltype; /* code for line type: eslSELEX_LINE_SQ, etc. */
55 esl_pos_t *lpos; /* leftmost position of seq data on line[], 0..llen-1 [or -1] */
56 esl_pos_t *rpos; /* rightmost pos of seq data on line[], 0..llen-1 [or -1] */
57 int nlines; /* number of lines in this block */
58 int nalloc; /* number of lines allocated for (>=nlines) */
59 esl_pos_t anchor; /* input buffer anchor set at the start of the block */
60 } ESL_SELEX_BLOCK;
61
62 static ESL_SELEX_BLOCK *selex_block_Create(int nalloc);
63 static int selex_block_Grow(ESL_SELEX_BLOCK *b);
64 static void selex_block_Destroy(ESL_SELEX_BLOCK *b);
65
66 static int selex_ErrorInBlock(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, int idx);
67 static int selex_read_block (ESL_MSAFILE *afp, ESL_SELEX_BLOCK **block_p);
68 static int selex_first_block (ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA **ret_msa);
69 static int selex_other_block (ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa);
70 static int selex_append_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa);
71
72
73 /*****************************************************************
74 * 1. API for reading/writing SELEX input
75 *****************************************************************/
76
77 /* Function: esl_msafile_selex_SetInmap()
78 * Synopsis: Set the input map for SELEX format
79 *
80 * Purpose: Set <afp->inmap> for selex input.
81 *
82 * In text mode, accept any <isgraph()> character, plus space.
83 * In digital mode, accept standard Easel alphabets, plus map
84 * space to gap.
85 *
86 * SELEX not only tolerates spaces in input, it
87 * allows a space as a gap character. (Which significantly
88 * complicates parsing.)
89 *
90 * The inmap may not contain any <eslDSQ_IGNORED> mappings.
91 * Annotation lines are parsed literally: every character
92 * is copied. If some characters of the aligned sequence
93 * were ignored, we'd be misaligned with the annotation.
94 * In general, because of this, it seems unlikely that any
95 * alignment format would use <eslDSQ_IGNORED> mappings.
96 */
97 int
esl_msafile_selex_SetInmap(ESL_MSAFILE * afp)98 esl_msafile_selex_SetInmap(ESL_MSAFILE *afp)
99 {
100 int sym;
101
102 if (afp->abc)
103 {
104 for (sym = 0; sym < 128; sym++)
105 afp->inmap[sym] = afp->abc->inmap[sym];
106 afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
107 afp->inmap[' '] = esl_abc_XGetGap(afp->abc);
108 }
109
110 if (! afp->abc)
111 {
112 for (sym = 1; sym < 128; sym++)
113 afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
114 afp->inmap[0] = '?';
115 afp->inmap[' '] = '.'; /* Easel does not allow spaces as gap characters. */
116 }
117 return eslOK;
118 }
119
120
121 /* Function: esl_msafile_selex_GuessAlphabet()
122 * Synopsis: Guess the alphabet of an open PSI-BLAST MSA file.
123 *
124 * Purpose: Guess the alpbabet of the sequences in open
125 * SELEX format MSA file <afp>.
126 *
127 * On a normal return, <*ret_type> is set to <eslDNA>,
128 * <eslRNA>, or <eslAMINO>, and <afp> is reset to its
129 * original position.
130 *
131 * Args: afp - open SELEX format MSA file
132 * ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
133 *
134 * Returns: <eslOK> on success.
135 * <eslENOALPHABET> if alphabet type can't be determined.
136 * In either case, <afp> is rewound to the position it
137 * started at.
138 */
139 int
esl_msafile_selex_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)140 esl_msafile_selex_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
141 {
142 int alphatype = eslUNKNOWN;
143 esl_pos_t anchor = -1;
144 int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
145 int nsteps = 3;
146 int step = 0;
147 int nres = 0;
148 int x;
149 int64_t ct[26];
150 char *p, *tok;
151 esl_pos_t n, toklen, pos;
152 int status;
153
154 for (x = 0; x < 26; x++) ct[x] = 0;
155
156 anchor = esl_buffer_GetOffset(afp->bf);
157 if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
158
159 while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
160 {
161 if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
162 if (*tok == '#') continue; /* comments and annotation */
163 /* p now points to the rest of the sequence line, after a name */
164
165 /* count characters into ct[] array */
166 for (pos = 0; pos < n; pos++)
167 if (isalpha(p[pos])) {
168 x = toupper(p[pos]) - 'A';
169 ct[x]++;
170 nres++;
171 }
172
173 /* try to stop early, checking after 500, 5000, and 50000 residues: */
174 if (step < nsteps && nres > threshold[step]) {
175 if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
176 step++;
177 }
178 }
179 if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
180 status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
181
182 DONE:
183 esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
184 esl_buffer_RaiseAnchor(afp->bf, anchor);
185 *ret_type = alphatype;
186 return status;
187
188 ERROR:
189 if (anchor != -1) {
190 esl_buffer_SetOffset(afp->bf, anchor);
191 esl_buffer_RaiseAnchor(afp->bf, anchor);
192 }
193 *ret_type = eslUNKNOWN;
194 return status;
195 }
196
197
198 /* Function: esl_msafile_selex_Read()
199 * Synopsis: Read in a SELEX format alignment.
200 *
201 * Purpose: Read an MSA from an open <ESL_MSAFILE> <afp>,
202 * parsing for SELEX format, starting from the
203 * current point. (<afp->format> is expected to
204 * be <eslMSAFILE_SELEX>.) Create a new multiple
205 * alignment and return it via <*ret_msa>.
206 * Caller is responsible for free'ing this
207 * <ESL_MSA>.
208 *
209 * Args: afp - open <ESL_MSAFILE>
210 * ret_msa - RETURN: newly parsed <ESL_MSA>
211 *
212 * Returns: <eslOK> on success.
213 *
214 * <eslEOF> if no (more) alignment data are found in
215 * <afp>, and <afp> is returned at EOF.
216 *
217 * <eslEFORMAT> on a parse error. <*ret_msa> is set to
218 * <NULL>. <afp> contains information sufficient for
219 * constructing useful diagnostic output:
220 * | <afp->errmsg> | user-directed error message |
221 * | <afp->linenumber> | line # where error was detected |
222 * | <afp->line> | offending line (not NUL-term) |
223 * | <afp->n> | length of offending line |
224 * | <afp->bf->filename> | name of the file |
225 * and <afp> is poised at the start of the following line,
226 * so (in principle) the caller could try to resume
227 * parsing.
228 *
229 * Throws: <eslEMEM> - an allocation failed.
230 * <eslESYS> - a system call such as fread() failed
231 * <eslEINCONCEIVABLE> - "impossible" corruption
232 */
233 int
esl_msafile_selex_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)234 esl_msafile_selex_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
235 {
236 ESL_MSA *msa = NULL;
237 ESL_SELEX_BLOCK *b = NULL;
238 int32_t nblocks = 0;
239 int status;
240
241 ESL_DASSERT1( (afp->format == eslMSAFILE_SELEX) );
242
243 afp->errmsg[0] = '\0';
244
245 while ( (status = selex_read_block(afp, &b)) == eslOK)
246 {
247 if (! nblocks && (status = selex_first_block(afp, b, &msa)) != eslOK) goto ERROR;
248 else if ( nblocks && (status = selex_other_block(afp, b, msa)) != eslOK) goto ERROR;
249
250 if ((status = selex_append_block(afp, b, msa)) != eslOK) goto ERROR;
251
252 esl_buffer_RaiseAnchor(afp->bf, b->anchor);
253 b->anchor = -1;
254
255 nblocks++;
256 }
257 /* selex_read_block took care of destroying the block! */
258 if (status != eslEOF || nblocks == 0) goto ERROR;
259
260 msa->offset = 0;
261 if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
262 *ret_msa = msa;
263 return eslOK;
264
265 ERROR:
266 if (b) {
267 if (b->anchor != -1) esl_buffer_RaiseAnchor(afp->bf, b->anchor);
268 selex_block_Destroy(b);
269 }
270 *ret_msa = NULL;
271 return status;
272 }
273
274 /* Function: esl_msafile_selex_Write()
275 * Synopsis: Write a SELEX format alignment to a stream
276 *
277 * Purpose: Write alignment <msa> to output stream <fp>,
278 * in SELEX format. The alignment is written
279 * in blocks of 60 aligned residues at a time.
280 *
281 * Args: fp - open output stream, writable
282 * msa - alignment to write
283 *
284 * Returns: <eslOK> on success.
285 *
286 * Throws: <eslEMEM> on allocation error.
287 * <eslEWRITE> on any system write error, such
288 * as a filled disk.
289 */
290 int
esl_msafile_selex_Write(FILE * fp,const ESL_MSA * msa)291 esl_msafile_selex_Write(FILE *fp, const ESL_MSA *msa)
292 {
293 int cpl = 60;
294 int maxnamelen = 4; /* init to 4 because minimum name field is #=CS, etc. */
295 int namelen;
296 char *buf = NULL;
297 int i;
298 int64_t apos;
299 int status;
300
301 ESL_ALLOC(buf, sizeof(char) * (cpl+1));
302 buf[cpl] = '\0';
303 for (i = 0; i < msa->nseq; i++) {
304 namelen = strlen(msa->sqname[i]);
305 maxnamelen = ESL_MAX(namelen, maxnamelen);
306 }
307
308 for (apos = 0; apos < msa->alen; apos += cpl)
309 {
310 if (apos && fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
311 if (msa->ss_cons && fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=CS", cpl, msa->ss_cons+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
312 if (msa->rf && fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=RF", cpl, msa->rf+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
313 if (msa->mm && fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=MM", cpl, msa->mm+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
314
315 for (i = 0; i < msa->nseq; i++)
316 {
317 if (msa->abc) esl_abc_TextizeN(msa->abc, msa->ax[i]+apos+1, cpl, buf);
318 if (! msa->abc) strncpy(buf, msa->aseq[i]+apos, cpl);
319 if (fprintf(fp, "%-*s %s\n", maxnamelen, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
320
321 if (msa->ss && msa->ss[i]) { if (fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=SS", cpl, msa->ss[i]+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed"); }
322 if (msa->sa && msa->sa[i]) { if (fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=SA", cpl, msa->sa[i]+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed"); }
323 }
324 }
325
326 free(buf);
327 return eslOK;
328
329 ERROR:
330 if (buf) free(buf);
331 return status;
332 }
333 /*--------------------- end, SELEX i/o API ----------------------*/
334
335
336
337 /*****************************************************************
338 * 2. Internal functions handling a block of input lines.
339 *****************************************************************/
340
341 static ESL_SELEX_BLOCK *
selex_block_Create(int nalloc)342 selex_block_Create(int nalloc)
343 {
344 ESL_SELEX_BLOCK *b = NULL;
345 int idx;
346 int status;
347
348 ESL_ALLOC(b, sizeof(ESL_SELEX_BLOCK));
349 b->line = NULL;
350 b->llen = NULL;
351 b->offsets = NULL;
352 b->linenum = NULL;
353 b->ltype = NULL;
354 b->lpos = NULL;
355 b->rpos = NULL;
356 b->nlines = 0;
357 b->anchor = -1; /* -1 is a flag for "unused" */
358
359 ESL_ALLOC(b->line, sizeof(char *) * nalloc);
360 ESL_ALLOC(b->llen, sizeof(esl_pos_t) * nalloc);
361 ESL_ALLOC(b->offsets, sizeof(esl_pos_t) * nalloc);
362 ESL_ALLOC(b->linenum, sizeof(int64_t) * nalloc);
363 ESL_ALLOC(b->ltype, sizeof(int) * nalloc);
364 ESL_ALLOC(b->lpos, sizeof(esl_pos_t) * nalloc);
365 ESL_ALLOC(b->rpos, sizeof(esl_pos_t) * nalloc);
366 for (idx = 0; idx < nalloc; idx++)
367 {
368 b->line[idx] = NULL;
369 b->llen[idx] = 0;
370 b->offsets[idx] = 0;
371 b->linenum[idx] = 0;
372 b->ltype[idx] = 0;
373 b->lpos[idx] = 0;
374 b->rpos[idx] = 0;
375 }
376 b->nalloc = nalloc;
377 return b;
378
379 ERROR:
380 if (b) selex_block_Destroy(b);
381 return NULL;
382 }
383
384 static int
selex_block_Grow(ESL_SELEX_BLOCK * b)385 selex_block_Grow(ESL_SELEX_BLOCK *b)
386 {
387 int idx;
388 int status;
389
390 ESL_REALLOC(b->line, sizeof(char *) * b->nalloc * 2);
391 ESL_REALLOC(b->llen, sizeof(esl_pos_t) * b->nalloc * 2);
392 ESL_REALLOC(b->offsets, sizeof(esl_pos_t) * b->nalloc * 2);
393 ESL_REALLOC(b->linenum, sizeof(int64_t) * b->nalloc * 2);
394 ESL_REALLOC(b->ltype, sizeof(int) * b->nalloc * 2);
395 ESL_REALLOC(b->lpos, sizeof(esl_pos_t) * b->nalloc * 2);
396 ESL_REALLOC(b->rpos, sizeof(esl_pos_t) * b->nalloc * 2);
397 for (idx = b->nalloc; idx < b->nalloc*2; idx++)
398 {
399 b->line[idx] = NULL;
400 b->llen[idx] = 0;
401 b->offsets[idx] = 0;
402 b->linenum[idx] = 0;
403 b->ltype[idx] = 0;
404 b->lpos[idx] = 0;
405 b->rpos[idx] = 0;
406 }
407 b->nalloc *= 2;
408 return eslOK;
409
410 ERROR:
411 return status;
412 }
413
414 static void
selex_block_Destroy(ESL_SELEX_BLOCK * b)415 selex_block_Destroy(ESL_SELEX_BLOCK *b)
416 {
417 if (!b) return;
418 if (b->line) free(b->line);
419 if (b->llen) free(b->llen);
420 if (b->offsets) free(b->offsets);
421 if (b->linenum) free(b->linenum);
422 if (b->ltype) free(b->ltype);
423 if (b->lpos) free(b->lpos);
424 if (b->rpos) free(b->rpos);
425 free(b);
426 return;
427 }
428 /*------- end, internal functions for input line blocks ---------*/
429
430
431
432
433 /*****************************************************************
434 * 3. Internal functions for parsing SELEX input.
435 *****************************************************************/
436
437 /* Before we return a parse error,
438 * reset the <afp> so its current line is the one at fault.
439 */
440 static int
selex_ErrorInBlock(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,int which)441 selex_ErrorInBlock(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, int which)
442 {
443 afp->line = b->line[which];
444 afp->n = b->llen[which];
445 afp->lineoffset = b->offsets[which];
446 afp->linenumber = b->linenum[which];
447 return esl_buffer_SetOffset(afp->bf, b->offsets[which] + b->llen[which]);
448 }
449
450 /* selex_read_block: read one block of alignment data.
451 *
452 * Note that line numbers aren't necessarily consecutive,
453 * because we're stripping out comment lines here. On a parse error
454 * on a specific line, we're going to reset the buffer to that line,
455 * and we'll need the linenumber to do that reset.
456 *
457 * The <afp> detected the end of the block by reading a blank line, or EOF.
458 * Thus its point is at the next line after that blank, or at EOF.
459 *
460 * The <afp> has a stable anchor set at (*block_p)->anchor.
461 * Caller must raise this anchor when it's done parsing the block.
462 *
463 * Returns: <eslOK> on success.
464 *
465 * <eslEOF> if no more blocks are found in the input.
466 * <eslEFORMAT> on failure, if a subsequent block has a
467 * different number of data lines than the first block.
468 * On normal errors, all the references are returned set to NULL.
469 *
470 * Throws: <eslEMEM> on allocation failure.
471 */
472 static int
selex_read_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK ** block_p)473 selex_read_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK **block_p)
474 {
475 ESL_SELEX_BLOCK *b = *block_p; /* now b==NULL if first block; or on subsequent blocks, reuse prev block storage. */
476 int idx = 0;
477 int status;
478
479 /* Advance past blank lines until we have the first line of next
480 * block. We may hit a normal EOF here, in which case we return
481 * EOF, we're done.
482 */
483 do {
484 if ( ( status = esl_msafile_GetLine(afp, NULL, NULL)) != eslOK) goto ERROR; /* EOF here is a normal EOF */
485 } while (esl_memspn(afp->line, afp->n, " \t") == afp->n || /* idiomatic for "blank line" */
486 (esl_memstrpfx(afp->line, afp->n, "#") && ! esl_memstrpfx(afp->line, afp->n, "#="))); /* a SELEX comment line */
487
488 /* if this is first block, allocate block; subsequent blocks reuse it */
489 if (!b && (b = selex_block_Create(16)) == NULL) { status = eslEMEM; goto ERROR; }
490
491 /* Anchor stably at this point. */
492 b->anchor = afp->lineoffset;
493 if ((status = esl_buffer_SetStableAnchor(afp->bf, b->anchor)) != eslOK) goto ERROR;
494
495 /* Parse for a block of lines. */
496 do {
497 if (b->nalloc && idx == b->nalloc && (status = selex_block_Grow(b)) != eslOK) goto ERROR;
498
499 b->line[idx] = afp->line;
500 b->llen[idx] = afp->n;
501 b->offsets[idx] = afp->lineoffset;
502 b->linenum[idx] = afp->linenumber; /* ltype, lpos, rpos aren't set yet */
503 idx++;
504
505 /* Get next non-comment line; this can be next line of block, blank (end of block), or EOF. */
506 do {
507 status = esl_msafile_GetLine(afp, NULL, NULL);
508 } while ( status == eslOK && (esl_memstrpfx(afp->line, afp->n, "#") && ! esl_memstrpfx(afp->line, afp->n, "#=")));
509
510 } while (status == eslOK && esl_memspn(afp->line, afp->n, " \t") < afp->n); /* end of block on EOF or blank line */
511
512 if (*block_p && b->nlines != idx)
513 ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected %d lines in block, saw %d", b->nlines, idx);
514
515 b->nlines = idx;
516 *block_p = b;
517 return eslOK; /* EOF status gets turned into OK: we've read a block successfully and hit EOF. Next call will generate the EOF */
518
519 ERROR:
520 if (b && b->anchor != -1) esl_buffer_RaiseAnchor(afp->bf, b->anchor);
521 if (b) selex_block_Destroy(b);
522 *block_p = NULL;
523 return status;
524 }
525
526
527 /* selex_first_block()
528 *
529 * 1. Determine and store line types, in b->ltype[0..b->nlines-1].
530 * 2. From the number of eslSELEX_LINE_SQ lines, we know nseq.
531 * 3. From nseq, we can allocate a new MSA.
532 * 4. Parse each line for sequence names, and store them.
533 * 5. Determine lpos[] for each line.
534 */
535 static int
selex_first_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,ESL_MSA ** ret_msa)536 selex_first_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA **ret_msa)
537 {
538 ESL_MSA *msa = NULL;
539 int nrf, nmm, ncs, nss, nsa, nseq;
540 int has_ss, has_sa;
541 char *p, *tok;
542 esl_pos_t n, ntok;
543 int idx, seqi;
544 int status;
545
546 afp->errmsg[0] = '\0';
547
548 nrf = nmm = ncs = nss = nsa = nseq = 0;
549 has_ss = has_sa = FALSE;
550 for (idx = 0; idx < b->nlines; idx++)
551 {
552 if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=RF")) { b->ltype[idx] = eslSELEX_LINE_RF; nrf++; }
553 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=MM")) { b->ltype[idx] = eslSELEX_LINE_MM; nmm++; }
554 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=CS")) { b->ltype[idx] = eslSELEX_LINE_CS; ncs++; }
555 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SS")) { b->ltype[idx] = eslSELEX_LINE_SS; nss++; has_ss = TRUE; }
556 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SA")) { b->ltype[idx] = eslSELEX_LINE_SA; nsa++; has_sa = TRUE; }
557 else { b->ltype[idx] = eslSELEX_LINE_SQ; nseq++; nss = nsa = 0; }
558
559 if (nss && !nseq) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "#=SS must follow a sequence"); }
560 if (nsa && !nseq) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "#=SA must follow a sequence"); }
561 if (nrf > 1) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=RF lines for block"); }
562 if (ncs > 1) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=CS lines for block"); }
563 if (nss > 1) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=SS lines for seq"); }
564 if (nsa > 1) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=SA lines for seq"); }
565 }
566
567 if ( afp->abc && (msa = esl_msa_CreateDigital(afp->abc, nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; } /* a growable MSA */
568 if (!afp->abc && (msa = esl_msa_Create( nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; }
569 if (has_ss) {
570 ESL_ALLOC(msa->ss, sizeof(char *) * nseq);
571 for (seqi = 0; seqi < nseq; seqi++) msa->ss[seqi] = NULL;
572 }
573 if (has_sa) {
574 ESL_ALLOC(msa->sa, sizeof(char *) * nseq);
575 for (seqi = 0; seqi < nseq; seqi++) msa->sa[seqi] = NULL;
576 }
577 msa->nseq = nseq;
578 msa->alen = 0;
579
580 for (seqi = 0, idx = 0; idx < b->nlines; idx++)
581 {
582 p = b->line[idx];
583 n = b->llen[idx];
584 if ( esl_memtok(&p, &n, " \t", &tok, &ntok) != eslOK) ESL_XEXCEPTION(eslEINCONCEIVABLE, "can't happen"); /* because a block by definition consists of non-blank lines */
585 if (b->ltype[idx] == eslSELEX_LINE_SQ) /* otherwise, first token is #=XX marking annotation of some sort */
586 {
587 if ((status = esl_msa_SetSeqName(msa, seqi, tok, ntok)) != eslOK) goto ERROR;
588 seqi++;
589 }
590 b->lpos[idx] = (n ? p-b->line[idx] : -1); /* set lpos[] to position of first seq or annotation residue */
591 }
592
593 *ret_msa = msa;
594 return eslOK;
595
596 ERROR:
597 if (msa) esl_msa_Destroy(msa);
598 *ret_msa = NULL;
599 return status;
600 }
601
602
603 /* selex_other_block()
604 * We've already parsed the first block.
605 * So we know the order of line types, nseq, and sequence names.
606 * Validate that a subsequent block has the same.
607 */
608 static int
selex_other_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,ESL_MSA * msa)609 selex_other_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa)
610 {
611 char *p, *tok;
612 esl_pos_t n, ntok;
613 int idx, seqi;
614
615 /* Validate line types */
616 for (idx = 0; idx < b->nlines; idx++)
617 {
618 if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=RF")) { if (b->ltype[idx] != eslSELEX_LINE_RF) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=RF line isn't in expected order in block"); } }
619 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=MM")) { if (b->ltype[idx] != eslSELEX_LINE_MM) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=MM line isn't in expected order in block"); } }
620 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=CS")) { if (b->ltype[idx] != eslSELEX_LINE_CS) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=CS line isn't in expected order in block"); } }
621 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SS")) { if (b->ltype[idx] != eslSELEX_LINE_SS) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=SS line isn't in expected order in block"); } }
622 else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SA")) { if (b->ltype[idx] != eslSELEX_LINE_SA) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=SA line isn't in expected order in block"); } }
623 else { if (b->ltype[idx] != eslSELEX_LINE_SQ) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence line isn't in expected order in block"); } }
624 }
625
626 /* Validate seq names, and set lpos */
627 for (seqi = 0, idx = 0; idx < b->nlines; idx++)
628 {
629 p = b->line[idx];
630 n = b->llen[idx];
631 if ( esl_memtok(&p, &n, " \t", &tok, &ntok) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "can't happen"); /* because a block by definition consists of non-blank lines */
632 if (b->ltype[idx] == eslSELEX_LINE_SQ)
633 {
634 if (! esl_memstrcmp(tok, ntok, msa->sqname[seqi])) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "expected sequence %s at this line of block", msa->sqname[seqi]); }
635 seqi++;
636 }
637 b->lpos[idx] = (n ? p-b->line[idx] : -1); /* set lpos[] to position of first seq or annotation residue */
638 }
639 return eslOK;
640 }
641
642 static int
selex_append_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,ESL_MSA * msa)643 selex_append_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa)
644 {
645 char *p;
646 esl_pos_t pos;
647 int idx, seqi;
648 esl_pos_t leftmost, rightmost;
649 int64_t nadd; /* width of this sequence block, in aligned columns added to msa */
650 esl_pos_t nleft, ntext;
651 int64_t alen;
652 int status;
653
654 /* Determine rpos for each line. */
655 for (idx = 0; idx < b->nlines; idx++)
656 {
657 p = b->line[idx];
658 pos = b->llen[idx] - 1;
659 while (pos>=0 && isspace(p[pos])) pos--;
660 b->rpos[idx] = ( (pos < b->lpos[idx]) ? -1 : pos); /* -1: a completely blank seq line is valid */
661 }
662
663 /* Determine leftmost and rightmost positions for entire block */
664 leftmost = b->lpos[0];
665 rightmost = b->rpos[0];
666 for (idx = 1; idx < b->nlines; idx++) {
667 leftmost = (b->lpos[idx] == -1) ? leftmost : ESL_MIN(leftmost, b->lpos[idx]);
668 rightmost = (b->rpos[idx] == -1) ? rightmost : ESL_MAX(rightmost, b->rpos[idx]);
669 }
670 if (rightmost == -1) return eslOK; /* super special case: no sequence or annotation data in this block at all! */
671 nadd = rightmost - leftmost + 1;
672
673 /* Appends */
674 for (seqi = 0, idx = 0; idx < b->nlines; idx++)
675 {
676 nleft = ((b->lpos[idx] != -1) ? b->lpos[idx] - leftmost : nadd); /* watch special case of all whitespace on data line, lpos>rpos */
677 ntext = ((b->lpos[idx] != -1) ? b->rpos[idx] - b->lpos[idx] + 1 : 0);
678 //nright = ((b->lpos[idx] != -1) ? rightmost - b->rpos[idx] : 0); // someday you might want to know nright, but for now the code doesn't use it
679
680 if (b->ltype[idx] == eslSELEX_LINE_SQ)
681 {
682 if (msa->abc)
683 { /* digital sequence append - mapped, preallocated */
684 ESL_REALLOC(msa->ax[seqi], sizeof(ESL_DSQ) * (msa->alen + nadd + 2));
685 if (msa->alen == 0) msa->ax[seqi][0] = eslDSQ_SENTINEL;
686 for (alen = msa->alen; alen < msa->alen+nleft; alen++) msa->ax[seqi][alen+1] = esl_abc_XGetGap(msa->abc);
687
688 status = esl_abc_dsqcat_noalloc(afp->inmap, msa->ax[seqi], &alen, b->line[idx] + b->lpos[idx], ntext);
689 if (status == eslEINVAL) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "illegal residue(s) in sequence line"); }
690 else if (status != eslOK) { selex_ErrorInBlock(afp, b, idx); goto ERROR; }
691 if (alen != msa->alen + nleft + ntext) { selex_ErrorInBlock(afp, b, idx); ESL_EXCEPTION(eslEINCONCEIVABLE, afp->errmsg, "unexpected inconsistency appending a sequence"); };
692
693 for (; alen < msa->alen+nadd; alen++) msa->ax[seqi][alen+1] = esl_abc_XGetGap(msa->abc);
694 msa->ax[seqi][alen+1] = eslDSQ_SENTINEL;
695 }
696
697 if (! msa->abc)
698 { /* text mode sequence append - mapped, preallocated */
699 ESL_REALLOC(msa->aseq[seqi], sizeof(char) * (msa->alen + nadd + 1));
700 for (alen = msa->alen; alen < msa->alen+nleft; alen++) msa->aseq[seqi][alen] = '.';
701
702 status = esl_strmapcat_noalloc(afp->inmap, msa->aseq[seqi], &alen, b->line[idx] + b->lpos[idx], ntext);
703 if (status == eslEINVAL) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "illegal residue(s) in input line"); }
704 else if (status != eslOK) { selex_ErrorInBlock(afp, b, idx); goto ERROR; }
705 if (alen != msa->alen + nleft + ntext) { selex_ErrorInBlock(afp, b, idx); ESL_EXCEPTION(eslEINCONCEIVABLE, afp->errmsg, "unexpected inconsistency appending a sequence"); };
706
707 for (; alen < msa->alen+nadd; alen++) msa->aseq[seqi][alen] = '.';
708 msa->aseq[seqi][alen] = '\0';
709 }
710 seqi++;
711 }
712 else
713 { /* annotation append: not mapped, characters are copied exactly as they are */
714 if (b->ltype[idx] == eslSELEX_LINE_RF) { ESL_REALLOC(msa->rf, sizeof(char) * (msa->alen + nadd + 1)); p = msa->rf; }
715 if (b->ltype[idx] == eslSELEX_LINE_MM) { ESL_REALLOC(msa->mm, sizeof(char) * (msa->alen + nadd + 1)); p = msa->mm; }
716 else if (b->ltype[idx] == eslSELEX_LINE_CS) { ESL_REALLOC(msa->ss_cons, sizeof(char) * (msa->alen + nadd + 1)); p = msa->ss_cons; }
717 else if (b->ltype[idx] == eslSELEX_LINE_SS) { ESL_REALLOC(msa->ss[seqi-1], sizeof(char) * (msa->alen + nadd + 1)); p = msa->ss[seqi-1]; }
718 else if (b->ltype[idx] == eslSELEX_LINE_SA) { ESL_REALLOC(msa->sa[seqi-1], sizeof(char) * (msa->alen + nadd + 1)); p = msa->sa[seqi-1]; }
719
720 for (alen = msa->alen; alen < msa->alen+nleft; alen++) p[alen] = '.';
721 if (ntext) memcpy(p+msa->alen+nleft, b->line[idx]+b->lpos[idx], sizeof(char)*ntext);
722 for (alen = msa->alen+nleft+ntext; alen < msa->alen+nadd; alen++) p[alen] = '.';
723 p[alen] = '\0';
724 }
725 }
726 msa->alen += nadd;
727 return eslOK;
728
729 ERROR:
730 return status;
731 }
732
733
734 /*****************************************************************
735 * 4. Unit tests.
736 *****************************************************************/
737 #ifdef eslMSAFILE_SELEX_TESTDRIVE
738
739 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)740 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
741 {
742 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
743 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
744 fputs("seq3 ACDEFGHIKLMNPQRSTVWY\n", ofp);
745 fputs("seq4 ACDEFGHIKLMNPQRSTVWY\n", ofp);
746 fputs("seq5 ACDEFGHIKLMNPQRSTVWY\n", ofp);
747
748 *ret_alphatype = eslAMINO;
749 *ret_nseq = 5;
750 *ret_alen = 20;
751 }
752
753 static void
utest_write_good2(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)754 utest_write_good2(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
755 {
756 fputs("# DOS format (\\r\\n), and doesn't end in a newline.\r\n", ofp);
757 fputs("# \r\n", ofp);
758 fputs("#=RF xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
759 fputs("#=CS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
760 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
761 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
762 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
763 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
764 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
765 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
766 fputs("seq3 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
767 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
768 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
769 fputs("seq4 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
770 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
771 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
772 fputs("seq5 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
773 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
774 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
775 fputs("\r\n", ofp);
776 fputs("#=RF xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
777 fputs("#=CS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
778 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
779 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
780 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
781 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
782 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
783 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
784 fputs("seq3 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
785 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
786 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
787 fputs("seq4 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
788 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
789 fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
790 fputs("seq5 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
791 fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
792 fputs("#=SA xxxxxxxxxxxxxxxxxxxx", ofp);
793
794 *ret_alphatype = eslAMINO;
795 *ret_nseq = 5;
796 *ret_alen = 40;
797 }
798
799 static void
utest_write_good3(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)800 utest_write_good3(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
801 {
802 fputs("\n", ofp);
803 fputs("#=CS\n", ofp);
804 fputs("#=RF\n", ofp);
805 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
806 fputs("long_name GHIKLMNPQRSTVWY\n", ofp);
807 fputs("blank_seq_all_gaps \n", ofp);
808 fputs("seq2 ACDEF---KLMNPQRSTVWY\n", ofp);
809 fputs("seq3 ACDEF...KLMNPQRSTVWY\n", ofp);
810 fputs("# embedded comments ok\n", ofp);
811 fputs("seq4 ACDEFGHIKLMNPQRSTVWY\n", ofp);
812 fputs(" seq5 CDEFGHIKLMNPQRSTVWY\n", ofp);
813 fputs("#=SS \n", ofp);
814 fputs("#=SA\n", ofp);
815 fputs("\n", ofp);
816 fputs("\n", ofp);
817 fputs("\n", ofp);
818 fputs("#=CS\n", ofp);
819 fputs("#=RF\n", ofp);
820 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
821 fputs("long_name GHIKLMNPQRSTVWY\n", ofp);
822 fputs("blank_seq_all_gaps \n", ofp);
823 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
824 fputs("seq3 ACDEFGHIKLMNPQRSTVWY\n", ofp);
825 fputs("seq4 ACDEFGHIKLMNPQRSTVWY\n", ofp);
826 fputs("seq5 ACDEFGHIKLMNPQRSTVWY\n", ofp);
827 fputs("#=SS \n", ofp);
828 fputs("#=SA\n", ofp);
829
830 *ret_alphatype = eslAMINO;
831 *ret_nseq = 7;
832 *ret_alen = 52;
833 }
834
835 static void
utest_write_good4(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)836 utest_write_good4(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
837 {
838 fputs("# A complicated SELEX example\n", ofp);
839 fputs("\n", ofp);
840 fputs("\n", ofp);
841 fputs("#=RF xxxxxxx xxxx xxxxxx\n", ofp);
842 fputs("#=CS >>>>+>> ^^^^ <<<<<<\n", ofp);
843 fputs("28 gGAGUAAGAUAGC AUCA GCAUCUUGUUCC\n", ofp);
844 fputs("#=SS +++++>>>>>+>> ^^^^ <<<<<<<+++++\n", ofp);
845 fputs("longname GUUCACC AUCA GGGGAc\n", ofp);
846 fputs("#=SS >>>>+>> ^^^^ <<<<<<\n", ofp);
847 fputs("2 AUGGAUGCGCACC AUCA GGGCGUaucuau\n", ofp);
848 fputs("3 GAUCACC AUCA GGGauc\n", ofp);
849 fputs("4 GGUCACC AUCA GGGauc\n", ofp);
850 fputs("5 GGACACC AUCA GGGucu\n", ofp);
851 fputs("6 CACC AUCA GGG\n", ofp);
852 fputs("7 GAUCACC AUCA GGGauc\n", ofp);
853 fputs("8 CUCACC AUCA GGGGG\n", ofp);
854 fputs("9 AUGCACC AUCA GGGCAU\n", ofp);
855 fputs("10 CUCACC AUCA GGGGG\n", ofp);
856
857 *ret_alphatype = eslRNA;
858 *ret_nseq = 11;
859 *ret_alen = 31;
860 }
861
862 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)863 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
864 {
865 ESL_ALPHABET *abc = NULL;
866 ESL_MSAFILE *afp = NULL;
867 ESL_MSA *msa1 = NULL;
868 ESL_MSA *msa2 = NULL;
869 char tmpfile1[32] = "esltmpXXXXXX";
870 char tmpfile2[32] = "esltmpXXXXXX";
871 FILE *ofp = NULL;
872 int status;
873
874 /* guessing both the format and the alphabet should work: this is a digital open */
875 if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("selex good file test %d failed: digital open", testnumber);
876 if (afp->format != eslMSAFILE_SELEX) esl_fatal("selex good file test %d failed: format autodetection", testnumber);
877 if (abc->type != expected_alphatype) esl_fatal("selex good file test %d failed: alphabet autodetection", testnumber);
878
879 /* This is a digital read, using <abc>. */
880 if ( (status = esl_msafile_selex_Read(afp, &msa1)) != eslOK) esl_fatal("selex good file test %d failed: msa read, digital", testnumber);
881 if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("selex good file test %d failed: nseq/alen", testnumber);
882 if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("selex good file test %d failed: msa1 invalid", testnumber);
883 esl_msafile_Close(afp);
884
885 /* write it back out to a new tmpfile (digital write) */
886 if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("selex good file test %d failed: tmpfile creation", testnumber);
887 if ( (status = esl_msafile_selex_Write(ofp, msa1)) != eslOK) esl_fatal("selex good file test %d failed: msa write, digital", testnumber);
888 fclose(ofp);
889
890 /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
891 if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_SELEX, NULL, &afp)) != eslOK) esl_fatal("selex good file test %d failed: text mode open", testnumber);
892 if ( (status = esl_msafile_selex_Read(afp, &msa2)) != eslOK) esl_fatal("selex good file test %d failed: msa read, text", testnumber);
893 if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("selex good file test %d failed: nseq/alen", testnumber);
894 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("selex good file test %d failed: msa2 invalid", testnumber);
895 esl_msafile_Close(afp);
896
897 /* write it back out to a new tmpfile (text write) */
898 if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("selex good file test %d failed: tmpfile creation", testnumber);
899 if ( (status = esl_msafile_selex_Write(ofp, msa2)) != eslOK) esl_fatal("selex good file test %d failed: msa write, text", testnumber);
900 fclose(ofp);
901 esl_msa_Destroy(msa2);
902
903 /* open and read it in digital mode */
904 if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_SELEX, NULL, &afp)) != eslOK) esl_fatal("selex good file test %d failed: 2nd digital mode open", testnumber);
905 if ( (status = esl_msafile_selex_Read(afp, &msa2)) != eslOK) esl_fatal("selex good file test %d failed: 2nd digital msa read", testnumber);
906 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("selex good file test %d failed: msa2 invalid", testnumber);
907 esl_msafile_Close(afp);
908
909 /* this msa <msa2> should be identical to <msa1> */
910 if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("selex good file test %d failed: msa compare", testnumber);
911
912 remove(tmpfile1);
913 remove(tmpfile2);
914 esl_msa_Destroy(msa1);
915 esl_msa_Destroy(msa2);
916 esl_alphabet_Destroy(abc);
917 }
918
919
920 static void
write_test_msas(FILE * ofp1,FILE * ofp2)921 write_test_msas(FILE *ofp1, FILE *ofp2)
922 {
923 fprintf(ofp1, "# selex comments ignored \n");
924 fprintf(ofp1, "#=RF ..xxxxxxxxxxxxxxxxxxxx\n");
925 fprintf(ofp1, "seq1 ..acdefghiklmnpqrstvwy\n");
926 fprintf(ofp1, "seq2 ..acdefghiklmnpqrstv--\n");
927 fprintf(ofp1, "seq3 aaacdefghiklmnpqrstv--\n");
928 fprintf(ofp1, "# selex comments ignored \n");
929 fprintf(ofp1, "seq4 ..acdefghiklmnpqrstvwy\n");
930 fprintf(ofp1, "\n");
931 fprintf(ofp1, "#=RF xxxxxxxxxxxxxxxxxxxx..\n");
932 fprintf(ofp1, "seq1 ACDEFGHIKLMNPQRSTVWY..\n");
933 fprintf(ofp1, "seq2 ACDEFGHIKLMNPQRSTVWYYY\n");
934 fprintf(ofp1, "seq3 ACDEFGHIKLMNPQRSTVWY..\n");
935 fprintf(ofp1, "seq4 ACDEFGHIKLMNPQRSTVWY..\n");
936 fprintf(ofp1, "\n");
937
938 fprintf(ofp2, "# STOCKHOLM 1.0\n");
939 fprintf(ofp2, "\n");
940 fprintf(ofp2, "#=GC RF ..xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..\n");
941 fprintf(ofp2, "seq1 ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
942 fprintf(ofp2, "seq2 ..acdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWYYY\n");
943 fprintf(ofp2, "seq3 aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
944 fprintf(ofp2, "seq4 ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
945 fprintf(ofp2, "//\n");
946 }
947
948 static void
read_test_msas_digital(char * slxfile,char * stkfile)949 read_test_msas_digital(char *slxfile, char *stkfile)
950 {
951 char msg[] = "SELEX msa digital read unit test failed";
952 ESL_ALPHABET *abc = NULL;
953 ESL_MSAFILE *afp1 = NULL;
954 ESL_MSAFILE *afp2 = NULL;
955 ESL_MSA *msa1, *msa2, *msa3, *msa4;
956 FILE *slxfp, *stkfp;
957 char slxfile2[32] = "esltmpslx2XXXXXX";
958 char stkfile2[32] = "esltmpstk2XXXXXX";
959
960 if ( esl_msafile_Open(&abc, slxfile, NULL, eslMSAFILE_SELEX, NULL, &afp1) != eslOK) esl_fatal(msg);
961 if ( !abc || abc->type != eslAMINO) esl_fatal(msg);
962 if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
963 if ( esl_msafile_selex_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
964 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
965 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
966
967 if ( esl_msafile_selex_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
968 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
969
970 esl_msafile_Close(afp2);
971 esl_msafile_Close(afp1);
972
973 /* Now write stk to selex file, and vice versa; then retest */
974 if ( esl_tmpfile_named(slxfile2, &slxfp) != eslOK) esl_fatal(msg);
975 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
976 if ( esl_msafile_selex_Write (slxfp, msa2) != eslOK) esl_fatal(msg);
977 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
978 fclose(slxfp);
979 fclose(stkfp);
980 if ( esl_msafile_Open(&abc, slxfile2, NULL, eslMSAFILE_SELEX, NULL, &afp1) != eslOK) esl_fatal(msg);
981 if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
982 if ( esl_msafile_selex_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
983 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
984 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
985
986 remove(slxfile2);
987 remove(stkfile2);
988 esl_msafile_Close(afp2);
989 esl_msafile_Close(afp1);
990
991 esl_msa_Destroy(msa1);
992 esl_msa_Destroy(msa2);
993 esl_msa_Destroy(msa3);
994 esl_msa_Destroy(msa4);
995 esl_alphabet_Destroy(abc);
996 }
997
998 static void
read_test_msas_text(char * slxfile,char * stkfile)999 read_test_msas_text(char *slxfile, char *stkfile)
1000 {
1001 char msg[] = "SELEX msa text-mode read unit test failed";
1002 ESL_MSAFILE *afp1 = NULL;
1003 ESL_MSAFILE *afp2 = NULL;
1004 ESL_MSA *msa1, *msa2, *msa3, *msa4;
1005 FILE *slxfp, *stkfp;
1006 char slxfile2[32] = "esltmpslx2XXXXXX";
1007 char stkfile2[32] = "esltmpstk2XXXXXX";
1008
1009 /* vvvv-- everything's the same as the digital utest except these NULLs */
1010 if ( esl_msafile_Open(NULL, slxfile, NULL, eslMSAFILE_SELEX, NULL, &afp1) != eslOK) esl_fatal(msg);
1011 if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
1012 if ( esl_msafile_selex_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
1013 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
1014 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
1015 if ( esl_msafile_selex_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
1016 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
1017 esl_msafile_Close(afp2);
1018 esl_msafile_Close(afp1);
1019
1020 if ( esl_tmpfile_named(slxfile2, &slxfp) != eslOK) esl_fatal(msg);
1021 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
1022 if ( esl_msafile_selex_Write (slxfp, msa2) != eslOK) esl_fatal(msg);
1023 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
1024 fclose(slxfp);
1025 fclose(stkfp);
1026 if ( esl_msafile_Open(NULL, slxfile2, NULL, eslMSAFILE_SELEX, NULL, &afp1) != eslOK) esl_fatal(msg);
1027 if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
1028 if ( esl_msafile_selex_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
1029 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
1030 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
1031
1032 remove(slxfile2);
1033 remove(stkfile2);
1034 esl_msafile_Close(afp2);
1035 esl_msafile_Close(afp1);
1036
1037 esl_msa_Destroy(msa1);
1038 esl_msa_Destroy(msa2);
1039 esl_msa_Destroy(msa3);
1040 esl_msa_Destroy(msa4);
1041 }
1042 #endif /*eslMSAFILE_SELEX_TESTDRIVE*/
1043 /*---------------------- end, unit tests ------------------------*/
1044
1045
1046 /*****************************************************************
1047 * 5. Test driver.
1048 *****************************************************************/
1049 #ifdef eslMSAFILE_SELEX_TESTDRIVE
1050 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_selex_utest -DeslMSAFILE_SELEX_TESTDRIVE esl_msafile_selex.c -leasel -lm
1051 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_selex_utest -DeslMSAFILE_SELEX_TESTDRIVE esl_msafile_selex.c -leasel -lm
1052 * run: ./esl_msafile_selex_utest
1053 */
1054 #include "esl_config.h"
1055
1056 #include <stdio.h>
1057
1058 #include "easel.h"
1059 #include "esl_getopts.h"
1060 #include "esl_random.h"
1061 #include "esl_msafile.h"
1062 #include "esl_msafile_selex.h"
1063
1064 static ESL_OPTIONS options[] = {
1065 /* name type default env range togs reqs incomp help docgrp */
1066 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
1067 { 0,0,0,0,0,0,0,0,0,0},
1068 };
1069 static char usage[] = "[-options]";
1070 static char banner[] = "test driver for SELEX MSA format module";
1071
1072 int
main(int argc,char ** argv)1073 main(int argc, char **argv)
1074 {
1075 char msg[] = "PSI-BLAST MSA i/o module test driver failed";
1076 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1077 char slxfile[32] = "esltmpslxXXXXXX";
1078 char stkfile[32] = "esltmpstkXXXXXX";
1079 FILE *slxfp, *stkfp;
1080 int testnumber;
1081 int ngoodtests = 4;
1082 char tmpfile[32];
1083 FILE *ofp;
1084 int expected_alphatype;
1085 int expected_nseq;
1086 int expected_alen;
1087
1088 if ( esl_tmpfile_named(slxfile, &slxfp) != eslOK) esl_fatal(msg);
1089 if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
1090 write_test_msas(slxfp, stkfp);
1091 fclose(slxfp);
1092 fclose(stkfp);
1093
1094 read_test_msas_digital(slxfile, stkfile);
1095 read_test_msas_text (slxfile, stkfile);
1096
1097 /* Various "good" files that should be parsed correctly */
1098 for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
1099 {
1100 strcpy(tmpfile, "esltmpXXXXXX");
1101 if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
1102 switch (testnumber) {
1103 case 1: utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1104 case 2: utest_write_good2 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1105 case 3: utest_write_good3 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1106 case 4: utest_write_good4 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1107 }
1108 fclose(ofp);
1109 utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
1110 remove(tmpfile);
1111 }
1112
1113
1114 remove(slxfile);
1115 remove(stkfile);
1116 esl_getopts_Destroy(go);
1117 return 0;
1118 }
1119 #endif /*eslMSAFILE_SELEX_TESTDRIVE*/
1120 /*--------------------- end, test driver ------------------------*/
1121
1122
1123
1124
1125 /*****************************************************************
1126 * 6. Examples.
1127 *****************************************************************/
1128
1129 #ifdef eslMSAFILE_SELEX_EXAMPLE
1130 /* A full-featured example of reading/writing an MSA in SELEX format(s).
1131 gcc -g -Wall -o esl_msafile_selex_example -I. -L. -DeslMSAFILE_SELEX_EXAMPLE esl_msafile_selex.c -leasel -lm
1132 ./esl_msafile_selex_example <msafile>
1133 */
1134 /*::cexcerpt::msafile_selex_example::begin::*/
1135 #include <stdio.h>
1136
1137 #include "easel.h"
1138 #include "esl_alphabet.h"
1139 #include "esl_getopts.h"
1140 #include "esl_msa.h"
1141 #include "esl_msafile.h"
1142 #include "esl_msafile_selex.h"
1143
1144 static ESL_OPTIONS options[] = {
1145 /* name type default env range toggles reqs incomp help docgroup*/
1146 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1147 { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "no autodetection; force SELEX format", 0 },
1148 { "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
1149 { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
1150 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
1151 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
1152 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
1153 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1154 };
1155 static char usage[] = "[-options] <msafile>";
1156 static char banner[] = "example of guessing, reading, writing SELEX format";
1157
1158 int
main(int argc,char ** argv)1159 main(int argc, char **argv)
1160 {
1161 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1162 char *filename = esl_opt_GetArg(go, 1);
1163 int infmt = eslMSAFILE_UNKNOWN;
1164 ESL_ALPHABET *abc = NULL;
1165 ESL_MSAFILE *afp = NULL;
1166 ESL_MSA *msa = NULL;
1167 int status;
1168
1169 if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_SELEX;
1170
1171 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
1172 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
1173 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1174
1175 /* Text mode: pass NULL for alphabet.
1176 * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
1177 */
1178 if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
1179 else status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
1180 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
1181
1182 if ( (status = esl_msafile_selex_Read(afp, &msa)) != eslOK)
1183 esl_msafile_ReadFailure(afp, status);
1184
1185 printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
1186 printf("# of seqs: %d\n", msa->nseq);
1187 printf("# of cols: %d\n", (int) msa->alen);
1188 printf("\n");
1189
1190 if (! esl_opt_GetBoolean(go, "-q"))
1191 esl_msafile_selex_Write(stdout, msa);
1192
1193 esl_msa_Destroy(msa);
1194 esl_msafile_Close(afp);
1195 if (abc) esl_alphabet_Destroy(abc);
1196 esl_getopts_Destroy(go);
1197 exit(0);
1198 }
1199 /*::cexcerpt::msafile_selex_example::end::*/
1200 #endif /*eslMSAFILE_SELEX_EXAMPLE*/
1201
1202 #ifdef eslMSAFILE_SELEX_EXAMPLE2
1203 /* A minimal example. Read SELEX MSA, in text mode.
1204 gcc -g -Wall -o esl_msafile_selex_example2 -I. -L. -DeslMSAFILE_SELEX_EXAMPLE2 esl_msafile_selex.c -leasel -lm
1205 ./esl_msafile_selex_example2 <msafile>
1206 */
1207
1208 /*::cexcerpt::msafile_selex_example2::begin::*/
1209 #include <stdio.h>
1210
1211 #include "easel.h"
1212 #include "esl_msa.h"
1213 #include "esl_msafile.h"
1214 #include "esl_msafile_selex.h"
1215
1216 int
main(int argc,char ** argv)1217 main(int argc, char **argv)
1218 {
1219 char *filename = argv[1];
1220 int fmt = eslMSAFILE_SELEX;
1221 ESL_MSAFILE *afp = NULL;
1222 ESL_MSA *msa = NULL;
1223 int status;
1224
1225 if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
1226 if ( (status = esl_msafile_selex_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status);
1227
1228 printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
1229
1230 esl_msafile_selex_Write(stdout, msa);
1231
1232 esl_msa_Destroy(msa);
1233 esl_msafile_Close(afp);
1234 exit(0);
1235 }
1236 /*::cexcerpt::msafile_selex_example2::end::*/
1237 #endif /*eslMSAFILE_SELEX_EXAMPLE2*/
1238 /*--------------------- end of example --------------------------*/
1239