1 /* I/O of multiple sequence alignment files in SELEX format
2  *
3  * Contents:
4  *   1. API for reading/writing SELEX input.
5  *   2. Internal functions for a block of input lines.
6  *   3. Internal functions for parsing SELEX input.
7  *   4. Unit tests.
8  *   5. Test driver.
9  *   6. Examples.
10  *
11  * Notes:
12  *   In SELEX, a tricky and unusual issue is that spaces are allowed
13  *   as gaps, and can even overlap names. Alignments like this are
14  *   legitimate:
15  *        seq1_longname ACCCGGT
16  *        seq2      AAAAACCCGGTT
17  *
18  *  You can't determine the aligned length of any sequence in the
19  *  block without seeing the whole block.  We define an internal
20  *  object (an ESL_SELEX_BLOCK) and some local functions to handle
21  *  reading a block of input lines from an input buffer.
22  *
23  *  Even though spaces are allowed as gaps in input files, Easel
24  *  disallows them internally, even in text-mode alignments. Any
25  *  spaces are mapped to '.'.
26  */
27 #include "esl_config.h"
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33 
34 #include "easel.h"
35 #include "esl_alphabet.h"
36 #include "esl_mem.h"
37 #include "esl_msa.h"
38 #include "esl_msafile.h"
39 
40 #include "esl_msafile_selex.h"
41 
42 #define eslSELEX_LINE_SQ 1
43 #define eslSELEX_LINE_RF 2
44 #define eslSELEX_LINE_CS 3
45 #define eslSELEX_LINE_SS 4
46 #define eslSELEX_LINE_SA 5
47 #define eslSELEX_LINE_MM 6
48 
49 typedef struct {
50   char     **line;		/* line[0..nlines-1][0..llen-1]: memory lines in input buffer */
51   esl_pos_t *llen;		/* length of line[] in bytes                                  */
52   esl_pos_t *offsets;		/* offset of start of each line in input buffer               */
53   int64_t   *linenum;		/* line number of each line[] in input                        */
54   int       *ltype;		/* code for line type: eslSELEX_LINE_SQ, etc.                 */
55   esl_pos_t *lpos;		/* leftmost position of seq data on line[], 0..llen-1 [or -1] */
56   esl_pos_t *rpos;              /* rightmost pos of seq data on line[], 0..llen-1 [or -1]     */
57   int        nlines;		/* number of lines in this block                              */
58   int        nalloc;		/* number of lines allocated for (>=nlines)                   */
59   esl_pos_t  anchor;		/* input buffer anchor set at the start of the block          */
60 } ESL_SELEX_BLOCK;
61 
62 static ESL_SELEX_BLOCK *selex_block_Create(int nalloc);
63 static int              selex_block_Grow(ESL_SELEX_BLOCK *b);
64 static void             selex_block_Destroy(ESL_SELEX_BLOCK *b);
65 
66 static int selex_ErrorInBlock(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, int idx);
67 static int selex_read_block  (ESL_MSAFILE *afp, ESL_SELEX_BLOCK **block_p);
68 static int selex_first_block (ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA **ret_msa);
69 static int selex_other_block (ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa);
70 static int selex_append_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa);
71 
72 
73 /*****************************************************************
74  * 1. API for reading/writing SELEX input
75  *****************************************************************/
76 
77 /* Function:  esl_msafile_selex_SetInmap()
78  * Synopsis:  Set the input map for SELEX format
79  *
80  * Purpose:   Set <afp->inmap> for selex input.
81  *
82  *            In text mode, accept any <isgraph()> character, plus space.
83  *            In digital mode, accept standard Easel alphabets, plus map
84  *            space to gap.
85  *
86  *            SELEX not only tolerates spaces in input, it
87  *            allows a space as a gap character. (Which significantly
88  *            complicates parsing.)
89  *
90  *            The inmap may not contain any <eslDSQ_IGNORED> mappings.
91  *            Annotation lines are parsed literally: every character
92  *            is copied. If some characters of the aligned sequence
93  *            were ignored, we'd be misaligned with the annotation.
94  *            In general, because of this, it seems unlikely that any
95  *            alignment format would use <eslDSQ_IGNORED> mappings.
96  */
97 int
esl_msafile_selex_SetInmap(ESL_MSAFILE * afp)98 esl_msafile_selex_SetInmap(ESL_MSAFILE *afp)
99 {
100   int sym;
101 
102   if (afp->abc)
103     {
104       for (sym = 0; sym < 128; sym++)
105 	afp->inmap[sym] = afp->abc->inmap[sym];
106       afp->inmap[0]   = esl_abc_XGetUnknown(afp->abc);
107       afp->inmap[' '] = esl_abc_XGetGap(afp->abc);
108     }
109 
110   if (! afp->abc)
111     {
112       for (sym = 1; sym < 128; sym++)
113 	afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
114       afp->inmap[0]   = '?';
115       afp->inmap[' '] = '.'; /* Easel does not allow spaces as gap characters. */
116     }
117   return eslOK;
118 }
119 
120 
121 /* Function:  esl_msafile_selex_GuessAlphabet()
122  * Synopsis:  Guess the alphabet of an open PSI-BLAST MSA file.
123  *
124  * Purpose:   Guess the alpbabet of the sequences in open
125  *            SELEX format MSA file <afp>.
126  *
127  *            On a normal return, <*ret_type> is set to <eslDNA>,
128  *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
129  *            original position.
130  *
131  * Args:      afp      - open SELEX format MSA file
132  *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
133  *
134  * Returns:   <eslOK> on success.
135  *            <eslENOALPHABET> if alphabet type can't be determined.
136  *            In either case, <afp> is rewound to the position it
137  *            started at.
138  */
139 int
esl_msafile_selex_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)140 esl_msafile_selex_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
141 {
142   int       alphatype     = eslUNKNOWN;
143   esl_pos_t anchor        = -1;
144   int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
145   int       nsteps        = 3;
146   int       step          = 0;
147   int       nres          = 0;
148   int       x;
149   int64_t   ct[26];
150   char     *p, *tok;
151   esl_pos_t n,  toklen, pos;
152   int       status;
153 
154   for (x = 0; x < 26; x++) ct[x] = 0;
155 
156   anchor = esl_buffer_GetOffset(afp->bf);
157   if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
158 
159   while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
160     {
161       if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
162       if (*tok == '#') continue; /* comments and annotation */
163       /* p now points to the rest of the sequence line, after a name */
164 
165       /* count characters into ct[] array */
166       for (pos = 0; pos < n; pos++)
167 	if (isalpha(p[pos])) {
168 	  x = toupper(p[pos]) - 'A';
169 	  ct[x]++;
170 	  nres++;
171 	}
172 
173       /* try to stop early, checking after 500, 5000, and 50000 residues: */
174       if (step < nsteps && nres > threshold[step]) {
175 	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
176 	step++;
177       }
178     }
179   if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
180   status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
181 
182  DONE:
183   esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
184   esl_buffer_RaiseAnchor(afp->bf, anchor);
185   *ret_type = alphatype;
186   return status;
187 
188  ERROR:
189   if (anchor != -1) {
190     esl_buffer_SetOffset(afp->bf, anchor);
191     esl_buffer_RaiseAnchor(afp->bf, anchor);
192   }
193   *ret_type = eslUNKNOWN;
194   return status;
195 }
196 
197 
198 /* Function:  esl_msafile_selex_Read()
199  * Synopsis:  Read in a SELEX format alignment.
200  *
201  * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>,
202  *            parsing for SELEX format, starting from the
203  *            current point. (<afp->format> is expected to
204  *            be <eslMSAFILE_SELEX>.) Create a new multiple
205  *            alignment and return it via <*ret_msa>.
206  *            Caller is responsible for free'ing this
207  *            <ESL_MSA>.
208  *
209  * Args:      afp     - open <ESL_MSAFILE>
210  *            ret_msa - RETURN: newly parsed <ESL_MSA>
211  *
212  * Returns:   <eslOK> on success.
213  *
214  *            <eslEOF> if no (more) alignment data are found in
215  *            <afp>, and <afp> is returned at EOF.
216  *
217  *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
218  *            <NULL>. <afp> contains information sufficient for
219  *            constructing useful diagnostic output:
220  *            | <afp->errmsg>       | user-directed error message     |
221  *            | <afp->linenumber>   | line # where error was detected |
222  *            | <afp->line>         | offending line (not NUL-term)   |
223  *            | <afp->n>            | length of offending line        |
224  *            | <afp->bf->filename> | name of the file                |
225  *            and <afp> is poised at the start of the following line,
226  *            so (in principle) the caller could try to resume
227  *            parsing.
228  *
229  * Throws:    <eslEMEM> - an allocation failed.
230  *            <eslESYS> - a system call such as fread() failed
231  *            <eslEINCONCEIVABLE> - "impossible" corruption
232  */
233 int
esl_msafile_selex_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)234 esl_msafile_selex_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
235 {
236   ESL_MSA         *msa     = NULL;
237   ESL_SELEX_BLOCK *b       = NULL;
238   int32_t          nblocks = 0;
239   int              status;
240 
241   ESL_DASSERT1( (afp->format == eslMSAFILE_SELEX) );
242 
243   afp->errmsg[0] = '\0';
244 
245   while ( (status = selex_read_block(afp, &b)) == eslOK)
246     {
247       if      (! nblocks &&  (status = selex_first_block(afp, b, &msa)) != eslOK) goto ERROR;
248       else if (  nblocks &&  (status = selex_other_block(afp, b, msa))  != eslOK) goto ERROR;
249 
250       if ((status = selex_append_block(afp, b, msa)) != eslOK) goto ERROR;
251 
252       esl_buffer_RaiseAnchor(afp->bf, b->anchor);
253       b->anchor = -1;
254 
255       nblocks++;
256     }
257   /* selex_read_block took care of destroying the block! */
258   if (status != eslEOF || nblocks == 0) goto ERROR;
259 
260   msa->offset = 0;
261   if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
262   *ret_msa = msa;
263   return eslOK;
264 
265  ERROR:
266   if (b) {
267     if (b->anchor != -1) esl_buffer_RaiseAnchor(afp->bf, b->anchor);
268     selex_block_Destroy(b);
269   }
270   *ret_msa = NULL;
271   return status;
272 }
273 
274 /* Function:  esl_msafile_selex_Write()
275  * Synopsis:  Write a SELEX format alignment to a stream
276  *
277  * Purpose:   Write alignment <msa> to output stream <fp>,
278  *            in SELEX format. The alignment is written
279  *            in blocks of 60 aligned residues at a time.
280  *
281  * Args:      fp  - open output stream, writable
282  *            msa - alignment to write
283  *
284  * Returns:   <eslOK> on success.
285  *
286  * Throws:    <eslEMEM> on allocation error.
287  *            <eslEWRITE> on any system write error, such
288  *             as a filled disk.
289  */
290 int
esl_msafile_selex_Write(FILE * fp,const ESL_MSA * msa)291 esl_msafile_selex_Write(FILE *fp, const ESL_MSA *msa)
292 {
293   int     cpl        = 60;
294   int     maxnamelen = 4;		/* init to 4 because minimum name field is #=CS, etc. */
295   int     namelen;
296   char   *buf        = NULL;
297   int     i;
298   int64_t apos;
299   int     status;
300 
301   ESL_ALLOC(buf, sizeof(char) * (cpl+1));
302   buf[cpl] = '\0';
303   for (i = 0; i < msa->nseq; i++) {
304     namelen    = strlen(msa->sqname[i]);
305     maxnamelen = ESL_MAX(namelen, maxnamelen);
306   }
307 
308   for (apos = 0; apos < msa->alen; apos += cpl)
309     {
310       if (apos         && fprintf(fp, "\n")                                                      < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
311       if (msa->ss_cons && fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=CS", cpl, msa->ss_cons+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
312       if (msa->rf      && fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=RF", cpl, msa->rf+apos)      < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
313       if (msa->mm      && fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=MM", cpl, msa->mm+apos)      < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
314 
315       for (i = 0; i < msa->nseq; i++)
316 	{
317 	  if (msa->abc)   esl_abc_TextizeN(msa->abc, msa->ax[i]+apos+1, cpl, buf);
318 	  if (! msa->abc) strncpy(buf, msa->aseq[i]+apos, cpl);
319 	  if (fprintf(fp, "%-*s %s\n", maxnamelen, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed");
320 
321 	  if (msa->ss && msa->ss[i]) { if (fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=SS", cpl, msa->ss[i]+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed"); }
322 	  if (msa->sa && msa->sa[i]) { if (fprintf(fp, "%-*s %.*s\n", maxnamelen, "#=SA", cpl, msa->sa[i]+apos) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "selex msa write failed"); }
323 	}
324     }
325 
326   free(buf);
327   return eslOK;
328 
329  ERROR:
330   if (buf) free(buf);
331   return status;
332 }
333 /*--------------------- end, SELEX i/o API ----------------------*/
334 
335 
336 
337 /*****************************************************************
338  * 2. Internal functions handling a block of input lines.
339  *****************************************************************/
340 
341 static ESL_SELEX_BLOCK *
selex_block_Create(int nalloc)342 selex_block_Create(int nalloc)
343 {
344   ESL_SELEX_BLOCK *b = NULL;
345   int              idx;
346   int              status;
347 
348   ESL_ALLOC(b,       sizeof(ESL_SELEX_BLOCK));
349   b->line    = NULL;
350   b->llen    = NULL;
351   b->offsets = NULL;
352   b->linenum = NULL;
353   b->ltype   = NULL;
354   b->lpos    = NULL;
355   b->rpos    = NULL;
356   b->nlines  = 0;
357   b->anchor  = -1;		/* -1 is a flag for "unused" */
358 
359   ESL_ALLOC(b->line,    sizeof(char *)    * nalloc);
360   ESL_ALLOC(b->llen,    sizeof(esl_pos_t) * nalloc);
361   ESL_ALLOC(b->offsets, sizeof(esl_pos_t) * nalloc);
362   ESL_ALLOC(b->linenum, sizeof(int64_t)   * nalloc);
363   ESL_ALLOC(b->ltype,   sizeof(int)       * nalloc);
364   ESL_ALLOC(b->lpos,    sizeof(esl_pos_t) * nalloc);
365   ESL_ALLOC(b->rpos,    sizeof(esl_pos_t) * nalloc);
366   for (idx = 0; idx < nalloc; idx++)
367     {
368       b->line[idx]    = NULL;
369       b->llen[idx]    = 0;
370       b->offsets[idx] = 0;
371       b->linenum[idx] = 0;
372       b->ltype[idx]   = 0;
373       b->lpos[idx]    = 0;
374       b->rpos[idx]    = 0;
375     }
376   b->nalloc = nalloc;
377   return b;
378 
379  ERROR:
380   if (b) selex_block_Destroy(b);
381   return NULL;
382 }
383 
384 static int
selex_block_Grow(ESL_SELEX_BLOCK * b)385 selex_block_Grow(ESL_SELEX_BLOCK *b)
386 {
387   int idx;
388   int status;
389 
390   ESL_REALLOC(b->line,    sizeof(char *)    * b->nalloc * 2);
391   ESL_REALLOC(b->llen,    sizeof(esl_pos_t) * b->nalloc * 2);
392   ESL_REALLOC(b->offsets, sizeof(esl_pos_t) * b->nalloc * 2);
393   ESL_REALLOC(b->linenum, sizeof(int64_t)   * b->nalloc * 2);
394   ESL_REALLOC(b->ltype,   sizeof(int)       * b->nalloc * 2);
395   ESL_REALLOC(b->lpos,    sizeof(esl_pos_t) * b->nalloc * 2);
396   ESL_REALLOC(b->rpos,    sizeof(esl_pos_t) * b->nalloc * 2);
397   for (idx = b->nalloc; idx < b->nalloc*2; idx++)
398     {
399       b->line[idx]    = NULL;
400       b->llen[idx]    = 0;
401       b->offsets[idx] = 0;
402       b->linenum[idx] = 0;
403       b->ltype[idx]   = 0;
404       b->lpos[idx]    = 0;
405       b->rpos[idx]    = 0;
406     }
407   b->nalloc  *= 2;
408   return eslOK;
409 
410  ERROR:
411   return status;
412 }
413 
414 static void
selex_block_Destroy(ESL_SELEX_BLOCK * b)415 selex_block_Destroy(ESL_SELEX_BLOCK *b)
416 {
417   if (!b) return;
418   if (b->line)    free(b->line);
419   if (b->llen)    free(b->llen);
420   if (b->offsets) free(b->offsets);
421   if (b->linenum) free(b->linenum);
422   if (b->ltype)   free(b->ltype);
423   if (b->lpos)    free(b->lpos);
424   if (b->rpos)    free(b->rpos);
425   free(b);
426   return;
427 }
428 /*------- end, internal functions for input line blocks ---------*/
429 
430 
431 
432 
433 /*****************************************************************
434  * 3. Internal functions for parsing SELEX input.
435  *****************************************************************/
436 
437 /* Before we return a parse error,
438  * reset the <afp> so its current line is the one at fault.
439  */
440 static int
selex_ErrorInBlock(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,int which)441 selex_ErrorInBlock(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, int which)
442 {
443   afp->line       = b->line[which];
444   afp->n      = b->llen[which];
445   afp->lineoffset = b->offsets[which];
446   afp->linenumber = b->linenum[which];
447   return esl_buffer_SetOffset(afp->bf, b->offsets[which] + b->llen[which]);
448 }
449 
450 /* selex_read_block:  read one block of alignment data.
451  *
452  * Note that line numbers aren't necessarily consecutive,
453  * because we're stripping out comment lines here. On a parse error
454  * on a specific line, we're going to reset the buffer to that line,
455  * and we'll need the linenumber to do that reset.
456  *
457  * The <afp> detected the end of the block by reading a blank line, or EOF.
458  * Thus its point is at the next line after that blank, or at EOF.
459  *
460  * The <afp> has a stable anchor set at (*block_p)->anchor.
461  * Caller must raise this anchor when it's done parsing the block.
462  *
463  * Returns: <eslOK> on success.
464  *
465  *          <eslEOF> if no more blocks are found in the input.
466  *          <eslEFORMAT> on failure, if a subsequent block has a
467  *          different number of data lines than the first block.
468  *          On normal errors, all the references are returned set to NULL.
469  *
470  * Throws:  <eslEMEM> on allocation failure.
471  */
472 static int
selex_read_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK ** block_p)473 selex_read_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK **block_p)
474 {
475   ESL_SELEX_BLOCK *b      = *block_p; /* now b==NULL if first block; or on subsequent blocks, reuse prev block storage. */
476   int              idx    = 0;
477   int              status;
478 
479   /* Advance past blank lines until we have the first line of next
480    * block.  We may hit a normal EOF here, in which case we return
481    * EOF, we're done.
482    */
483   do {
484     if ( ( status = esl_msafile_GetLine(afp, NULL, NULL)) != eslOK) goto ERROR;                   /* EOF here is a normal EOF   */
485   } while (esl_memspn(afp->line, afp->n, " \t") == afp->n ||                                       /* idiomatic for "blank line" */
486 	   (esl_memstrpfx(afp->line, afp->n, "#") && ! esl_memstrpfx(afp->line, afp->n, "#=")));   /* a SELEX comment line       */
487 
488   /* if this is first block, allocate block; subsequent blocks reuse it */
489   if (!b && (b = selex_block_Create(16)) == NULL) { status = eslEMEM; goto ERROR; }
490 
491   /* Anchor stably at this point. */
492   b->anchor = afp->lineoffset;
493   if ((status = esl_buffer_SetStableAnchor(afp->bf, b->anchor)) != eslOK) goto ERROR;
494 
495   /* Parse for a block of lines. */
496   do {
497     if (b->nalloc && idx == b->nalloc && (status = selex_block_Grow(b)) != eslOK) goto ERROR;
498 
499     b->line[idx]     = afp->line;
500     b->llen[idx]     = afp->n;
501     b->offsets[idx]  = afp->lineoffset;
502     b->linenum[idx]  = afp->linenumber;   /* ltype, lpos, rpos aren't set yet */
503     idx++;
504 
505     /* Get next non-comment line; this can be next line of block, blank (end of block), or EOF. */
506     do {
507       status = esl_msafile_GetLine(afp, NULL, NULL);
508     } while ( status == eslOK && (esl_memstrpfx(afp->line, afp->n, "#") && ! esl_memstrpfx(afp->line, afp->n, "#=")));
509 
510   } while (status == eslOK && esl_memspn(afp->line, afp->n, " \t") < afp->n); /* end of block on EOF or blank line */
511 
512   if (*block_p && b->nlines != idx)
513     ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected %d lines in block, saw %d", b->nlines, idx);
514 
515   b->nlines  = idx;
516   *block_p   = b;
517   return eslOK;	/* EOF status gets turned into OK: we've read a block successfully and hit EOF. Next call will generate the EOF */
518 
519  ERROR:
520   if (b && b->anchor != -1) esl_buffer_RaiseAnchor(afp->bf, b->anchor);
521   if (b) selex_block_Destroy(b);
522   *block_p = NULL;
523   return status;
524 }
525 
526 
527 /* selex_first_block()
528  *
529  * 1. Determine and store line types, in b->ltype[0..b->nlines-1].
530  * 2. From the number of eslSELEX_LINE_SQ lines, we know nseq.
531  * 3. From nseq, we can allocate a new MSA.
532  * 4. Parse each line for sequence names, and store them.
533  * 5. Determine lpos[] for each line.
534  */
535 static int
selex_first_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,ESL_MSA ** ret_msa)536 selex_first_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA **ret_msa)
537 {
538   ESL_MSA  *msa = NULL;
539   int       nrf, nmm, ncs, nss, nsa, nseq;
540   int       has_ss, has_sa;
541   char     *p, *tok;
542   esl_pos_t n,  ntok;
543   int       idx, seqi;
544   int       status;
545 
546   afp->errmsg[0] = '\0';
547 
548   nrf = nmm = ncs = nss = nsa = nseq = 0;
549   has_ss = has_sa = FALSE;
550   for (idx = 0; idx < b->nlines; idx++)
551     {
552       if      (esl_memstrpfx(b->line[idx], b->llen[idx], "#=RF")) { b->ltype[idx] = eslSELEX_LINE_RF; nrf++; }
553       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=MM")) { b->ltype[idx] = eslSELEX_LINE_MM; nmm++; }
554       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=CS")) { b->ltype[idx] = eslSELEX_LINE_CS; ncs++; }
555       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SS")) { b->ltype[idx] = eslSELEX_LINE_SS; nss++; has_ss = TRUE; }
556       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SA")) { b->ltype[idx] = eslSELEX_LINE_SA; nsa++; has_sa = TRUE; }
557       else                                                        { b->ltype[idx] = eslSELEX_LINE_SQ; nseq++; nss = nsa = 0; }
558 
559       if (nss && !nseq) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "#=SS must follow a sequence");   }
560       if (nsa && !nseq) { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "#=SA must follow a sequence");   }
561       if (nrf > 1)      { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=RF lines for block"); }
562       if (ncs > 1)      { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=CS lines for block"); }
563       if (nss > 1)      { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=SS lines for seq");   }
564       if (nsa > 1)      { selex_ErrorInBlock(afp, b, idx); ESL_XFAIL(eslEFORMAT, afp->errmsg, "Too many #=SA lines for seq");   }
565     }
566 
567   if ( afp->abc && (msa = esl_msa_CreateDigital(afp->abc, nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; } /* a growable MSA */
568   if (!afp->abc && (msa = esl_msa_Create(                 nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; }
569   if (has_ss) {
570     ESL_ALLOC(msa->ss, sizeof(char *) * nseq);
571     for (seqi = 0; seqi < nseq; seqi++) msa->ss[seqi] = NULL;
572   }
573   if (has_sa) {
574     ESL_ALLOC(msa->sa, sizeof(char *) * nseq);
575     for (seqi = 0; seqi < nseq; seqi++) msa->sa[seqi] = NULL;
576   }
577   msa->nseq = nseq;
578   msa->alen = 0;
579 
580   for (seqi = 0, idx = 0; idx < b->nlines; idx++)
581     {
582       p = b->line[idx];
583       n = b->llen[idx];
584       if ( esl_memtok(&p, &n, " \t", &tok, &ntok) != eslOK) ESL_XEXCEPTION(eslEINCONCEIVABLE, "can't happen"); /* because a block by definition consists of non-blank lines */
585       if (b->ltype[idx] == eslSELEX_LINE_SQ) /* otherwise, first token is #=XX marking annotation of some sort */
586 	{
587 	  if ((status = esl_msa_SetSeqName(msa, seqi, tok, ntok)) != eslOK) goto ERROR;
588 	  seqi++;
589 	}
590       b->lpos[idx] = (n ? p-b->line[idx] : -1);  /* set lpos[] to position of first seq or annotation residue */
591     }
592 
593   *ret_msa = msa;
594   return eslOK;
595 
596  ERROR:
597   if (msa) esl_msa_Destroy(msa);
598   *ret_msa = NULL;
599   return status;
600 }
601 
602 
603 /* selex_other_block()
604  * We've already parsed the first block.
605  * So we know the order of line types, nseq, and sequence names.
606  * Validate that a subsequent block has the same.
607  */
608 static int
selex_other_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,ESL_MSA * msa)609 selex_other_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa)
610 {
611   char     *p, *tok;
612   esl_pos_t n, ntok;
613   int       idx, seqi;
614 
615   /* Validate line types */
616   for (idx = 0; idx < b->nlines; idx++)
617     {
618       if      (esl_memstrpfx(b->line[idx], b->llen[idx], "#=RF")) { if (b->ltype[idx] != eslSELEX_LINE_RF) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=RF line isn't in expected order in block"); } }
619       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=MM")) { if (b->ltype[idx] != eslSELEX_LINE_MM) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=MM line isn't in expected order in block"); } }
620       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=CS")) { if (b->ltype[idx] != eslSELEX_LINE_CS) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=CS line isn't in expected order in block"); } }
621       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SS")) { if (b->ltype[idx] != eslSELEX_LINE_SS) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=SS line isn't in expected order in block"); } }
622       else if (esl_memstrpfx(b->line[idx], b->llen[idx], "#=SA")) { if (b->ltype[idx] != eslSELEX_LINE_SA) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "#=SA line isn't in expected order in block"); } }
623       else                                                        { if (b->ltype[idx] != eslSELEX_LINE_SQ) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence line isn't in expected order in block"); } }
624     }
625 
626   /* Validate seq names, and set lpos */
627   for (seqi = 0, idx = 0; idx < b->nlines; idx++)
628     {
629       p = b->line[idx];
630       n = b->llen[idx];
631       if ( esl_memtok(&p, &n, " \t", &tok, &ntok) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "can't happen"); /* because a block by definition consists of non-blank lines */
632       if (b->ltype[idx] == eslSELEX_LINE_SQ)
633 	{
634 	  if (! esl_memstrcmp(tok, ntok, msa->sqname[seqi]))  { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "expected sequence %s at this line of block", msa->sqname[seqi]); }
635 	  seqi++;
636 	}
637       b->lpos[idx] = (n ? p-b->line[idx] : -1);  /* set lpos[] to position of first seq or annotation residue */
638     }
639   return eslOK;
640 }
641 
642 static int
selex_append_block(ESL_MSAFILE * afp,ESL_SELEX_BLOCK * b,ESL_MSA * msa)643 selex_append_block(ESL_MSAFILE *afp, ESL_SELEX_BLOCK *b, ESL_MSA *msa)
644 {
645   char     *p;
646   esl_pos_t pos;
647   int       idx, seqi;
648   esl_pos_t leftmost, rightmost;
649   int64_t   nadd;		/* width of this sequence block, in aligned columns added to msa */
650   esl_pos_t nleft, ntext;
651   int64_t   alen;
652   int       status;
653 
654   /* Determine rpos for each line.  */
655   for (idx = 0; idx < b->nlines; idx++)
656     {
657       p   = b->line[idx];
658       pos = b->llen[idx] - 1;
659       while (pos>=0 && isspace(p[pos])) pos--;
660       b->rpos[idx] = ( (pos < b->lpos[idx]) ? -1 : pos); /* -1: a completely blank seq line is valid */
661     }
662 
663   /* Determine leftmost and rightmost positions for entire block */
664   leftmost  = b->lpos[0];
665   rightmost = b->rpos[0];
666   for (idx = 1; idx < b->nlines; idx++) {
667     leftmost  = (b->lpos[idx] == -1) ? leftmost  : ESL_MIN(leftmost,  b->lpos[idx]);
668     rightmost = (b->rpos[idx] == -1) ? rightmost : ESL_MAX(rightmost, b->rpos[idx]);
669   }
670   if (rightmost == -1) return eslOK; /* super special case: no sequence or annotation data in this block at all! */
671   nadd = rightmost - leftmost + 1;
672 
673   /* Appends */
674   for (seqi = 0, idx = 0; idx < b->nlines; idx++)
675     {
676       nleft  = ((b->lpos[idx] != -1) ? b->lpos[idx] - leftmost         : nadd); /* watch special case of all whitespace on data line, lpos>rpos */
677       ntext  = ((b->lpos[idx] != -1) ? b->rpos[idx] - b->lpos[idx] + 1 : 0);
678       //nright = ((b->lpos[idx] != -1) ? rightmost    - b->rpos[idx]     : 0);  // someday you might want to know nright, but for now the code doesn't use it
679 
680       if      (b->ltype[idx] == eslSELEX_LINE_SQ)
681 	{
682 	  if (msa->abc)
683 	    {			/* digital sequence append - mapped, preallocated */
684 	      ESL_REALLOC(msa->ax[seqi],   sizeof(ESL_DSQ) * (msa->alen + nadd + 2));
685 	      if (msa->alen == 0) msa->ax[seqi][0] = eslDSQ_SENTINEL;
686 	      for (alen = msa->alen; alen < msa->alen+nleft;  alen++) msa->ax[seqi][alen+1] = esl_abc_XGetGap(msa->abc);
687 
688 	      status = esl_abc_dsqcat_noalloc(afp->inmap, msa->ax[seqi], &alen, b->line[idx] + b->lpos[idx], ntext);
689 	      if      (status == eslEINVAL) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "illegal residue(s) in sequence line"); }
690 	      else if (status != eslOK)     { selex_ErrorInBlock(afp, b, idx); goto ERROR; }
691 	      if (alen != msa->alen + nleft + ntext) { selex_ErrorInBlock(afp, b, idx); ESL_EXCEPTION(eslEINCONCEIVABLE, afp->errmsg, "unexpected inconsistency appending a sequence"); };
692 
693 	      for (; alen < msa->alen+nadd;   alen++) msa->ax[seqi][alen+1] = esl_abc_XGetGap(msa->abc);
694 	      msa->ax[seqi][alen+1] = eslDSQ_SENTINEL;
695 	    }
696 
697 	  if (! msa->abc)
698 	    {			/* text mode sequence append - mapped, preallocated */
699 	      ESL_REALLOC(msa->aseq[seqi], sizeof(char)    * (msa->alen + nadd + 1));
700 	      for (alen = msa->alen; alen < msa->alen+nleft; alen++) msa->aseq[seqi][alen] = '.';
701 
702 	      status = esl_strmapcat_noalloc(afp->inmap, msa->aseq[seqi], &alen, b->line[idx] + b->lpos[idx], ntext);
703 	      if      (status == eslEINVAL) { selex_ErrorInBlock(afp, b, idx); ESL_FAIL(eslEFORMAT, afp->errmsg, "illegal residue(s) in input line"); }
704 	      else if (status != eslOK)     { selex_ErrorInBlock(afp, b, idx); goto ERROR; }
705 	      if (alen != msa->alen + nleft + ntext) { selex_ErrorInBlock(afp, b, idx); ESL_EXCEPTION(eslEINCONCEIVABLE, afp->errmsg, "unexpected inconsistency appending a sequence"); };
706 
707 	      for  (; alen < msa->alen+nadd;  alen++) msa->aseq[seqi][alen] = '.';
708 	      msa->aseq[seqi][alen] = '\0';
709 	    }
710 	  seqi++;
711 	}
712       else
713 	{			/* annotation append: not mapped, characters are copied exactly as they are */
714 	  if      (b->ltype[idx] == eslSELEX_LINE_RF) { ESL_REALLOC(msa->rf,         sizeof(char) * (msa->alen + nadd + 1)); p = msa->rf;         }
715 	  if      (b->ltype[idx] == eslSELEX_LINE_MM) { ESL_REALLOC(msa->mm,         sizeof(char) * (msa->alen + nadd + 1)); p = msa->mm;         }
716 	  else if (b->ltype[idx] == eslSELEX_LINE_CS) { ESL_REALLOC(msa->ss_cons,    sizeof(char) * (msa->alen + nadd + 1)); p = msa->ss_cons;    }
717 	  else if (b->ltype[idx] == eslSELEX_LINE_SS) { ESL_REALLOC(msa->ss[seqi-1], sizeof(char) * (msa->alen + nadd + 1)); p = msa->ss[seqi-1]; }
718 	  else if (b->ltype[idx] == eslSELEX_LINE_SA) { ESL_REALLOC(msa->sa[seqi-1], sizeof(char) * (msa->alen + nadd + 1)); p = msa->sa[seqi-1]; }
719 
720 	  for (alen = msa->alen; alen < msa->alen+nleft; alen++) p[alen] = '.';
721 	  if (ntext) memcpy(p+msa->alen+nleft, b->line[idx]+b->lpos[idx], sizeof(char)*ntext);
722 	  for (alen = msa->alen+nleft+ntext; alen < msa->alen+nadd; alen++) p[alen] = '.';
723 	  p[alen] = '\0';
724 	}
725     }
726   msa->alen += nadd;
727   return eslOK;
728 
729  ERROR:
730   return status;
731 }
732 
733 
734 /*****************************************************************
735  * 4. Unit tests.
736  *****************************************************************/
737 #ifdef eslMSAFILE_SELEX_TESTDRIVE
738 
739 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)740 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
741 {
742   fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
743   fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
744   fputs("seq3 ACDEFGHIKLMNPQRSTVWY\n", ofp);
745   fputs("seq4 ACDEFGHIKLMNPQRSTVWY\n", ofp);
746   fputs("seq5 ACDEFGHIKLMNPQRSTVWY\n", ofp);
747 
748   *ret_alphatype = eslAMINO;
749   *ret_nseq      = 5;
750   *ret_alen      = 20;
751 }
752 
753 static void
utest_write_good2(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)754 utest_write_good2(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
755 {
756   fputs("# DOS format (\\r\\n), and doesn't end in a newline.\r\n", ofp);
757   fputs("# \r\n", ofp);
758   fputs("#=RF xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
759   fputs("#=CS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
760   fputs("seq1 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
761   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
762   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
763   fputs("seq2 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
764   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
765   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
766   fputs("seq3 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
767   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
768   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
769   fputs("seq4 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
770   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
771   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
772   fputs("seq5 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
773   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
774   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
775   fputs("\r\n", ofp);
776   fputs("#=RF xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
777   fputs("#=CS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
778   fputs("seq1 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
779   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
780   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
781   fputs("seq2 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
782   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
783   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
784   fputs("seq3 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
785   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
786   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
787   fputs("seq4 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
788   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
789   fputs("#=SA xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
790   fputs("seq5 ACDEFGHIKLMNPQRSTVWY\r\n", ofp);
791   fputs("#=SS xxxxxxxxxxxxxxxxxxxx\r\n", ofp);
792   fputs("#=SA xxxxxxxxxxxxxxxxxxxx",    ofp);
793 
794   *ret_alphatype = eslAMINO;
795   *ret_nseq      = 5;
796   *ret_alen      = 40;
797 }
798 
799 static void
utest_write_good3(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)800 utest_write_good3(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
801 {
802   fputs("\n", ofp);
803   fputs("#=CS\n", ofp);
804   fputs("#=RF\n", ofp);
805   fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
806   fputs("long_name GHIKLMNPQRSTVWY\n", ofp);
807   fputs("blank_seq_all_gaps \n", ofp);
808   fputs("seq2 ACDEF---KLMNPQRSTVWY\n", ofp);
809   fputs("seq3 ACDEF...KLMNPQRSTVWY\n", ofp);
810   fputs("# embedded comments ok\n", ofp);
811   fputs("seq4 ACDEFGHIKLMNPQRSTVWY\n", ofp);
812   fputs(" seq5 CDEFGHIKLMNPQRSTVWY\n", ofp);
813   fputs("#=SS \n", ofp);
814   fputs("#=SA\n", ofp);
815   fputs("\n", ofp);
816   fputs("\n", ofp);
817   fputs("\n", ofp);
818   fputs("#=CS\n", ofp);
819   fputs("#=RF\n", ofp);
820   fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
821   fputs("long_name GHIKLMNPQRSTVWY\n", ofp);
822   fputs("blank_seq_all_gaps \n", ofp);
823   fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
824   fputs("seq3 ACDEFGHIKLMNPQRSTVWY\n", ofp);
825   fputs("seq4 ACDEFGHIKLMNPQRSTVWY\n", ofp);
826   fputs("seq5 ACDEFGHIKLMNPQRSTVWY\n", ofp);
827   fputs("#=SS \n", ofp);
828   fputs("#=SA\n", ofp);
829 
830   *ret_alphatype = eslAMINO;
831   *ret_nseq      = 7;
832   *ret_alen      = 52;
833 }
834 
835 static void
utest_write_good4(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)836 utest_write_good4(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
837 {
838   fputs("# A complicated SELEX example\n", ofp);
839   fputs("\n", ofp);
840   fputs("\n", ofp);
841   fputs("#=RF        xxxxxxx xxxx xxxxxx\n", ofp);
842   fputs("#=CS        >>>>+>> ^^^^ <<<<<<\n", ofp);
843   fputs("28    gGAGUAAGAUAGC AUCA GCAUCUUGUUCC\n", ofp);
844   fputs("#=SS  +++++>>>>>+>> ^^^^ <<<<<<<+++++\n", ofp);
845   fputs("longname    GUUCACC AUCA GGGGAc\n", ofp);
846   fputs("#=SS        >>>>+>> ^^^^ <<<<<<\n", ofp);
847   fputs("2     AUGGAUGCGCACC AUCA GGGCGUaucuau\n", ofp);
848   fputs("3           GAUCACC AUCA GGGauc\n", ofp);
849   fputs("4           GGUCACC AUCA GGGauc\n", ofp);
850   fputs("5           GGACACC AUCA GGGucu\n", ofp);
851   fputs("6              CACC AUCA GGG\n", ofp);
852   fputs("7           GAUCACC AUCA GGGauc\n", ofp);
853   fputs("8            CUCACC AUCA GGGGG\n", ofp);
854   fputs("9           AUGCACC AUCA GGGCAU\n", ofp);
855   fputs("10           CUCACC AUCA GGGGG\n", ofp);
856 
857   *ret_alphatype = eslRNA;
858   *ret_nseq      = 11;
859   *ret_alen      = 31;
860 }
861 
862 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)863 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
864 {
865   ESL_ALPHABET        *abc          = NULL;
866   ESL_MSAFILE        *afp          = NULL;
867   ESL_MSA             *msa1         = NULL;
868   ESL_MSA             *msa2         = NULL;
869   char                 tmpfile1[32] = "esltmpXXXXXX";
870   char                 tmpfile2[32] = "esltmpXXXXXX";
871   FILE                *ofp          = NULL;
872   int                  status;
873 
874   /* guessing both the format and the alphabet should work: this is a digital open */
875   if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("selex good file test %d failed: digital open",           testnumber);
876   if (afp->format != eslMSAFILE_SELEX)                                                             esl_fatal("selex good file test %d failed: format autodetection",   testnumber);
877   if (abc->type   != expected_alphatype)                                                           esl_fatal("selex good file test %d failed: alphabet autodetection", testnumber);
878 
879   /* This is a digital read, using <abc>. */
880   if ( (status = esl_msafile_selex_Read(afp, &msa1))   != eslOK)  esl_fatal("selex good file test %d failed: msa read, digital", testnumber);
881   if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("selex good file test %d failed: nseq/alen",         testnumber);
882   if (esl_msa_Validate(msa1, NULL) != eslOK)                      esl_fatal("selex good file test %d failed: msa1 invalid",      testnumber);
883   esl_msafile_Close(afp);
884 
885   /* write it back out to a new tmpfile (digital write) */
886   if ( (status = esl_tmpfile_named(tmpfile1, &ofp))  != eslOK) esl_fatal("selex good file test %d failed: tmpfile creation",   testnumber);
887   if ( (status = esl_msafile_selex_Write(ofp, msa1)) != eslOK) esl_fatal("selex good file test %d failed: msa write, digital", testnumber);
888   fclose(ofp);
889 
890   /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
891   if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_SELEX, NULL, &afp)) != eslOK) esl_fatal("selex good file test %d failed: text mode open", testnumber);
892   if ( (status = esl_msafile_selex_Read(afp, &msa2))                                   != eslOK) esl_fatal("selex good file test %d failed: msa read, text", testnumber);
893   if (msa2->nseq != expected_nseq || msa2->alen != expected_alen)                                esl_fatal("selex good file test %d failed: nseq/alen",      testnumber);
894   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                     esl_fatal("selex good file test %d failed: msa2 invalid",   testnumber);
895   esl_msafile_Close(afp);
896 
897   /* write it back out to a new tmpfile (text write) */
898   if ( (status = esl_tmpfile_named(tmpfile2, &ofp))   != eslOK) esl_fatal("selex good file test %d failed: tmpfile creation", testnumber);
899   if ( (status = esl_msafile_selex_Write(ofp, msa2))  != eslOK) esl_fatal("selex good file test %d failed: msa write, text",  testnumber);
900   fclose(ofp);
901   esl_msa_Destroy(msa2);
902 
903   /* open and read it in digital mode */
904   if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_SELEX, NULL, &afp)) != eslOK) esl_fatal("selex good file test %d failed: 2nd digital mode open", testnumber);
905   if ( (status = esl_msafile_selex_Read(afp, &msa2))                                   != eslOK) esl_fatal("selex good file test %d failed: 2nd digital msa read",  testnumber);
906   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                     esl_fatal("selex good file test %d failed: msa2 invalid",          testnumber);
907   esl_msafile_Close(afp);
908 
909   /* this msa <msa2> should be identical to <msa1> */
910   if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("selex good file test %d failed: msa compare", testnumber);
911 
912   remove(tmpfile1);
913   remove(tmpfile2);
914   esl_msa_Destroy(msa1);
915   esl_msa_Destroy(msa2);
916   esl_alphabet_Destroy(abc);
917 }
918 
919 
920 static void
write_test_msas(FILE * ofp1,FILE * ofp2)921 write_test_msas(FILE *ofp1, FILE *ofp2)
922 {
923   fprintf(ofp1, "# selex comments ignored \n");
924   fprintf(ofp1, "#=RF ..xxxxxxxxxxxxxxxxxxxx\n");
925   fprintf(ofp1, "seq1 ..acdefghiklmnpqrstvwy\n");
926   fprintf(ofp1, "seq2 ..acdefghiklmnpqrstv--\n");
927   fprintf(ofp1, "seq3 aaacdefghiklmnpqrstv--\n");
928   fprintf(ofp1, "# selex comments ignored \n");
929   fprintf(ofp1, "seq4 ..acdefghiklmnpqrstvwy\n");
930   fprintf(ofp1, "\n");
931   fprintf(ofp1, "#=RF xxxxxxxxxxxxxxxxxxxx..\n");
932   fprintf(ofp1, "seq1 ACDEFGHIKLMNPQRSTVWY..\n");
933   fprintf(ofp1, "seq2 ACDEFGHIKLMNPQRSTVWYYY\n");
934   fprintf(ofp1, "seq3 ACDEFGHIKLMNPQRSTVWY..\n");
935   fprintf(ofp1, "seq4 ACDEFGHIKLMNPQRSTVWY..\n");
936   fprintf(ofp1, "\n");
937 
938   fprintf(ofp2, "# STOCKHOLM 1.0\n");
939   fprintf(ofp2, "\n");
940   fprintf(ofp2, "#=GC RF ..xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..\n");
941   fprintf(ofp2, "seq1    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
942   fprintf(ofp2, "seq2    ..acdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWYYY\n");
943   fprintf(ofp2, "seq3    aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
944   fprintf(ofp2, "seq4    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
945   fprintf(ofp2, "//\n");
946 }
947 
948 static void
read_test_msas_digital(char * slxfile,char * stkfile)949 read_test_msas_digital(char *slxfile, char *stkfile)
950 {
951   char msg[]         = "SELEX msa digital read unit test failed";
952   ESL_ALPHABET *abc  = NULL;
953   ESL_MSAFILE *afp1 = NULL;
954   ESL_MSAFILE *afp2 = NULL;
955   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
956   FILE         *slxfp, *stkfp;
957   char          slxfile2[32]  = "esltmpslx2XXXXXX";
958   char          stkfile2[32]  = "esltmpstk2XXXXXX";
959 
960   if ( esl_msafile_Open(&abc, slxfile, NULL, eslMSAFILE_SELEX,     NULL, &afp1)   != eslOK)  esl_fatal(msg);
961   if ( !abc || abc->type != eslAMINO)                                                        esl_fatal(msg);
962   if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2)   != eslOK)  esl_fatal(msg);
963   if ( esl_msafile_selex_Read    (afp1, &msa1)                                    != eslOK)  esl_fatal(msg);
964   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                    != eslOK)  esl_fatal(msg);
965   if ( esl_msa_Compare(msa1, msa2)                                                != eslOK)  esl_fatal(msg);
966 
967   if ( esl_msafile_selex_Read    (afp1, &msa3)  != eslEOF) esl_fatal(msg);
968   if ( esl_msafile_stockholm_Read(afp2, &msa3)  != eslEOF) esl_fatal(msg);
969 
970   esl_msafile_Close(afp2);
971   esl_msafile_Close(afp1);
972 
973   /* Now write stk to selex file, and vice versa; then retest */
974   if ( esl_tmpfile_named(slxfile2, &slxfp)                                  != eslOK) esl_fatal(msg);
975   if ( esl_tmpfile_named(stkfile2, &stkfp)                                  != eslOK) esl_fatal(msg);
976   if ( esl_msafile_selex_Write    (slxfp, msa2)                             != eslOK) esl_fatal(msg);
977   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)       != eslOK) esl_fatal(msg);
978   fclose(slxfp);
979   fclose(stkfp);
980   if ( esl_msafile_Open(&abc, slxfile2, NULL, eslMSAFILE_SELEX,     NULL, &afp1) != eslOK) esl_fatal(msg);
981   if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
982   if ( esl_msafile_selex_Read    (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
983   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
984   if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);
985 
986   remove(slxfile2);
987   remove(stkfile2);
988   esl_msafile_Close(afp2);
989   esl_msafile_Close(afp1);
990 
991   esl_msa_Destroy(msa1);
992   esl_msa_Destroy(msa2);
993   esl_msa_Destroy(msa3);
994   esl_msa_Destroy(msa4);
995   esl_alphabet_Destroy(abc);
996 }
997 
998 static void
read_test_msas_text(char * slxfile,char * stkfile)999 read_test_msas_text(char *slxfile, char *stkfile)
1000 {
1001   char msg[]         = "SELEX msa text-mode read unit test failed";
1002   ESL_MSAFILE *afp1 = NULL;
1003   ESL_MSAFILE *afp2 = NULL;
1004   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
1005   FILE         *slxfp, *stkfp;
1006   char          slxfile2[32]  = "esltmpslx2XXXXXX";
1007   char          stkfile2[32]  = "esltmpstk2XXXXXX";
1008 
1009   /*                     vvvv-- everything's the same as the digital utest except these NULLs  */
1010   if ( esl_msafile_Open(NULL, slxfile, NULL, eslMSAFILE_SELEX,     NULL, &afp1)   != eslOK)  esl_fatal(msg);
1011   if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2)   != eslOK)  esl_fatal(msg);
1012   if ( esl_msafile_selex_Read    (afp1, &msa1)                                    != eslOK)  esl_fatal(msg);
1013   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                    != eslOK)  esl_fatal(msg);
1014   if ( esl_msa_Compare(msa1, msa2)                                                != eslOK)  esl_fatal(msg);
1015   if ( esl_msafile_selex_Read    (afp1, &msa3)                                    != eslEOF) esl_fatal(msg);
1016   if ( esl_msafile_stockholm_Read(afp2, &msa3)                                    != eslEOF) esl_fatal(msg);
1017   esl_msafile_Close(afp2);
1018   esl_msafile_Close(afp1);
1019 
1020   if ( esl_tmpfile_named(slxfile2, &slxfp)                               != eslOK) esl_fatal(msg);
1021   if ( esl_tmpfile_named(stkfile2, &stkfp)                               != eslOK) esl_fatal(msg);
1022   if ( esl_msafile_selex_Write    (slxfp, msa2)                          != eslOK) esl_fatal(msg);
1023   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)    != eslOK) esl_fatal(msg);
1024   fclose(slxfp);
1025   fclose(stkfp);
1026   if ( esl_msafile_Open(NULL, slxfile2, NULL, eslMSAFILE_SELEX,     NULL, &afp1)  != eslOK) esl_fatal(msg);
1027   if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2)  != eslOK) esl_fatal(msg);
1028   if ( esl_msafile_selex_Read    (afp1, &msa3)                                    != eslOK) esl_fatal(msg);
1029   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                    != eslOK) esl_fatal(msg);
1030   if ( esl_msa_Compare(msa3, msa4)                                                != eslOK) esl_fatal(msg);
1031 
1032   remove(slxfile2);
1033   remove(stkfile2);
1034   esl_msafile_Close(afp2);
1035   esl_msafile_Close(afp1);
1036 
1037   esl_msa_Destroy(msa1);
1038   esl_msa_Destroy(msa2);
1039   esl_msa_Destroy(msa3);
1040   esl_msa_Destroy(msa4);
1041 }
1042 #endif /*eslMSAFILE_SELEX_TESTDRIVE*/
1043 /*---------------------- end, unit tests ------------------------*/
1044 
1045 
1046 /*****************************************************************
1047  * 5. Test driver.
1048  *****************************************************************/
1049 #ifdef eslMSAFILE_SELEX_TESTDRIVE
1050 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_selex_utest -DeslMSAFILE_SELEX_TESTDRIVE esl_msafile_selex.c -leasel -lm
1051  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_selex_utest -DeslMSAFILE_SELEX_TESTDRIVE esl_msafile_selex.c -leasel -lm
1052  * run:     ./esl_msafile_selex_utest
1053  */
1054 #include "esl_config.h"
1055 
1056 #include <stdio.h>
1057 
1058 #include "easel.h"
1059 #include "esl_getopts.h"
1060 #include "esl_random.h"
1061 #include "esl_msafile.h"
1062 #include "esl_msafile_selex.h"
1063 
1064 static ESL_OPTIONS options[] = {
1065    /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
1066   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
1067   { 0,0,0,0,0,0,0,0,0,0},
1068 };
1069 static char usage[]  = "[-options]";
1070 static char banner[] = "test driver for SELEX MSA format module";
1071 
1072 int
main(int argc,char ** argv)1073 main(int argc, char **argv)
1074 {
1075   char            msg[]        = "PSI-BLAST MSA i/o module test driver failed";
1076   ESL_GETOPTS    *go           = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1077   char            slxfile[32]  = "esltmpslxXXXXXX";
1078   char            stkfile[32]  = "esltmpstkXXXXXX";
1079   FILE           *slxfp, *stkfp;
1080   int             testnumber;
1081   int             ngoodtests = 4;
1082   char            tmpfile[32];
1083   FILE           *ofp;
1084   int             expected_alphatype;
1085   int             expected_nseq;
1086   int             expected_alen;
1087 
1088   if ( esl_tmpfile_named(slxfile, &slxfp) != eslOK) esl_fatal(msg);
1089   if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
1090   write_test_msas(slxfp, stkfp);
1091   fclose(slxfp);
1092   fclose(stkfp);
1093 
1094   read_test_msas_digital(slxfile, stkfile);
1095   read_test_msas_text   (slxfile, stkfile);
1096 
1097   /* Various "good" files that should be parsed correctly */
1098   for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
1099     {
1100       strcpy(tmpfile, "esltmpXXXXXX");
1101       if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
1102       switch (testnumber) {
1103       case  1:  utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1104       case  2:  utest_write_good2 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1105       case  3:  utest_write_good3 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1106       case  4:  utest_write_good4 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
1107       }
1108       fclose(ofp);
1109       utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
1110       remove(tmpfile);
1111     }
1112 
1113 
1114   remove(slxfile);
1115   remove(stkfile);
1116   esl_getopts_Destroy(go);
1117   return 0;
1118 }
1119 #endif /*eslMSAFILE_SELEX_TESTDRIVE*/
1120 /*--------------------- end, test driver ------------------------*/
1121 
1122 
1123 
1124 
1125 /*****************************************************************
1126  * 6. Examples.
1127  *****************************************************************/
1128 
1129 #ifdef eslMSAFILE_SELEX_EXAMPLE
1130 /* A full-featured example of reading/writing an MSA in SELEX format(s).
1131    gcc -g -Wall -o esl_msafile_selex_example -I. -L. -DeslMSAFILE_SELEX_EXAMPLE esl_msafile_selex.c -leasel -lm
1132    ./esl_msafile_selex_example <msafile>
1133  */
1134 /*::cexcerpt::msafile_selex_example::begin::*/
1135 #include <stdio.h>
1136 
1137 #include "easel.h"
1138 #include "esl_alphabet.h"
1139 #include "esl_getopts.h"
1140 #include "esl_msa.h"
1141 #include "esl_msafile.h"
1142 #include "esl_msafile_selex.h"
1143 
1144 static ESL_OPTIONS options[] = {
1145   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
1146   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",        0 },
1147   { "-1",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "no autodetection; force SELEX format",        0 },
1148   { "-q",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "quieter: don't write msa back, just summary", 0 },
1149   { "-t",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",          0 },
1150   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is DNA",                0 },
1151   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is RNA",                0 },
1152   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is protein",            0 },
1153   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1154 };
1155 static char usage[]  = "[-options] <msafile>";
1156 static char banner[] = "example of guessing, reading, writing SELEX format";
1157 
1158 int
main(int argc,char ** argv)1159 main(int argc, char **argv)
1160 {
1161   ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1162   char               *filename    = esl_opt_GetArg(go, 1);
1163   int                 infmt       = eslMSAFILE_UNKNOWN;
1164   ESL_ALPHABET       *abc         = NULL;
1165   ESL_MSAFILE        *afp         = NULL;
1166   ESL_MSA            *msa         = NULL;
1167   int                 status;
1168 
1169   if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_SELEX;
1170 
1171   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
1172   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
1173   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1174 
1175   /* Text mode: pass NULL for alphabet.
1176    * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
1177    */
1178   if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
1179   else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
1180   if (status != eslOK) esl_msafile_OpenFailure(afp, status);
1181 
1182   if ( (status = esl_msafile_selex_Read(afp, &msa)) != eslOK)
1183     esl_msafile_ReadFailure(afp, status);
1184 
1185   printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
1186   printf("# of seqs:      %d\n", msa->nseq);
1187   printf("# of cols:      %d\n", (int) msa->alen);
1188   printf("\n");
1189 
1190   if (! esl_opt_GetBoolean(go, "-q"))
1191     esl_msafile_selex_Write(stdout, msa);
1192 
1193   esl_msa_Destroy(msa);
1194   esl_msafile_Close(afp);
1195   if (abc) esl_alphabet_Destroy(abc);
1196   esl_getopts_Destroy(go);
1197   exit(0);
1198 }
1199 /*::cexcerpt::msafile_selex_example::end::*/
1200 #endif /*eslMSAFILE_SELEX_EXAMPLE*/
1201 
1202 #ifdef eslMSAFILE_SELEX_EXAMPLE2
1203 /* A minimal example. Read SELEX MSA, in text mode.
1204    gcc -g -Wall -o esl_msafile_selex_example2 -I. -L. -DeslMSAFILE_SELEX_EXAMPLE2 esl_msafile_selex.c -leasel -lm
1205    ./esl_msafile_selex_example2 <msafile>
1206  */
1207 
1208 /*::cexcerpt::msafile_selex_example2::begin::*/
1209 #include <stdio.h>
1210 
1211 #include "easel.h"
1212 #include "esl_msa.h"
1213 #include "esl_msafile.h"
1214 #include "esl_msafile_selex.h"
1215 
1216 int
main(int argc,char ** argv)1217 main(int argc, char **argv)
1218 {
1219   char         *filename = argv[1];
1220   int           fmt      = eslMSAFILE_SELEX;
1221   ESL_MSAFILE  *afp      = NULL;
1222   ESL_MSA      *msa      = NULL;
1223   int           status;
1224 
1225   if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK)  esl_msafile_OpenFailure(afp, status);
1226   if ( (status = esl_msafile_selex_Read(afp, &msa))                       != eslOK)  esl_msafile_ReadFailure(afp, status);
1227 
1228   printf("%6d seqs, %5d columns\n",  msa->nseq, (int) msa->alen);
1229 
1230   esl_msafile_selex_Write(stdout, msa);
1231 
1232   esl_msa_Destroy(msa);
1233   esl_msafile_Close(afp);
1234   exit(0);
1235 }
1236 /*::cexcerpt::msafile_selex_example2::end::*/
1237 #endif /*eslMSAFILE_SELEX_EXAMPLE2*/
1238 /*--------------------- end of example --------------------------*/
1239