1 /* I/O of multiple sequence alignment files in Stockholm/Pfam format
2  * (Pfam format = always single block; Stockholm = multiblock allowed)
3  *
4  * Contents:
5  *   1. API for reading/writing Stockholm/Pfam input.
6  *   2. Internal: ESL_STOCKHOLM_PARSEDATA auxiliary structure.
7  *   3. Internal: parsing Stockholm line types.
8  *   4. Internal: looking up seq, tag indices.
9  *   5. Internal: writing Stockholm/Pfam formats
10  *   6. Unit tests.
11  *   7. Test driver.
12  *   8. Example.
13  */
14 #include "esl_config.h"
15 
16 #include <string.h>
17 #include <ctype.h>
18 
19 #include "easel.h"
20 #include "esl_alphabet.h"
21 #include "esl_mem.h"
22 #include "esl_msa.h"
23 #include "esl_msafile.h"
24 
25 #include "esl_msafile_stockholm.h"
26 
27 /* Valid line types in an alignment block */
28 #define eslSTOCKHOLM_LINE_SQ        1
29 #define eslSTOCKHOLM_LINE_GC_SSCONS 2
30 #define eslSTOCKHOLM_LINE_GC_SACONS 3
31 #define eslSTOCKHOLM_LINE_GC_PPCONS 4
32 #define eslSTOCKHOLM_LINE_GC_RF     5
33 #define eslSTOCKHOLM_LINE_GC_OTHER  6
34 #define eslSTOCKHOLM_LINE_GR_SS     7
35 #define eslSTOCKHOLM_LINE_GR_SA     8
36 #define eslSTOCKHOLM_LINE_GR_PP     9
37 #define eslSTOCKHOLM_LINE_GR_OTHER  10
38 #define eslSTOCKHOLM_LINE_GC_MM     11
39 
40 typedef struct {
41   /* information about the size of the growing alignment parse */
42   int       nseq;		/* # of sqnames currently stored, sqname[0..nseq-1]. Copy of msa->nseq */
43   int64_t   alen;		/* alignment length not including current block being parsed. Becomes msa->alen when done */
44 
45   /* Having to do with the expected order of lines in each Stockholm block: */
46   int       in_block;		/* TRUE if we're in a block (GC, GR, or sequence lines) */
47   char     *blinetype;		/* blinetype[bi=0..npb-1] = code for linetype on parsed block line [bi]: GC, GR, or seq  */
48   int      *bidx;		/* bidx[bi=0.npb-1] = seq index si=0..nseq-1 of seq or GR on parsed block line [bi]; or -1 for GC lines */
49   int       npb;		/* number of lines per block. Set by bi in 1st block; checked against bi thereafter */
50   int       bi;			/* index of current line in a block, 0..npb-1  */
51   int       si;		        /* current (next expected) sequence index, 0..nseq */
52   int       balloc;		/* number of lines per block currently allocated for. */
53 
54   /* Other information kept per block */
55   int       nblock;		/* current block number (starting at 0 while in first block) */
56   int       nseq_b;             /* number of sequences seen in this block so far */
57   int64_t   alen_b;     	/* residues added by each seq field in curr block            */
58 
59   /* Having to do with the growing lengths (and numbers) of sequences and annotations in <msa>: */
60   /* yes, needed: used to catch dup lines in a block, such as seq1 xxx, seq1 xxx.               */
61   int64_t    ssconslen;		/* current length of #=GC SS_cons annotation */
62   int64_t    saconslen;		/* current length of #=GC SA_cons annotation */
63   int64_t    ppconslen;		/* current length of #=GC PP_cons annotation */
64   int64_t    rflen;		    /* current length of #=GC RF annotation */
65   int64_t    mmasklen;    /* current length of #=GC MM annotation */
66   int64_t   *sqlen;		/* current lengths of ax[0..nseq-1] or aseq[0..nseq-1]  */
67   int64_t   *sslen;		/* current lengths of ss[0..nseq-1] */
68   int64_t   *salen;		/* current lengths of sa[0..nseq-1] */
69   int64_t   *pplen;		/* current lengths of pp[0..nseq-1] */
70   int64_t   *ogc_len;		/* current lengths of unparsed gc[0..ngc-1]  */
71   int64_t  **ogr_len;		/* current lengths of unparsed gr[0..ngr-1][0..nseq-1] */
72   int        salloc;		/* # of sqnames currently allocated for (synced to msa->sqalloc) */
73 } ESL_STOCKHOLM_PARSEDATA;
74 
75 static ESL_STOCKHOLM_PARSEDATA *stockholm_parsedata_Create(ESL_MSA *msa);
76 static int                      stockholm_parsedata_ExpandSeq  (ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa);
77 static int                      stockholm_parsedata_ExpandBlock(ESL_STOCKHOLM_PARSEDATA *pd);
78 static void                     stockholm_parsedata_Destroy    (ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa);
79 
80 static int stockholm_parse_gf(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
81 static int stockholm_parse_gs(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
82 static int stockholm_parse_gc(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
83 static int stockholm_parse_gr(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
84 static int stockholm_parse_sq(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
85 static int stockholm_parse_comment(ESL_MSA *msa, char *p, esl_pos_t n);
86 
87 static int stockholm_get_seqidx   (ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *name, esl_pos_t n,      int *ret_idx);
88 static int stockholm_get_gr_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag,  esl_pos_t taglen, int *ret_tagidx);
89 static int stockholm_get_gc_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag,  esl_pos_t taglen, int *ret_tagidx);
90 
91 static int stockholm_write(FILE *fp, const ESL_MSA *msa, int64_t cpl);
92 
93 
94 /*****************************************************************
95  *# 1. API for reading/writing Stockholm input.
96  *****************************************************************/
97 
98 /* Function:  esl_msafile_stockholm_SetInmap()
99  * Synopsis:  Configure the input map for Stockholm format.
100  *
101  * Purpose:   Configure <afp->inmap> for Stockholm format.
102  *
103  *            Text mode accepts any <isgraph()> character.
104  *            Digital mode enforces the usual Easel alphabets.
105  *
106  *            No characters may be ignored in the input. We cannot
107  *            skip whitespace in the inmap, because we'd misalign
108  *            relative to the text-mode annotation lines (GR, GC),
109  *            where we don't do mapped input.)
110  */
111 int
esl_msafile_stockholm_SetInmap(ESL_MSAFILE * afp)112 esl_msafile_stockholm_SetInmap(ESL_MSAFILE *afp)
113 {
114   int sym;
115 
116   if (afp->abc)
117     {
118       for (sym = 0; sym < 128; sym++)
119 	afp->inmap[sym] = afp->abc->inmap[sym];
120       afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
121     }
122 
123   if (! afp->abc)
124     {
125       for (sym = 1; sym < 128; sym++)
126 	afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
127       afp->inmap[0] = '?';
128     }
129   return eslOK;
130 }
131 
132 
133 /* Function:  esl_msafile_stockholm_GuessAlphabet()
134  * Synopsis:  Guess the alphabet of an open Stockholm MSA file.
135  *
136  * Purpose:   Guess the alphabet of the sequences in open
137  *            Stockholm-format MSA file <afp>.
138  *
139  *            On a normal return, <*ret_type> is set to <eslDNA>,
140  *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
141  *            original position.
142  *
143  * Args:      afp      - open Stockholm-format MSA file
144  *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
145  *
146  * Returns:   <eslOK> on success.
147  *            <eslENOALPHABET> if alphabet type can't be determined.
148  *            In either case, <afp> is rewound to the position it
149  *            started at.
150  */
151 int
esl_msafile_stockholm_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)152 esl_msafile_stockholm_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
153 {
154   int       alphatype     = eslUNKNOWN;
155   esl_pos_t anchor        = -1;
156   int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
157   int       nsteps        = 3;
158   int       step          = 0;
159   int       nres          = 0;
160   int       x;
161   int64_t   ct[26];
162   char     *p, *tok;
163   esl_pos_t n,  toklen, pos;
164   int       status;
165 
166   for (x = 0; x < 26; x++) ct[x] = 0;
167 
168   anchor = esl_buffer_GetOffset(afp->bf);
169   if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
170 
171   while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
172     {
173       if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK || *tok == '#') continue; /* blank lines, annotation, comments */
174       /* p now points to the rest of the sequence line */
175 
176       /* count characters into ct[] array */
177       for (pos = 0; pos < n; pos++)
178 	if (isalpha(p[pos])) {
179 	  x = toupper(p[pos]) - 'A';
180 	  ct[x]++;
181 	  nres++;
182 	}
183 
184       /* try to stop early, checking after 500, 5000, and 50000 residues: */
185       if (step < nsteps && nres > threshold[step]) {
186 	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
187 	step++;
188       }
189     }
190   if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
191   status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
192 
193  DONE:
194   esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
195   esl_buffer_RaiseAnchor(afp->bf, anchor);
196   *ret_type = alphatype;
197   return status;
198 
199  ERROR:
200   if (anchor != -1) {
201     esl_buffer_SetOffset(afp->bf, anchor);
202     esl_buffer_RaiseAnchor(afp->bf, anchor);
203   }
204   *ret_type = eslUNKNOWN;
205   return status;
206 }
207 
208 
209 /* Function:  esl_msafile_stockholm_Read()
210  * Synopsis:  Read an alignment in Stockholm format.
211  *
212  * Purpose:   Read an MSA from open <ESL_MSAFILE> <afp>,
213  *            parsing for Stockholm format. Create a new
214  *            MSA, and return it by reference through
215  *            <*ret_msa>. Caller is responsible for freeing
216  *            this <ESL_MSA>.
217  *
218  * Args:      <afp>     - open <ESL_MSAFILE> to read from
219  *            <ret_msa> - RETURN: newly parsed, created <ESL_MSA>
220  *
221  * Returns:   <eslOK> on success. <*ret_msa> contains the newly
222  *            allocated MSA. <afp> is poised at start of next
223  *            alignment record, or is at EOF.
224  *
225  *            <eslEOF> if no (more) alignment data are found in
226  *            <afp>, and <afp> is returned at EOF.
227  *
228  *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
229  *            <NULL>. <afp> contains information sufficient for
230  *            constructing useful diagnostic output:
231  *            | <afp->errmsg>       | user-directed error message     |
232  *            | <afp->linenumber>   | line # where error was detected |
233  *            | <afp->line>         | offending line (not NUL-term)   |
234  *            | <afp->n>            | length of offending line        |
235  *            | <afp->bf->filename> | name of the file                |
236  *            and <afp> is poised at the start of the following line,
237  *            so (in principle) the caller could try to resume
238  *            parsing.
239  *
240  * Throws:    <eslEMEM> on allocation error.
241  *            <eslESYS> if a system call fails, such as fread().
242  *            <*ret_msa> is returned <NULL>.
243  */
244 int
esl_msafile_stockholm_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)245 esl_msafile_stockholm_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
246 {
247   ESL_MSA                 *msa      = NULL;
248   ESL_STOCKHOLM_PARSEDATA *pd       = NULL;
249   char                    *p;
250   esl_pos_t                n;
251   int                      idx;
252   int                      status;
253 
254   ESL_DASSERT1( (afp->format == eslMSAFILE_PFAM || afp->format == eslMSAFILE_STOCKHOLM) );
255 
256   afp->errmsg[0] = '\0';
257 
258   /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */
259   if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
260   if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
261   if ( (pd = stockholm_parsedata_Create(msa))                        == NULL) { status = eslEMEM; goto ERROR; }
262 
263   /* Skip leading blank lines in file. EOF here is a normal EOF return. */
264   do {
265     if ( ( status = esl_msafile_GetLine(afp, &p, &n)) != eslOK) goto ERROR;  /* eslEOF is OK here - end of input (eslEOF) [eslEMEM|eslESYS] */
266   } while (esl_memspn(afp->line, afp->n, " \t") == afp->n ||                  /* skip blank lines             */
267 	   (esl_memstrpfx(afp->line, afp->n, "#")                             /* and skip comment lines       */
268 	    && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));           /* but stop on Stockholm header */
269 
270   /* Check for the magic Stockholm header */
271   if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1."))  ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing Stockholm header");
272 
273   while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK) /* (eslEOF) [eslEMEM|eslESYS] */
274     {
275       while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
276 
277       if (!n || esl_memstrpfx(p, n, "//"))
278 	{ /* blank lines and the Stockholm end-of-record // trigger end-of-block logic */
279 	  if (pd->in_block) {
280 	    if (pd->nblock) { if (pd->nseq_b != pd->nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "number of seqs in block did not match number in earlier block(s)");     }
281 	    else            { if (pd->nseq_b < pd->nseq)  ESL_XFAIL(eslEFORMAT, afp->errmsg, "number of seqs in block did not match number annotated by #=GS lines"); };
282 	    if (pd->nblock) { if (pd->bi != pd->npb)      ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of lines in alignment block"); }
283 
284 	    pd->nseq     = msa->nseq = pd->nseq_b;
285 	    pd->alen    += pd->alen_b;
286 	    pd->in_block = FALSE;
287 	    pd->npb      = pd->bi;
288 	    pd->bi       = 0;
289 	    pd->si       = 0;
290 	    pd->nblock  += 1;
291 	    pd->nseq_b   = 0;
292 	    pd->alen_b   = 0;
293 	  }
294 	  if   (esl_memstrpfx(p, n, "//"))   break; /* Stockholm end-of-record marker */
295 	  else continue;			    /* else, on to next block */
296 	}
297 
298       if (*p == '#')
299 	{
300 	  if      (esl_memstrpfx(p, n, "#=GF")) { if ((status = stockholm_parse_gf     (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
301 	  else if (esl_memstrpfx(p, n, "#=GS")) { if ((status = stockholm_parse_gs     (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
302 	  else if (esl_memstrpfx(p, n, "#=GC")) { if ((status = stockholm_parse_gc     (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
303 	  else if (esl_memstrpfx(p, n, "#=GR")) { if ((status = stockholm_parse_gr     (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
304 	  else if (esl_memstrcmp(p, n, "# STOCKHOLM 1.0")) ESL_XFAIL(eslEFORMAT, afp->errmsg, "two # STOCKHOLM 1.0 headers in a row?");
305 	  else                                  { if ((status = stockholm_parse_comment(         msa, p, n)) != eslOK) goto ERROR; }
306 	}
307       else if (                                       (status = stockholm_parse_sq     (afp, pd, msa, p, n)) != eslOK) goto ERROR;
308     }
309   if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing // terminator after MSA");
310   else if (status != eslOK)  goto ERROR;
311   if (pd->nblock == 0)       ESL_XFAIL(eslEFORMAT, afp->errmsg, "no alignment data followed Stockholm header");
312 
313   msa->alen = pd->alen;
314 
315   /* Stockholm file can set weights. If eslMSA_HASWGTS flag is up, at least one was set: then all must be. */
316   if (msa->flags & eslMSA_HASWGTS)
317     {
318       for (idx = 0; idx < msa->nseq; idx++)
319 	if (msa->wgt[idx] == -1.0) ESL_XFAIL(eslEFORMAT, afp->errmsg, "stockholm record ended without a weight for %s", msa->sqname[idx]);
320     }
321   else if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
322 
323   stockholm_parsedata_Destroy(pd, msa);
324   *ret_msa  = msa;
325   return eslOK;
326 
327  ERROR:
328   if (pd)  stockholm_parsedata_Destroy(pd, msa);
329   if (msa) esl_msa_Destroy(msa);
330   *ret_msa = NULL;
331   return status;
332 }
333 
334 
335 /* Function:  esl_msafile_stockholm_Write()
336  * Synopsis:  Write a Stockholm format alignment to a stream.
337  *
338  * Purpose:   Write alignment <msa> to output stream <fp>, in Stockholm
339  *            format. <fmt> may either be <eslMSAFILE_STOCKHOLM> or
340  *            <eslMSAFILE_PFAM>.  <eslMSAFILE_PFAM> puts the alignment
341  *            into a single block, one alignment line per sequence.
342  *            <eslMSAFILE_STOCKHOLM> is a multiple block format, with
343  *            a width of 200 aligned residues per line.
344  *
345  * Args:      fp  - open output stream, writable
346  *            msa - alignment to write
347  *            fmt - eslMSAFILE_STOCKHOLM | eslMSAFILE_PFAM
348  *
349  * Returns:   <eslOK> on success.
350  *
351  * Throws:    <eslEMEM> on allocation error.
352  *            <eslEWRITE> on any system write error, such as filled disk.
353  */
354 int
esl_msafile_stockholm_Write(FILE * fp,const ESL_MSA * msa,int fmt)355 esl_msafile_stockholm_Write(FILE *fp, const ESL_MSA *msa, int fmt)
356 {
357   switch (fmt) {
358   case eslMSAFILE_PFAM:       return stockholm_write(fp, msa, msa->alen);
359   case eslMSAFILE_STOCKHOLM:  return stockholm_write(fp, msa, 200);
360   }
361   return eslEINCONCEIVABLE;
362 }
363 /*--------------- end, api for stockholm i/o --------------------*/
364 
365 
366 /*****************************************************************
367  * 2. Internal: ESL_STOCKHOLM_PARSEDATA auxiliary structure
368  *****************************************************************/
369 
370 /* The auxiliary parse data is sufficient to validate each line as we
371  * see it. Our design requires that we immediately report any errors
372  * and the line number they occur on. We do not want to detect errors
373  * in some later validation step, after we've lost track of original
374  * line numbers of the input.
375  */
376 
377 static ESL_STOCKHOLM_PARSEDATA *
stockholm_parsedata_Create(ESL_MSA * msa)378 stockholm_parsedata_Create(ESL_MSA *msa)
379 {
380   ESL_STOCKHOLM_PARSEDATA *pd = NULL;
381   int z;
382   int status;
383 
384   ESL_ALLOC(pd, sizeof(ESL_STOCKHOLM_PARSEDATA));
385   pd->nseq          = 0;
386   pd->alen          = 0;
387 
388   pd->in_block      = FALSE;
389   pd->blinetype     = NULL;
390   pd->bidx          = NULL;
391   pd->npb           = 0;
392   pd->bi            = 0;
393   pd->si            = 0;
394   pd->balloc        = 0;
395 
396   pd->nblock        = 0;
397   pd->nseq_b        = 0;
398   pd->alen_b        = 0;
399 
400   pd->ssconslen     = 0;
401   pd->saconslen     = 0;
402   pd->ppconslen     = 0;
403   pd->rflen         = 0;
404   pd->mmasklen      = 0;
405   pd->sqlen         = NULL;
406   pd->sslen         = NULL;
407   pd->salen         = NULL;
408   pd->pplen         = NULL;
409   pd->ogc_len       = NULL;
410   pd->ogr_len       = NULL;
411   pd->salloc        = 0;
412 
413   ESL_ALLOC(pd->blinetype, sizeof(char) * 16);
414   ESL_ALLOC(pd->bidx,      sizeof(int)  * 16);
415   pd->balloc = 16;
416 
417   ESL_ALLOC(pd->sqlen,     sizeof(int64_t) * msa->sqalloc);
418   for (z = 0; z < msa->sqalloc; z++)
419     pd->sqlen[z] = 0;
420   pd->salloc = msa->sqalloc;
421   return pd;
422 
423  ERROR:
424   stockholm_parsedata_Destroy(pd, msa);
425   return NULL;
426 }
427 
428 static int
stockholm_parsedata_ExpandSeq(ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa)429 stockholm_parsedata_ExpandSeq(ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa)
430 {
431   int tagidx;
432   int z;
433   int status;
434 
435   ESL_REALLOC(pd->sqlen, sizeof(int64_t) * msa->sqalloc);
436   for (z = pd->salloc; z < msa->sqalloc; z++) pd->sqlen[z] = 0;
437 
438   if (pd->sslen) {
439     ESL_REALLOC(pd->sslen,   sizeof(int64_t) * msa->sqalloc);
440     for (z = pd->salloc; z < msa->sqalloc; z++) pd->sslen[z] = 0;
441   }
442 
443   if (pd->salen) {
444     ESL_REALLOC(pd->salen,   sizeof(int64_t) * msa->sqalloc);
445     for (z = pd->salloc; z < msa->sqalloc; z++) pd->salen[z] = 0;
446   }
447 
448   if (pd->pplen) {
449     ESL_REALLOC(pd->pplen,   sizeof(int64_t) * msa->sqalloc);
450     for (z = pd->salloc; z < msa->sqalloc; z++) pd->pplen[z] = 0;
451   }
452 
453   /* don't need to reallocate ogc_len here: it's [0..ngc-1], not by seq */
454 
455   if (pd->ogr_len) {
456     for (tagidx = 0; tagidx < msa->ngr; tagidx++)
457       if (pd->ogr_len[tagidx]) {
458 	ESL_REALLOC(pd->ogr_len[tagidx], sizeof(int64_t) * msa->sqalloc);
459 	for (z = pd->salloc; z < msa->sqalloc; z++) pd->ogr_len[tagidx][z] = 0;
460       }
461   }
462 
463   pd->salloc = msa->sqalloc;
464   return eslOK;
465 
466  ERROR:
467   return status;
468 }
469 
470 
471 static int
stockholm_parsedata_ExpandBlock(ESL_STOCKHOLM_PARSEDATA * pd)472 stockholm_parsedata_ExpandBlock(ESL_STOCKHOLM_PARSEDATA *pd)
473 {
474   int status;
475 
476   ESL_REALLOC(pd->blinetype, sizeof(char) * (pd->balloc * 2));
477   ESL_REALLOC(pd->bidx,      sizeof(int)  * (pd->balloc * 2));
478   pd->balloc *= 2;
479   return eslOK;
480 
481  ERROR:
482   return status;
483 }
484 
485 
486 static void
stockholm_parsedata_Destroy(ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa)487 stockholm_parsedata_Destroy(ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa)
488 {
489   int i;
490   if (! pd) return;
491 
492   if (pd->blinetype) free(pd->blinetype);
493   if (pd->bidx)      free(pd->bidx);
494 
495   if (pd->sqlen)     free(pd->sqlen);
496   if (pd->sslen)     free(pd->sslen);
497   if (pd->salen)     free(pd->salen);
498   if (pd->pplen)     free(pd->pplen);
499   if (pd->ogc_len)   free(pd->ogc_len);
500   if (pd->ogr_len) {
501     for (i = 0; i < msa->ngr; i++)
502       if (pd->ogr_len[i]) free(pd->ogr_len[i]);
503     free(pd->ogr_len);
504   }
505   free(pd);
506   return;
507 }
508 /*------------------ end, ESL_STOCKHOLM_PARSEDATA auxiliary structure -------------*/
509 
510 
511 
512 
513 /*****************************************************************
514  * 3. Internal: parsing Stockholm line types
515  *****************************************************************/
516 
517 /* stockholm_parse_gf()
518  * Line format is:
519  *   #=GF <tag> <text>
520  * recognized featurenames: { ID | AC | DE | AU | GA | NC | TC }
521  */
522 static int
stockholm_parse_gf(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)523 stockholm_parse_gf(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
524 {
525   char      *gf,  *tag,   *tok;
526   esl_pos_t gflen, taglen, toklen;
527   int       status;
528 
529   if ( (status = esl_memtok(&p, &n, " \t", &gf,  &gflen))  != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
530   if ( (status = esl_memtok(&p, &n, " \t", &tag, &taglen)) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GF line is missing <tag>, annotation");
531   if (! esl_memstrcmp(gf, gflen, "#=GF"))                            ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GF line?");
532 
533   if      (esl_memstrcmp(tag, taglen, "ID"))
534     {
535       if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "No name found on #=GF ID line");
536       if (n)                                                            ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GF ID line should have only one name (no whitespace allowed)");
537       if ( (status = esl_msa_SetName (msa, tok, toklen))      != eslOK) return status; /* [eslEMEM] */
538     }
539   else if (esl_memstrcmp(tag, taglen, "AC"))
540     {
541       if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "No accession found on #=GF AC line");
542       if (n)                                                            ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GF AC line should have only one accession (no whitespace allowed)");
543       if ((status = esl_msa_SetAccession(msa, tok, toklen))   != eslOK) return status; /* [eslEMEM] */
544     }
545   else if (esl_memstrcmp(tag, taglen, "DE"))
546     {
547       if ((status = esl_msa_SetDesc     (msa, p, n))          != eslOK) return status; /* [eslEMEM] */
548     }
549   else if (esl_memstrcmp(tag, taglen, "AU"))
550     {
551       if ((status = esl_msa_SetAuthor   (msa, p, n))          != eslOK) return status; /* [eslEMEM] */
552     }
553   else if (esl_memstrcmp(tag, taglen, "GA"))
554     {
555       if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
556 	if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for GA1 value on #=GF GA line");
557 	if (  esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_GA1])) != eslOK) return status; /* [eslEMEM] */
558 	msa->cutset[eslMSA_GA1] = TRUE;
559       } else ESL_FAIL(eslEFORMAT, afp->errmsg, "No GA threshold value found on #=GF GA line");
560       if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
561 	if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for GA2 value on #=GF GA line");
562 	if (  esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_GA2])) != eslOK) return status; /* [eslEMEM] */
563 	msa->cutset[eslMSA_GA2] = TRUE;
564       }
565     }
566   else if (esl_memstrcmp(tag, taglen, "NC"))
567     {
568       if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
569 	if ( ! esl_memstrcmp(tok, toklen, "undefined")) /* workaround for a problem in Rfam10. ignore NC's that are set to "undefined". */
570 	  {
571 	    if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for NC1 value on #=GF NC line");
572 	    if (  esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_NC1])) != eslOK) return status; /* [eslEMEM] */
573 	    msa->cutset[eslMSA_NC1] = TRUE;
574 	  }
575       } else ESL_FAIL(eslEFORMAT, afp->errmsg, "No NC threshold value found on #=GF NC line");
576       if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
577 	if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for NC2 value on #=GF NC line");
578 	if (  esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_NC2])) != eslOK) return status; /* [eslEMEM] */
579 	msa->cutset[eslMSA_NC2] = TRUE;
580       }
581     }
582   else if (esl_memstrcmp(tag, taglen, "TC"))
583     {
584       if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
585 	if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for TC1 value on #=GF TC line");
586 	if (  esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_TC1])) != eslOK) return status; /* [eslEMEM] */
587 	msa->cutset[eslMSA_TC1] = TRUE;
588       } else ESL_FAIL(eslEFORMAT, afp->errmsg, "No TC threshold value found on #=GF TC line");
589       if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
590 	if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for TC2 value on #=GF TC line");
591 	if (  esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_TC2])) != eslOK) return status; /* [eslEMEM] */
592 	msa->cutset[eslMSA_TC2] = TRUE;
593       }
594     }
595   else
596     {
597       if ((status = esl_msa_AddGF(msa, tag, taglen, p, n)) != eslOK) return status;
598     }
599 
600   return eslOK;
601 }
602 
603 
604 /* stockholm_parse_gs()
605  * Format:
606  *   #=GS <seqname> <tag> <text>
607  * recognized featurenames: { WT | AC | DE }
608  */
609 static int
stockholm_parse_gs(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)610 stockholm_parse_gs(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
611 {
612   char      *gs,   *seqname,   *tag,   *tok;
613   esl_pos_t  gslen, seqnamelen, taglen, toklen;
614   int        seqidx;
615   int        status;
616 
617   if (esl_memtok(&p, &n, " \t", &gs,      &gslen)      != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
618   if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS line missing <seqname>, <tag>, annotation");
619   if (esl_memtok(&p, &n, " \t", &tag,     &taglen)     != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS line missing <tag>, annotation");
620   if (! esl_memstrcmp(gs, gslen, "#=GS"))                        ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GS line?");
621 
622   seqidx = pd->si;
623   if (seqidx == pd->nseq || ! esl_memstrcmp(seqname, seqnamelen, msa->sqname[seqidx])) {
624     stockholm_get_seqidx(msa, pd, seqname, seqnamelen, &seqidx);
625   }
626 
627   if (esl_memstrcmp(tag, taglen, "WT"))
628     {
629       if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "no weight value found on #=GS <seqname> WT line");
630       if (msa->wgt[seqidx] != -1.0)                          ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence has more than one #=GS <seqname> WT line");
631       if (n)                                                 ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS <seqname> WT line should have only one field, the weight");
632       if (! esl_mem_IsReal(tok, toklen))                     ESL_FAIL(eslEFORMAT, afp->errmsg, "value on #=GS <seqname> WT line isn't a real number");
633       if ((status = esl_memtod(tok, toklen, &(msa->wgt[seqidx]))) != eslOK) return status; /* eslEMEM */
634       msa->flags |= eslMSA_HASWGTS;
635     }
636   else if (esl_memstrcmp(tag, taglen, "AC"))
637     {
638       if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "no accession found on #=GS <seqname> AC line");
639       if (msa->sqacc && msa->sqacc[seqidx])                  ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence has more than one #=GS <seqname> AC accession line");
640       if (n)                                                 ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS <seqname> AC line should have only one field, the accession");
641       if ((status = esl_msa_SetSeqAccession(msa, seqidx, tok, toklen)) != eslOK) return status; /* eslEMEM */
642     }
643   else if (esl_memstrcmp(tag, taglen, "DE"))
644     {
645       if (msa->sqdesc && msa->sqdesc[seqidx]) ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence has more than one #=GS <seqname> DE accession line");
646       if ((status = esl_msa_SetSeqDescription(msa, seqidx, p, n)) != eslOK) return status; /* eslEMEM */
647     }
648   else
649     {
650       if ((status = esl_msa_AddGS(msa, tag, taglen, seqidx, p, n)) != eslOK) return status;
651     }
652 
653   pd->si = seqidx+1;	/* set guess for next sequence index */
654   return eslOK;
655 }
656 
657 /* stockholm_parse_gc()
658  * Format of line is:
659  *   #=GC <tag> <aligned text>
660  * recognized featurenames: { SS_cons | SA_cons | PP_cons | RF }
661  */
662 static int
stockholm_parse_gc(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)663 stockholm_parse_gc(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
664 {
665   char      *gc,    *tag;
666   esl_pos_t  gclen,  taglen;
667   int        tagidx;
668   int        status;
669 
670   if (esl_memtok(&p, &n, " \t", &gc,   &gclen)    != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
671   if (esl_memtok(&p, &n, " \t", &tag,  &taglen)   != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GC line missing <tag>, annotation");
672   while (n && strchr(" \t", p[n-1])) n--; /* skip backwards from eol, to delimit aligned text without going through it */
673 
674   if (! esl_memstrcmp(gc, gclen, "#=GC")) ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GC line?");
675   if (! n)                                ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GC line missing annotation?");
676 
677   if (pd->nblock) 		/* Subsequent blocks */
678     {
679       if      (esl_memstrcmp(tag, taglen, "SS_cons")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_SSCONS) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC SS_cons; earlier block(s) in different order?"); }
680       else if (esl_memstrcmp(tag, taglen, "SA_cons")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_SACONS) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC SA_cons; earlier block(s) in different order?"); }
681       else if (esl_memstrcmp(tag, taglen, "PP_cons")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_PPCONS) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC PP_cons; earlier block(s) in different order?"); }
682       else if (esl_memstrcmp(tag, taglen, "RF"))      { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_RF)     ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC RF; earlier block(s) in different order?");      }
683       else if (esl_memstrcmp(tag, taglen, "MM"))      { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_MM)     ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC MM; earlier block(s) in different order?");      }
684       else if (                                             pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_OTHER)  ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC line; earlier block(s) in different order?");
685     }
686   else				/* First block */
687     {
688       if (pd->bi == pd->balloc && (status = stockholm_parsedata_ExpandBlock(pd)) != eslOK) return status;
689 
690       if      (esl_memstrcmp(tag, taglen, "SS_cons"))  pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_SSCONS;
691       else if (esl_memstrcmp(tag, taglen, "SA_cons"))  pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_SACONS;
692       else if (esl_memstrcmp(tag, taglen, "PP_cons"))  pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_PPCONS;
693       else if (esl_memstrcmp(tag, taglen, "RF"))       pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_RF;
694       else if (esl_memstrcmp(tag, taglen, "MM"))       pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_MM;
695       else                                             pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_OTHER;
696       pd->bidx[pd->bi]      = -1;
697     }
698 
699   if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_SSCONS)
700     {
701       if (pd->ssconslen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC SS_cons line in block");
702       if ( (status = esl_strcat(&(msa->ss_cons), pd->ssconslen, p, n)) != eslOK) return status; /* [eslEMEM] */
703       pd->ssconslen += n;
704     }
705   else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_SACONS)
706     {
707       if (pd->saconslen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC SA_cons line in block");
708       if ((status = esl_strcat(&(msa->sa_cons), pd->saconslen, p, n)) != eslOK) return status; /* [eslEMEM] */
709       pd->saconslen += n;
710     }
711   else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_PPCONS)
712     {
713       if (pd->ppconslen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC PP_cons line in block");
714       if ((status = esl_strcat(&(msa->pp_cons), pd->ppconslen, p, n)) != eslOK) return status; /* [eslEMEM] */
715       pd->ppconslen += n;
716     }
717   else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_RF)
718     {
719       if (pd->rflen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC RF line in block");
720       if ((status = esl_strcat(&(msa->rf), pd->rflen, p, n)) != eslOK) return status; /* [eslEMEM] */
721       pd->rflen += n;
722     }
723   else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_MM)
724     {
725       if (pd->mmasklen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC MM line in block");
726       if ((status = esl_strcat(&(msa->mm), pd->mmasklen, p, n)) != eslOK) return status; /* [eslEMEM] */
727       pd->mmasklen += n;
728     }
729   else
730     {
731       if ((status = stockholm_get_gc_tagidx(msa, pd, tag, taglen, &tagidx)) != eslOK) return status;
732 
733       if (pd->ogc_len[tagidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC %.*s line in block", (int) taglen, tag);
734       if ((status = esl_strcat(&(msa->gc[tagidx]), pd->ogc_len[tagidx], p, n)) != eslOK) return status; /* [eslEMEM] */
735       pd->ogc_len[tagidx] += n;
736     }
737 
738   if (pd->bi && n != pd->alen_b) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected # of aligned annotation in #=GC %.*s line", (int) taglen, tag);
739   pd->alen_b   = n;
740   pd->in_block = TRUE;
741   pd->bi++;
742   return eslOK;
743 }
744 
745 /* A GR line is
746  *   #=GR <seqname> <featurename> <text>
747  * recognized featurenames: { SS | SA | PP }
748  *
749  */
750 static int
stockholm_parse_gr(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)751 stockholm_parse_gr(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
752 {
753   char      *gr,   *name,    *tag;
754   esl_pos_t  grlen, namelen,  taglen;
755   int        seqidx, tagidx;
756   int        z;
757   int        status;
758 
759   if (esl_memtok(&p, &n, " \t", &gr,   &grlen)    != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
760   if (esl_memtok(&p, &n, " \t", &name, &namelen)  != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GR line missing <seqname>, <tag>, annotation");
761   if (esl_memtok(&p, &n, " \t", &tag,  &taglen)   != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GR line missing <tag>, annotation");
762   while (n && strchr(" \t", p[n-1])) n--; /* skip backwards from eol, to delimit aligned text without going through it */
763 
764   if (! esl_memstrcmp(gr, grlen, "#=GR")) ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GR line?");
765   if (! n)                                ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GR line missing annotation?");
766 
767   /* Which seqidx is this? likely to be either pd->si-1 (#=GR following a seq) or
768    * pd->si (#=GR preceding a seq)
769    */
770   if (! pd->nblock) /* First block: we're setting bidx[], blinetype[] as we see them */
771     {
772       if      (pd->si >= 1       && esl_memstrcmp(name, namelen, msa->sqname[pd->si-1])) seqidx = pd->si-1;
773       else if (pd->si < pd->nseq && esl_memstrcmp(name, namelen, msa->sqname[pd->si]))   seqidx = pd->si;
774       else if ((status = stockholm_get_seqidx(msa, pd, name, namelen, &seqidx)) != eslOK) goto ERROR;
775 
776       if (pd->bi == pd->balloc && (status = stockholm_parsedata_ExpandBlock(pd)) != eslOK) return status;
777 
778       if      (esl_memstrcmp(tag, taglen, "SS")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_SS;
779       else if (esl_memstrcmp(tag, taglen, "SA")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_SA;
780       else if (esl_memstrcmp(tag, taglen, "PP")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_PP;
781       else                                       pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_OTHER;
782       pd->bidx[pd->bi]      = seqidx;
783     }
784   else
785     {				/* subsequent block(s) */
786       if (pd->bi >= pd->npb) ESL_FAIL(eslEFORMAT, afp->errmsg, "more lines than expected in this alignment block; earlier blocks had fewer");
787 
788       if      (esl_memstrcmp(tag, taglen, "SS")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_SS)    ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR <seqname> SS; earlier block(s) in different order?"); }
789       else if (esl_memstrcmp(tag, taglen, "SA")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_SA)    ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR <seqname> SA; earlier block(s) in different order?"); }
790       else if (esl_memstrcmp(tag, taglen, "PP")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_PP)    ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR <seqname> PP; earlier block(s) in different order?"); }
791       else if (                                        pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_OTHER) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR line; earlier block(s) in different order?");
792 
793       seqidx = pd->bidx[pd->bi];
794       if (! esl_memstrcmp(name, namelen, msa->sqname[seqidx])) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected seqname %.*s; expected %s from prev blocks", (int) namelen, name, msa->sqname[seqidx]);
795     }
796 
797   /* Append the annotation where it belongs  */
798   if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GR_SS)
799     {
800       if (! msa->ss) {
801 	ESL_ALLOC(msa->ss,   sizeof(char *)  * msa->sqalloc);
802 	ESL_ALLOC(pd->sslen, sizeof(int64_t) * msa->sqalloc);
803 	for (z = 0; z < msa->sqalloc; z++) { msa->ss[z] = NULL; pd->sslen[z] = 0; }
804       }
805       if (pd->sslen[seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s SS line in block", (int) namelen, name);
806       if (( status = esl_strcat(&(msa->ss[seqidx]), pd->sslen[seqidx], p, n)) != eslOK) return status; /* [eslEMEM] */
807       pd->sslen[seqidx] += n;
808     }
809   else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GR_PP)
810     {
811       if (! msa->pp) {
812 	ESL_ALLOC(msa->pp,   sizeof(char *)  * msa->sqalloc);
813 	ESL_ALLOC(pd->pplen, sizeof(int64_t) * msa->sqalloc);
814 	for (z = 0; z < msa->sqalloc; z++) { msa->pp[z] = NULL; pd->pplen[z] = 0; }
815       }
816       if (pd->pplen[seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s PP line in block", (int) namelen, name);
817       if ((status = esl_strcat(&(msa->pp[seqidx]), pd->pplen[seqidx], p, n)) != eslOK) return status; /* [eslEMEM] */
818       pd->pplen[seqidx] += n;
819     }
820   else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GR_SA)
821     {
822       if (! msa->sa) {
823 	ESL_ALLOC(msa->sa,   sizeof(char *)  * msa->sqalloc);
824 	ESL_ALLOC(pd->salen, sizeof(int64_t) * msa->sqalloc);
825 	for (z = 0; z < msa->sqalloc; z++) { msa->sa[z] = NULL; pd->salen[z] = 0; }
826       }
827       if (pd->salen[seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s SA line in block", (int) namelen, name);
828       if ((status = esl_strcat(&(msa->sa[seqidx]), pd->salen[seqidx], p, n)) != eslOK) return status;
829       pd->salen[seqidx] += n;
830     }
831   else
832     {
833       if ((status = stockholm_get_gr_tagidx(msa, pd, tag, taglen, &tagidx)) != eslOK) return status; /* [eslEMEM] */
834 
835       if (pd->ogr_len[tagidx][seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s %.*s line in block", (int) namelen, name, (int) taglen, tag);
836       if ((status = esl_strcat(&(msa->gr[tagidx][seqidx]), pd->ogr_len[tagidx][seqidx], p, n)) != eslOK) return status;
837       pd->ogr_len[tagidx][seqidx] += n;
838     }
839 
840   if (pd->bi && n != pd->alen_b) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected # of aligned annotation in #=GR %.*s %.*s line", (int) namelen, name, (int) taglen, tag);
841   pd->alen_b   = n;
842   pd->in_block = TRUE;
843   pd->bi++;
844   return eslOK;
845 
846  ERROR:
847   return status;
848 }
849 
850 
851 /* stockholm_parse_sq():
852  * Format of line is:
853  *   <seqname>  <aligned text>
854  */
855 static int
stockholm_parse_sq(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)856 stockholm_parse_sq(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
857 {
858   char     *seqname;
859   esl_pos_t seqnamelen;
860   int       seqidx = pd->si;
861   int       status;
862 
863   if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
864   while (n && strchr(" \t", p[n-1])) n--; /* skip backwards from eol, to delimit aligned text without going through it */
865 
866   if (! n) ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence line with no sequence?");
867 
868   /* Which seqidx is this?
869    * In first block:
870    *    1. If #=GS lines set sqname[] completely, then it's pd->si.
871    *    2. If #=GS lines set sqname[] partially or out of order, then name may be in the keyhash.
872    *    3. If we haven't seen name before, then we'll add it: seqidx = pd->nseq, add name to keyhash, possibly reallocate.
873    * In subsequent blocks, use recorded indices and linetypes:
874    *    4. seqidx = saved bidx[]; should be expecting a SQ line; name should match expected name.
875    */
876   if (! pd->nblock) /* First block: we're setting npb, bidx[], and blinetype[] as we see them */
877     {
878       if (pd->si < pd->nseq && esl_memstrcmp(seqname, seqnamelen, msa->sqname[seqidx])) seqidx = pd->si;
879       else if ((status = stockholm_get_seqidx(msa, pd, seqname, seqnamelen, &seqidx)) != eslOK) return status; /* [eslEMEM] */
880 
881       if (pd->bi == pd->balloc && (status = stockholm_parsedata_ExpandBlock(pd)) != eslOK) return status;
882 
883       pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_SQ;
884       pd->bidx[pd->bi]      = seqidx;
885     }
886   else
887     {				/* subsequent block(s) */
888       if (pd->bi >= pd->npb)                               ESL_FAIL(eslEFORMAT, afp->errmsg, "more lines than expected; earlier blocks had fewer");
889       if (  pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_SQ) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected seq line; earlier block(s) in different order?");
890       seqidx = pd->bidx[pd->bi];
891 
892       if (! esl_memstrcmp(seqname, seqnamelen, msa->sqname[seqidx])) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected seq name %.*s; expected %s from prev block order", (int) seqnamelen, seqname, msa->sqname[seqidx]);
893     }
894 
895   if ( pd->bi > 0 && pd->sqlen[seqidx] == pd->alen + pd->alen_b) ESL_FAIL(eslEFORMAT, afp->errmsg, "duplicate seq name %.*s", (int) seqnamelen, seqname);
896 
897   if (  afp->abc ) {
898     status = esl_abc_dsqcat(afp->inmap, &(msa->ax[seqidx]),   &(pd->sqlen[seqidx]), p, n);
899     if      (status == eslEINVAL) ESL_FAIL(eslEFORMAT, afp->errmsg, "invalid sequence character(s) on line");
900     else if (status != eslOK)     return status;
901   }
902 
903   if (! afp->abc) {
904     status = esl_strmapcat (afp->inmap, &(msa->aseq[seqidx]), &(pd->sqlen[seqidx]), p, n);
905     if      (status == eslEINVAL) ESL_FAIL(eslEFORMAT, afp->errmsg, "invalid sequence character(s) on line");
906     else if (status != eslOK)     return status;
907   }
908 
909   if (pd->bi && n != pd->alen_b)         ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected number of aligned residues parsed on line");
910   if (pd->sqlen[seqidx] - pd->alen != n) ESL_EXCEPTION(eslEINCONCEIVABLE, "implementation assumes that no symbols are ignored in inmap; else GR, GC text annotations are messed up");
911   pd->alen_b   = n;
912   pd->in_block = TRUE;
913   pd->nseq_b++;
914   pd->bi++;
915   pd->si = seqidx+1;
916   return eslOK;
917 }
918 
919 
920 
921 static int
stockholm_parse_comment(ESL_MSA * msa,char * p,esl_pos_t n)922 stockholm_parse_comment(ESL_MSA *msa, char *p, esl_pos_t n)
923 {
924   if (n && *p == '#')      { p++; n--; }
925   while (n && isspace(*p)) { p++; n--; }
926 
927   return esl_msa_AddComment(msa, p, n);
928 }
929 /*------------- end, parsing Stockholm line types ---------------*/
930 
931 
932 
933 /*****************************************************************
934  * 4. Internal: looking up seq, tag indices
935  *****************************************************************/
936 
937 
938 /* stockholm_get_seqidx()
939  *
940  * Find the index of a given sequence <name>,<n> in a growing <msa>
941  * with associated parse data <pdat>. If caller has a good guess (for
942  * instance, the sequences are coming in a previously seen order in a
943  * block of seqs or annotation), the caller can pass this information
944  * in <guess>, or -1 if it has no guess.
945  *
946  * If the name does not already exist in the MSA, then it
947  * is assumed to be a new sequence name that we need to store.
948  * seqidx is set to pdat->nseq, the MSA is Expand()'ed if necessary
949  * to make room, the name is stored in msa->sqname[pdat->nseq],
950  * and pdat->nseq is incremented.
951  *
952  * Returns:  <eslOK> on success, and the seqidx is
953  *           passed back via <ret_idx>. If <name> is new
954  *           in the <msa>, the <name> is stored and the <msa>
955  *           may be internally reallocated if needed.
956  *
957  * Throws: <eslEMEM> on allocation failure
958  *         <eslEINVAL> if we try to add a name to a non-growable MSA.
959  *         <eslEINCONCEIVABLE> on internal coding errors
960  */
961 static int
stockholm_get_seqidx(ESL_MSA * msa,ESL_STOCKHOLM_PARSEDATA * pd,char * name,esl_pos_t n,int * ret_idx)962 stockholm_get_seqidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *name, esl_pos_t n, int *ret_idx)
963 {
964   int seqidx;
965   int status;
966 
967   status = esl_keyhash_Store(msa->index, name, n, &seqidx);
968   if (status == eslEDUP) { *ret_idx = seqidx; return eslOK; }
969   if (status != eslOK)   goto ERROR;
970 
971   /* if we get here, this is a new name we're adding */
972   if (seqidx >= msa->sqalloc) {
973     if ( (status = esl_msa_Expand(msa))                    != eslOK) goto ERROR;
974     if ( (status = stockholm_parsedata_ExpandSeq(pd, msa)) != eslOK) goto ERROR;
975   }
976 
977   if ( (status = esl_msa_SetSeqName(msa, seqidx, name, n)) != eslOK) goto ERROR;
978   pd->nseq++;
979   msa->nseq = pd->nseq;		/* pd->nseq and msa->nseq must stay in lockstep */
980 
981   *ret_idx = seqidx;
982   return eslOK;
983 
984  ERROR:
985   *ret_idx = -1;
986   return status;
987 }
988 
989 static int
stockholm_get_gr_tagidx(ESL_MSA * msa,ESL_STOCKHOLM_PARSEDATA * pd,char * tag,esl_pos_t taglen,int * ret_tagidx)990 stockholm_get_gr_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag, esl_pos_t taglen, int *ret_tagidx)
991 {
992   int tagidx;
993   int z;
994   int status;
995 
996   /* Find the tag, if we have it; else, add it, at tagidx = msa->ngr */
997   if (!msa->gr_idx && (msa->gr_idx = esl_keyhash_CreateCustom(8,8,128)) == NULL) { status = eslEMEM; goto ERROR; }
998   status = esl_keyhash_Store(msa->gr_idx, tag, taglen, &tagidx);
999   if      (status == eslEDUP) { *ret_tagidx = tagidx; return eslOK; }
1000   else if (status != eslOK)   goto ERROR;
1001 
1002   /* if we get here, this is a new tag we're adding. */
1003   ESL_REALLOC(msa->gr_tag,       sizeof(char *)    * (msa->ngr+1)); /* +1, we're allocated one new tag at a time, as needed */
1004   ESL_REALLOC(msa->gr,           sizeof(char **)   * (msa->ngr+1));
1005   ESL_REALLOC(pd->ogr_len,       sizeof(int64_t *) * (msa->ngr+1));
1006   msa->gr_tag[tagidx] = NULL;
1007   msa->gr[tagidx]     = NULL;
1008   pd->ogr_len[tagidx] = NULL;
1009   ESL_ALLOC(msa->gr[tagidx],     sizeof(char *)    * msa->sqalloc);
1010   ESL_ALLOC(pd->ogr_len[tagidx], sizeof(int64_t)   * msa->sqalloc);
1011   for (z = 0; z < msa->sqalloc; z++) {
1012     msa->gr[tagidx][z] = NULL;
1013     pd->ogr_len[tagidx][z] = 0;
1014   }
1015 
1016   if ( (status = esl_memstrdup(tag, taglen, &(msa->gr_tag[tagidx]))) != eslOK) goto ERROR; /* eslEMEM */
1017   msa->ngr++;
1018   *ret_tagidx = tagidx;
1019   return eslOK;
1020 
1021  ERROR:
1022   *ret_tagidx = -1;
1023   return status;
1024 }
1025 
1026 static int
stockholm_get_gc_tagidx(ESL_MSA * msa,ESL_STOCKHOLM_PARSEDATA * pd,char * tag,esl_pos_t taglen,int * ret_tagidx)1027 stockholm_get_gc_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag, esl_pos_t taglen, int *ret_tagidx)
1028 {
1029   int tagidx;
1030   int status;
1031 
1032   /* Find the tag, if we have it; else, add it, at tagidx = msa->ngc */
1033   if (!msa->gc_idx && (msa->gc_idx = esl_keyhash_CreateCustom(8,8,128)) == NULL) { status = eslEMEM; goto ERROR; }
1034   status = esl_keyhash_Store(msa->gc_idx, tag, taglen, &tagidx);
1035   if      (status == eslEDUP) { *ret_tagidx = tagidx; return eslOK; }
1036   else if (status != eslOK)   goto ERROR; /* eslEMEM */
1037 
1038 
1039   /* if we get here, this is a new tag we're adding. */
1040   ESL_REALLOC(msa->gc_tag, sizeof(char *)  * (msa->ngc+1)); /* +1, we're allocated one new tag at a time, as needed */
1041   ESL_REALLOC(msa->gc,     sizeof(char *)  * (msa->ngc+1));
1042   ESL_REALLOC(pd->ogc_len, sizeof(int64_t) * (msa->ngc+1));
1043   msa->gc_tag[tagidx] = NULL;
1044   msa->gc[tagidx]     = NULL;
1045   pd->ogc_len[tagidx] = 0;
1046 
1047   if ( (status = esl_memstrdup(tag, taglen, &(msa->gc_tag[tagidx]))) != eslOK) return status; /* eslEMEM */
1048   msa->ngc++;
1049   *ret_tagidx = tagidx;
1050   return eslOK;
1051 
1052  ERROR:
1053   *ret_tagidx = -1;
1054   return status;
1055 }
1056 /*------------ end, looking up seq, tag indices -----------------*/
1057 
1058 
1059 /*****************************************************************
1060  * 5. Internal: writing Stockholm/Pfam format
1061  *****************************************************************/
1062 
1063 /* stockholm_write()
1064  * Returns: <eslOK> on success.
1065  * Throws:  <eslEMEM> on allocation error.
1066  *          <eslEWRITE> on any system write error.
1067  */
1068 static int
stockholm_write(FILE * fp,const ESL_MSA * msa,int64_t cpl)1069 stockholm_write(FILE *fp, const ESL_MSA *msa, int64_t cpl)
1070 {
1071   int  i, j;
1072   int  maxname;		          /* maximum name length     */
1073   int  maxgf;		          /* max #=GF tag length     */
1074   int  maxgc;		          /* max #=GC tag length     */
1075   int  maxgr; 		          /* max #=GR tag length     */
1076   int  margin;               	  /* total left margin width */
1077   int  gslen;		          /* length of a #=GS tag    */
1078   char *buf = NULL;
1079   int  currpos;
1080   char *s, *tok;
1081   int  acpl;            /* actual number of character per line */
1082   int  make_uniquenames = FALSE;  /* TRUE if we force names to be unique */
1083   int  uniqwidth = 0;
1084   int  tmpnseq;
1085   int  status;
1086 
1087   /* Stockholm files require unique seqnames. Don't allow the writer
1088    * to create an invalid file.
1089    */
1090   status = esl_msa_CheckUniqueNames(msa);
1091   if      (status == eslFAIL) {
1092     make_uniquenames = TRUE;
1093     for (tmpnseq = msa->nseq; tmpnseq; tmpnseq /= 10) uniqwidth++;  /* how wide the uniqizing numbers need to be */
1094     uniqwidth++; 		/* uniqwidth includes the '|' */
1095   } else if (status != eslOK) goto ERROR;
1096 
1097   /* Figure out how much space we need for name + markup
1098    * to keep the alignment in register.
1099    *
1100    * The left margin of an alignment block can be composed of:
1101    *
1102    * <seqname>                      max length: uniqwidth + maxname + 1
1103    * #=GC <gc_tag>                  max length: 4 + 1 + maxgc + 1
1104    * #=GR <seqname> <gr_tag>        max length: 4 + 1 + uniqwidth + maxname + 1 + maxgr + 1
1105    *
1106    * <margin> is the max of these. It is the total length of the
1107    * left margin that we need to leave, inclusive of the last space.
1108    *
1109    * Then when we output, we do:
1110    * name:  <leftmargin-uniqwidth-1>
1111    * gc:    #=GC <leftmargin-6>
1112    * gr:    #=GR <uniqwidth><maxname> <leftmargin-maxname-uniqwidth-7>
1113    *
1114    * because uniqwidth includes the |, field widths for uniqizing must be uniqwidth-1
1115    *
1116    * xref STL9/p17
1117    */
1118   maxname = esl_str_GetMaxWidth(msa->sqname, msa->nseq);
1119 
1120   maxgf   = esl_str_GetMaxWidth(msa->gf_tag, msa->ngf);
1121   if (maxgf < 2) maxgf = 2;
1122 
1123   maxgc   = esl_str_GetMaxWidth(msa->gc_tag, msa->ngc);
1124   if (msa->rf      && maxgc < 2) maxgc = 2;
1125   if (msa->mm      && maxgc < 2) maxgc = 2;
1126   if (msa->ss_cons && maxgc < 7) maxgc = 7;
1127   if (msa->sa_cons && maxgc < 7) maxgc = 7;
1128   if (msa->pp_cons && maxgc < 7) maxgc = 7;
1129 
1130   maxgr   = esl_str_GetMaxWidth(msa->gr_tag, msa->ngr);
1131   if (msa->ss && maxgr < 2) maxgr = 2;
1132   if (msa->sa && maxgr < 2) maxgr = 2;
1133   if (msa->pp && maxgr < 2) maxgr = 2;
1134 
1135   margin = uniqwidth + maxname + 1;
1136   if (maxgc > 0 && maxgc+6 > margin)                   margin = maxgc+6;
1137   if (maxgr > 0 && uniqwidth+maxname+maxgr+7 > margin) margin = uniqwidth+maxname+maxgr+7;
1138 
1139   /* Allocate a tmp buffer to hold sequence chunks in  */
1140   ESL_ALLOC(buf, sizeof(char) * (cpl+1));
1141 
1142   /* Magic Stockholm header */
1143   if (fprintf(fp, "# STOCKHOLM 1.0\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1144 
1145   /* Warning about uniqization */
1146   if (make_uniquenames && fprintf(fp, "# WARNING: seq names have been made unique by adding a prefix of \"<seq#>|\"\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1147 
1148  /* Free text comment section */
1149   for (i = 0;  i < msa->ncomment; i++)
1150     if (fprintf(fp, "#%s\n", msa->comment[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1151   if (msa->ncomment > 0 && fprintf(fp, "\n")  < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1152 
1153    /* GF section: per-file annotation */
1154   if (msa->name && fprintf(fp, "#=GF %-*s %s\n", maxgf, "ID", msa->name) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1155   if (msa->acc  && fprintf(fp, "#=GF %-*s %s\n", maxgf, "AC", msa->acc)  < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1156   if (msa->desc && fprintf(fp, "#=GF %-*s %s\n", maxgf, "DE", msa->desc) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1157   if (msa->au   && fprintf(fp, "#=GF %-*s %s\n", maxgf, "AU", msa->au)   < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1158 
1159   /* Thresholds are hacky. Pfam has two. Rfam has one. */
1160   if      (msa->cutset[eslMSA_GA1] && msa->cutset[eslMSA_GA2]) { if (fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "GA", msa->cutoff[eslMSA_GA1], msa->cutoff[eslMSA_GA2]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1161   else if (msa->cutset[eslMSA_GA1])                            { if (fprintf(fp, "#=GF %-*s %.1f\n",      maxgf, "GA", msa->cutoff[eslMSA_GA1])                          < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1162 
1163   if      (msa->cutset[eslMSA_NC1] && msa->cutset[eslMSA_NC2]) { if (fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "NC", msa->cutoff[eslMSA_NC1], msa->cutoff[eslMSA_NC2]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1164   else if (msa->cutset[eslMSA_NC1])                            { if (fprintf(fp, "#=GF %-*s %.1f\n",	    maxgf, "NC", msa->cutoff[eslMSA_NC1])                        < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1165 
1166   if      (msa->cutset[eslMSA_TC1] && msa->cutset[eslMSA_TC2]) { if (fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "TC", msa->cutoff[eslMSA_TC1], msa->cutoff[eslMSA_TC2]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1167   else if (msa->cutset[eslMSA_TC1])                            { if (fprintf(fp, "#=GF %-*s %.1f\n", 	    maxgf, "TC", msa->cutoff[eslMSA_TC1])                        < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1168 
1169   for (i = 0; i < msa->ngf; i++)
1170     if (fprintf(fp, "#=GF %-*s %s\n", maxgf, msa->gf_tag[i], msa->gf[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1171   if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1172 
1173 
1174   /* GS section: per-sequence annotation */
1175   if (msa->flags & eslMSA_HASWGTS) {
1176     for (i = 0; i < msa->nseq; i++)
1177       if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s WT %.2f\n", uniqwidth-1, i, maxname, msa->sqname[i], msa->wgt[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1178       else                  { if (fprintf(fp, "#=GS %-*s WT %.2f\n",                      maxname, msa->sqname[i], msa->wgt[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1179     if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1180   }
1181 
1182   if (msa->sqacc) {
1183     for (i = 0; i < msa->nseq; i++)
1184       if (msa->sqacc[i]) {
1185 	if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s AC %s\n", uniqwidth-1, i, maxname, msa->sqname[i], msa->sqacc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1186 	else                  { if (fprintf(fp, "#=GS %-*s AC %s\n",                      maxname, msa->sqname[i], msa->sqacc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1187       }
1188     if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1189   }
1190 
1191   if (msa->sqdesc) {
1192     for (i = 0; i < msa->nseq; i++)
1193       if (msa->sqdesc[i]) {
1194 	if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s DE %s\n", uniqwidth-1, i, maxname, msa->sqname[i], msa->sqdesc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1195 	else                  { if (fprintf(fp, "#=GS %-*s DE %s\n",                      maxname, msa->sqname[i], msa->sqdesc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1196       }
1197     if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1198   }
1199 
1200   /* Multiannotated GS tags are possible; for example,
1201    *     #=GS foo DR PDB; 1xxx;
1202    *     #=GS foo DR PDB; 2yyy;
1203    * These are stored, for example, as:
1204    *     msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
1205    * and must be decomposed.
1206    */
1207   for (i = 0; i < msa->ngs; i++)
1208     {
1209       gslen = strlen(msa->gs_tag[i]);
1210       for (j = 0; j < msa->nseq; j++)
1211 	if (msa->gs[i][j]) {
1212 	  s = msa->gs[i][j];
1213 	  while (esl_strtok(&s, "\n", &tok) == eslOK)
1214 	    if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s %-*s %s\n", uniqwidth-1, i,  maxname, msa->sqname[j], gslen, msa->gs_tag[i], tok) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1215 	    else                  { if (fprintf(fp, "#=GS %-*s %-*s %s\n",                       maxname, msa->sqname[j], gslen, msa->gs_tag[i], tok) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1216 	}
1217       if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1218     }
1219 
1220   /* Alignment section:
1221    * contains aligned sequence, #=GR annotation, and #=GC annotation
1222    */
1223   for (currpos = 0; currpos < msa->alen; currpos += cpl)
1224     {
1225       acpl = (msa->alen - currpos > cpl)? cpl : msa->alen - currpos;
1226       buf[acpl] = '\0';  	/* this suffices to terminate for all uses of buf[] in this block */
1227       if (currpos > 0 && fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1228 
1229       for (i = 0; i < msa->nseq; i++)
1230 	{
1231 	  if (msa->abc)   esl_abc_TextizeN(msa->abc, msa->ax[i] + currpos + 1, acpl, buf);
1232 	  if (! msa->abc) strncpy(buf, msa->aseq[i] + currpos, acpl);
1233 	  if (make_uniquenames) { if (fprintf(fp, "%0*d|%-*s %s\n", uniqwidth-1, i, margin-uniqwidth-1, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1234 	  else                  { if (fprintf(fp, "%-*s %s\n",                      margin-1,           msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1235 
1236 	  if (msa->ss && msa->ss[i]) {
1237 	    strncpy(buf, msa->ss[i] + currpos, acpl);
1238 	    if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, "SS", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1239 	    else                  { if (fprintf(fp, "#=GR %-*s %-*s %s\n",                      maxname, msa->sqname[i], margin-maxname-7,           "SS", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1240 	  }
1241 	  if (msa->sa && msa->sa[i]) {
1242 	    strncpy(buf, msa->sa[i] + currpos, acpl);
1243 	    if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, "SA", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1244 	    else                  { if (fprintf(fp, "#=GR %-*s %-*s %s\n",                      maxname, msa->sqname[i], margin-maxname-7,           "SA", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1245 	  }
1246 	  if (msa->pp && msa->pp[i]) {
1247 	    strncpy(buf, msa->pp[i] + currpos, acpl);
1248 	    if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, "PP", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1249 	    else                  { if (fprintf(fp, "#=GR %-*s %-*s %s\n",                      maxname, msa->sqname[i], margin-maxname-7,           "PP", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1250 	  }
1251 	  for (j = 0; j < msa->ngr; j++)
1252 	    if (msa->gr[j][i]) {
1253 	      strncpy(buf, msa->gr[j][i] + currpos, acpl);
1254 	      if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, msa->gr_tag[j], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1255 	      else                  { if (fprintf(fp, "#=GR %-*s %-*s %s\n",                      maxname, msa->sqname[i], margin-maxname-7,           msa->gr_tag[j], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1256 	    }
1257 	}
1258 
1259       if (msa->ss_cons) {
1260 	strncpy(buf, msa->ss_cons + currpos, acpl);
1261 	if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "SS_cons", buf)      < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1262       }
1263       if (msa->sa_cons) {
1264 	strncpy(buf, msa->sa_cons + currpos, acpl);
1265 	if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "SA_cons", buf)      < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1266       }
1267       if (msa->pp_cons) {
1268 	strncpy(buf, msa->pp_cons + currpos, acpl);
1269 	if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "PP_cons", buf)      < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1270       }
1271       if (msa->rf) {
1272   strncpy(buf, msa->rf + currpos, acpl);
1273   if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "RF", buf)           < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1274       }
1275       if (msa->mm) {
1276   strncpy(buf, msa->mm + currpos, acpl);
1277   if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "MM", buf)           < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1278       }
1279       for (j = 0; j < msa->ngc; j++) {
1280 	strncpy(buf, msa->gc[j] + currpos, acpl);
1281 	if (fprintf(fp, "#=GC %-*s %s\n", margin-6, msa->gc_tag[j], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1282       }
1283     }
1284   if (fprintf(fp, "//\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1285   free(buf);
1286   return eslOK;
1287 
1288  ERROR:
1289   if (buf != NULL) free(buf);
1290   return status;
1291 }
1292 /*----------------- end, writing Stockholm/Pfam -----------------*/
1293 
1294 
1295 
1296 /*****************************************************************
1297  * 6. Unit tests
1298  *****************************************************************/
1299 #ifdef eslMSAFILE_STOCKHOLM_TESTDRIVE
1300 
1301 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)1302 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
1303 {
1304   fputs("# STOCKHOLM 1.0\n", ofp);
1305   fputs("#\n", ofp);
1306   fputs("# This is an example of a Stockholm multiple sequence alignment\n", ofp);
1307   fputs("# file. It is deliberately designed to be weird, to exercise many of the\n", ofp);
1308   fputs("# features of Stockholm format, in order to test a parser.\n", ofp);
1309   fputs("#\n", ofp);
1310   fputs("#=GF ID   14-3-3\n", ofp);
1311   fputs("#=GF AC   PF00244\n", ofp);
1312   fputs("#=GF DE   14-3-3 proteins\n", ofp);
1313   fputs("#=GF AU   Finn RD\n", ofp);
1314   fputs("#=GF AL   Clustalw\n", ofp);
1315   fputs("#=GF SE   Prosite\n", ofp);
1316   fputs("#=GF GA   25 25\n", ofp);
1317   fputs("#=GF TC   35.40 35.40\n", ofp);
1318   fputs("#=GF NC   8.80 8.80\n", ofp);
1319   fputs("#=GF BM   hmmbuild -f HMM SEED\n", ofp);
1320   fputs("#=GF BM   hmmcalibrate --seed 0 HMM\n", ofp);
1321   fputs("#=GF RN   [1]\n", ofp);
1322   fputs("#=GF RM   95327195\n", ofp);
1323   fputs("#=GF RT   Structure of a 14-3-3 protein and implications for\n", ofp);
1324   fputs("#=GF RT   coordination of multiple signalling pathways. \n", ofp);
1325   fputs("#=GF RA   Xiao B, Smerdon SJ, Jones DH, Dodson GG, Soneji Y, Aitken\n", ofp);
1326   fputs("#=GF RA   A, Gamblin SJ; \n", ofp);
1327   fputs("#=GF RL   Nature 1995;376:188-191.\n", ofp);
1328   fputs("#=GF RN   [2]\n", ofp);
1329   fputs("#=GF RM   95327196\n", ofp);
1330   fputs("#=GF RT   Crystal structure of the zeta isoform of the 14-3-3\n", ofp);
1331   fputs("#=GF RT   protein. \n", ofp);
1332   fputs("#=GF RA   Liu D, Bienkowska J, Petosa C, Collier RJ, Fu H, Liddington\n", ofp);
1333   fputs("#=GF RA   R; \n", ofp);
1334   fputs("#=GF RL   Nature 1995;376:191-194.\n", ofp);
1335   fputs("#=GF DR   PROSITE; PDOC00633;\n", ofp);
1336   fputs("#=GF DR   SMART; 14_3_3;\n", ofp);
1337   fputs("#=GF DR   PRINTS; PR00305;\n", ofp);
1338   fputs("#=GF SQ   119\n", ofp);
1339   fputs("	\n", ofp);
1340   fputs("#=GS 1431_ENTHI/4-239 WT  0.42\n", ofp);
1341   fputs("#=GS seq1             WT  0.40\n", ofp);
1342   fputs("#=GS seq2             WT  0.41\n", ofp);
1343   fputs("#=GS seq3             WT  0.43\n", ofp);
1344   fputs("#=GS seq4             WT  0.44\n", ofp);
1345   fputs("#=GS seq5             WT  0.45\n", ofp);
1346   fputs("#=GS seq6             WT  0.46\n", ofp);
1347   fputs("\n", ofp);
1348   fputs("#=GS seq4             AC  PF00001\n", ofp);
1349   fputs("#=GS seq4             DE  A description of seq4.\n", ofp);
1350   fputs("\n", ofp);
1351   fputs("#=GS seq1             NEWTAG  foo\n", ofp);
1352   fputs("#=GS seq2             NEWTAG  bar\n", ofp);
1353   fputs("#=GS seq3             NEWTAG  baz  \n", ofp);
1354   fputs("\n", ofp);
1355   fputs("#=GS seq3             TAG2    foo2\n", ofp);
1356   fputs("#=GS seq4             TAG2    foo3\n", ofp);
1357   fputs("#=GS seq5             TAG2    foo4\n", ofp);
1358   fputs("\n", ofp);
1359   fputs("#=GC SS_cons                 xxxxxxxxxxxxxxxxxxx\n", ofp);
1360   fputs("#=GC SA_cons                 xxxxxxxxxxxxxxxxxxx  \n", ofp);
1361   fputs("#=GC New_long_tag_thingie    xxxxxxxxxxxxxxxxxxx\n", ofp);
1362   fputs("1431_ENTHI/4-239             ACDEFGHKLMNPQRSTVWY             \n", ofp);
1363   fputs("#=GR seq1 SS                 ...................  \n", ofp);
1364   fputs("#=GR seq1 SA                 0000000000000000000\n", ofp);
1365   fputs("seq1                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1366   fputs("seq2                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1367   fputs("#=GR seq3 PP                 0000000000000000000  \n", ofp);
1368   fputs("seq3                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1369   fputs("seq4                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1370   fputs("seq5                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1371   fputs("seq6                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1372   fputs("#=GR seq6 SS                 ...................\n", ofp);
1373   fputs("#=GR seq6 SA                 9999999999999999999\n", ofp);
1374   fputs("#=GR seq6 Invented_tag       *******************\n", ofp);
1375   fputs("#=GR seq6 Another_tag        -------------------\n", ofp);
1376   fputs("#=GC PP_cons                 *******************  \n", ofp);
1377   fputs("    \n", ofp);
1378   fputs("  \n", ofp);
1379   fputs("#=GC SS_cons                 xxxxxxxxxxxxxxxxxxx\n", ofp);
1380   fputs("#=GC SA_cons                 xxxxxxxxxxxxxxxxxxx\n", ofp);
1381   fputs("#=GC New_long_tag_thingie    xxxxxxxxxxxxxxxxxxx    \n", ofp);
1382   fputs("1431_ENTHI/4-239             ACDEFGHKLMNPQRSTVWY   \n", ofp);
1383   fputs("#=GR seq1 SS                 ...................\n", ofp);
1384   fputs("#=GR seq1 SA                 0000000000000000000\n", ofp);
1385   fputs("seq1                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1386   fputs("seq2                         ACDEFGHKLMNPQRSTVWY        \n", ofp);
1387   fputs("#=GR seq3 PP                 0000000000000000000  \n", ofp);
1388   fputs("seq3                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1389   fputs("seq4                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1390   fputs("seq5                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1391   fputs("seq6                         ACDEFGHKLMNPQRSTVWY\n", ofp);
1392   fputs("#=GR seq6 SS                 ...................\n", ofp);
1393   fputs("#=GR seq6 SA                 9999999999999999999\n", ofp);
1394   fputs("#=GR seq6 Invented_tag       ******************* \n", ofp);
1395   fputs("#=GR seq6 Another_tag        -------------------\n", ofp);
1396   fputs("#=GC PP_cons                 *******************  \n", ofp);
1397   fputs("\n", ofp);
1398   fputs("#\n", ofp);
1399   fputs("# And here's some trailing comments, just to\n", ofp);
1400   fputs("# try to confuse a parser.\n", ofp);
1401   fputs("#\n", ofp);
1402   fputs("\n", ofp);
1403   fputs("//\n", ofp);
1404 
1405   *ret_alphatype = eslAMINO;
1406   *ret_nseq      = 7;
1407   *ret_alen      = 38;
1408 }
1409 
1410 static void
utest_write_badformat1(FILE * ofp,int * ret_linenumber,char * errmsg)1411 utest_write_badformat1(FILE *ofp, int *ret_linenumber, char *errmsg)
1412 {
1413   fputs("missing Stockholm header\n", ofp);
1414 
1415   *ret_linenumber = 1;
1416   strcpy(errmsg, "missing Stockholm header");
1417 }
1418 
1419 static void
utest_write_badformat2(FILE * ofp,int * ret_linenumber,char * errmsg)1420 utest_write_badformat2(FILE *ofp, int *ret_linenumber, char *errmsg)
1421 {
1422   fputs("# STOCKHOLM 1.0\n", ofp);
1423   fputs("\n", ofp);
1424   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1425   fputs("seq2  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1426   fputs("\n", ofp);
1427   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1428   fputs("//\n", ofp);
1429 
1430   *ret_linenumber = 7;
1431   strcpy(errmsg, "number of seqs in block did not match number in earlier block(s)");
1432 }
1433 
1434 static void
utest_write_badformat3(FILE * ofp,int * ret_linenumber,char * errmsg)1435 utest_write_badformat3(FILE *ofp, int *ret_linenumber, char *errmsg)
1436 {
1437   fputs("# STOCKHOLM 1.0\n", ofp);
1438   fputs("\n", ofp);
1439   fputs("#=GS seq1 FOO baz\n", ofp);
1440   fputs("#=GS seq2 FOO boz\n", ofp);
1441   fputs("\n", ofp);
1442   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1443   fputs("//\n", ofp);
1444 
1445   *ret_linenumber = 7;
1446   strcpy(errmsg, "number of seqs in block did not match number annotated by #=GS lines");
1447 }
1448 
1449 static void
utest_write_badformat4(FILE * ofp,int * ret_linenumber,char * errmsg)1450 utest_write_badformat4(FILE *ofp, int *ret_linenumber, char *errmsg)
1451 {
1452   fputs("# STOCKHOLM 1.0\n", ofp);
1453   fputs("\n", ofp);
1454   fputs("seq1    ACDEFGHIKLMNPQRSTVWY\n", ofp);
1455   fputs("seq2    ACDEFGHIKLMNPQRSTVWY\n", ofp);
1456 
1457   *ret_linenumber = 4;
1458   strcpy(errmsg, "missing // terminator after MSA");
1459 }
1460 
1461 static void
utest_write_badformat5(FILE * ofp,int * ret_linenumber,char * errmsg)1462 utest_write_badformat5(FILE *ofp, int *ret_linenumber, char *errmsg)
1463 {
1464   fputs("# STOCKHOLM 1.0\n", ofp);
1465   fputs("\n", ofp);
1466   fputs("# No data\n", ofp);
1467   fputs("//\n", ofp);
1468 
1469   *ret_linenumber = 4;
1470   strcpy(errmsg, "no alignment data followed Stockholm header");
1471 }
1472 
1473 static void
utest_write_badformat6(FILE * ofp,int * ret_linenumber,char * errmsg)1474 utest_write_badformat6(FILE *ofp, int *ret_linenumber, char *errmsg)
1475 {
1476   fputs("# STOCKHOLM 1.0\n", ofp);
1477   fputs("\n", ofp);
1478   fputs("#=GF \n", ofp);
1479   fputs("\n", ofp);
1480   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1481   fputs("seq2  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1482   fputs("//\n", ofp);
1483 
1484   *ret_linenumber = 3;
1485   strcpy(errmsg, "#=GF line is missing <tag>, annotation");
1486 }
1487 
1488 static void
utest_write_badformat7(FILE * ofp,int * ret_linenumber,char * errmsg)1489 utest_write_badformat7(FILE *ofp, int *ret_linenumber, char *errmsg)
1490 {
1491   fputs("# STOCKHOLM 1.0\n", ofp);
1492   fputs("#=GFX tag\n", ofp);
1493   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1494   fputs("//\n", ofp);
1495 
1496   *ret_linenumber = 2;
1497   strcpy(errmsg, "faux #=GF line");
1498 }
1499 
1500 static void
utest_write_badformat8(FILE * ofp,int * ret_linenumber,char * errmsg)1501 utest_write_badformat8(FILE *ofp, int *ret_linenumber, char *errmsg)
1502 {
1503   fputs("# STOCKHOLM 1.0\n", ofp);
1504   fputs("#=GF ID\n", ofp);
1505   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1506   fputs("//\n", ofp);
1507 
1508   *ret_linenumber = 2;
1509   strcpy(errmsg, "No name found on #=GF ID line");
1510 }
1511 
1512 static void
utest_write_badformat9(FILE * ofp,int * ret_linenumber,char * errmsg)1513 utest_write_badformat9(FILE *ofp, int *ret_linenumber, char *errmsg)
1514 {
1515   fputs("# STOCKHOLM 1.0\n", ofp);
1516   fputs("#=GF ID name with cruft\n", ofp);
1517   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1518   fputs("//\n", ofp);
1519 
1520   *ret_linenumber = 2;
1521   strcpy(errmsg, "#=GF ID line should have only one name (no whitespace allowed)");
1522 }
1523 
1524 static void
utest_write_badformat10(FILE * ofp,int * ret_linenumber,char * errmsg)1525 utest_write_badformat10(FILE *ofp, int *ret_linenumber, char *errmsg)
1526 {
1527   fputs("# STOCKHOLM 1.0\n", ofp);
1528   fputs("#=GF AC      \n", ofp);
1529   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1530   fputs("//\n", ofp);
1531 
1532   *ret_linenumber = 2;
1533   strcpy(errmsg, "No accession found on #=GF AC line");
1534 }
1535 
1536 static void
utest_write_badformat11(FILE * ofp,int * ret_linenumber,char * errmsg)1537 utest_write_badformat11(FILE *ofp, int *ret_linenumber, char *errmsg)
1538 {
1539   fputs("# STOCKHOLM 1.0\n", ofp);
1540   fputs("#=GF AC accession with cruft     \n", ofp);
1541   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1542   fputs("//\n", ofp);
1543 
1544   *ret_linenumber = 2;
1545   strcpy(errmsg, "#=GF AC line should have only one accession (no whitespace allowed)");
1546 }
1547 
1548 static void
utest_write_badformat12(FILE * ofp,int * ret_linenumber,char * errmsg)1549 utest_write_badformat12(FILE *ofp, int *ret_linenumber, char *errmsg)
1550 {
1551   fputs("# STOCKHOLM 1.0\n", ofp);
1552   fputs("#=GF GA not_a_number \n", ofp);
1553   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1554   fputs("//\n", ofp);
1555 
1556   *ret_linenumber = 2;
1557   strcpy(errmsg, "Expected a real number for GA1 value on #=GF GA line");
1558 }
1559 
1560 static void
utest_write_badformat13(FILE * ofp,int * ret_linenumber,char * errmsg)1561 utest_write_badformat13(FILE *ofp, int *ret_linenumber, char *errmsg)
1562 {
1563   fputs("# STOCKHOLM 1.0\n", ofp);
1564   fputs("#=GF GA 10.0  not_a_number \n", ofp);
1565   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1566   fputs("//\n", ofp);
1567 
1568   *ret_linenumber = 2;
1569   strcpy(errmsg, "Expected a real number for GA2 value on #=GF GA line");
1570 }
1571 
1572 static void
utest_write_badformat14(FILE * ofp,int * ret_linenumber,char * errmsg)1573 utest_write_badformat14(FILE *ofp, int *ret_linenumber, char *errmsg)
1574 {
1575   fputs("# STOCKHOLM 1.0\n", ofp);
1576   fputs("#=GF GA       \n", ofp);
1577   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1578   fputs("//\n", ofp);
1579 
1580   *ret_linenumber = 2;
1581   strcpy(errmsg, "No GA threshold value found on #=GF GA line");
1582 }
1583 
1584 static void
utest_write_badformat15(FILE * ofp,int * ret_linenumber,char * errmsg)1585 utest_write_badformat15(FILE *ofp, int *ret_linenumber, char *errmsg)
1586 {
1587   fputs("# STOCKHOLM 1.0\n", ofp);
1588   fputs("#=GF NC not_a_number \n", ofp);
1589   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1590   fputs("//\n", ofp);
1591 
1592   *ret_linenumber = 2;
1593   strcpy(errmsg, "Expected a real number for NC1 value on #=GF NC line");
1594 }
1595 
1596 static void
utest_write_badformat16(FILE * ofp,int * ret_linenumber,char * errmsg)1597 utest_write_badformat16(FILE *ofp, int *ret_linenumber, char *errmsg)
1598 {
1599   fputs("# STOCKHOLM 1.0\n", ofp);
1600   fputs("#=GF NC 10.0  not_a_number \n", ofp);
1601   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1602   fputs("//\n", ofp);
1603 
1604   *ret_linenumber = 2;
1605   strcpy(errmsg, "Expected a real number for NC2 value on #=GF NC line");
1606 }
1607 
1608 static void
utest_write_badformat17(FILE * ofp,int * ret_linenumber,char * errmsg)1609 utest_write_badformat17(FILE *ofp, int *ret_linenumber, char *errmsg)
1610 {
1611   fputs("# STOCKHOLM 1.0\n", ofp);
1612   fputs("#=GF NC       \n", ofp);
1613   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1614   fputs("//\n", ofp);
1615 
1616   *ret_linenumber = 2;
1617   strcpy(errmsg, "No NC threshold value found on #=GF NC line");
1618 }
1619 
1620 static void
utest_write_badformat18(FILE * ofp,int * ret_linenumber,char * errmsg)1621 utest_write_badformat18(FILE *ofp, int *ret_linenumber, char *errmsg)
1622 {
1623   fputs("# STOCKHOLM 1.0\n", ofp);
1624   fputs("#=GF TC not_a_number \n", ofp);
1625   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1626   fputs("//\n", ofp);
1627 
1628   *ret_linenumber = 2;
1629   strcpy(errmsg, "Expected a real number for TC1 value on #=GF TC line");
1630 }
1631 
1632 static void
utest_write_badformat19(FILE * ofp,int * ret_linenumber,char * errmsg)1633 utest_write_badformat19(FILE *ofp, int *ret_linenumber, char *errmsg)
1634 {
1635   fputs("# STOCKHOLM 1.0\n", ofp);
1636   fputs("#=GF TC 10.0  not_a_number \n", ofp);
1637   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1638   fputs("//\n", ofp);
1639 
1640   *ret_linenumber = 2;
1641   strcpy(errmsg, "Expected a real number for TC2 value on #=GF TC line");
1642 }
1643 
1644 static void
utest_write_badformat20(FILE * ofp,int * ret_linenumber,char * errmsg)1645 utest_write_badformat20(FILE *ofp, int *ret_linenumber, char *errmsg)
1646 {
1647   fputs("# STOCKHOLM 1.0\n", ofp);
1648   fputs("#=GF TC       \n", ofp);
1649   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1650   fputs("//\n", ofp);
1651 
1652   *ret_linenumber = 2;
1653   strcpy(errmsg, "No TC threshold value found on #=GF TC line");
1654 }
1655 
1656 static void
utest_write_badformat21(FILE * ofp,int * ret_linenumber,char * errmsg)1657 utest_write_badformat21(FILE *ofp, int *ret_linenumber, char *errmsg)
1658 {
1659   fputs("# STOCKHOLM 1.0\n", ofp);
1660   fputs("#=GS      \n", ofp);
1661   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
1662   fputs("//\n", ofp);
1663 
1664   *ret_linenumber = 2;
1665   strcpy(errmsg, "#=GS line missing <seqname>, <tag>, annotation");
1666 }
1667 
1668 static void
utest_write_badformat22(FILE * ofp,int * ret_linenumber,char * errmsg)1669 utest_write_badformat22(FILE *ofp, int *ret_linenumber, char *errmsg)
1670 {
1671   fputs("# STOCKHOLM 1.0\n", ofp);
1672   fputs("#=GS seq1    \n", ofp);
1673   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1674   fputs("//\n", ofp);
1675 
1676   *ret_linenumber = 2;
1677   strcpy(errmsg, "#=GS line missing <tag>, annotation");
1678 }
1679 
1680 static void
utest_write_badformat23(FILE * ofp,int * ret_linenumber,char * errmsg)1681 utest_write_badformat23(FILE *ofp, int *ret_linenumber, char *errmsg)
1682 {
1683   fputs("# STOCKHOLM 1.0\n", ofp);
1684   fputs("#=GSX seq1 foo   \n", ofp);
1685   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1686   fputs("//\n", ofp);
1687 
1688   *ret_linenumber = 2;
1689   strcpy(errmsg, "faux #=GS line");
1690 }
1691 
1692 static void
utest_write_badformat24(FILE * ofp,int * ret_linenumber,char * errmsg)1693 utest_write_badformat24(FILE *ofp, int *ret_linenumber, char *errmsg)
1694 {
1695   fputs("# STOCKHOLM 1.0\n", ofp);
1696   fputs("#=GS seq1 WT   \n", ofp);
1697   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1698   fputs("//\n", ofp);
1699 
1700   *ret_linenumber = 2;
1701   strcpy(errmsg, "no weight value found on #=GS <seqname> WT line");
1702 }
1703 
1704 static void
utest_write_badformat25(FILE * ofp,int * ret_linenumber,char * errmsg)1705 utest_write_badformat25(FILE *ofp, int *ret_linenumber, char *errmsg)
1706 {
1707   fputs("# STOCKHOLM 1.0\n", ofp);
1708   fputs("#=GS seq1 WT 2.0  \n", ofp);
1709   fputs("#=GS seq1 WT 2.0  \n", ofp);
1710   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1711   fputs("//\n", ofp);
1712 
1713   *ret_linenumber = 3;
1714   strcpy(errmsg, "sequence has more than one #=GS <seqname> WT line");
1715 }
1716 
1717 static void
utest_write_badformat26(FILE * ofp,int * ret_linenumber,char * errmsg)1718 utest_write_badformat26(FILE *ofp, int *ret_linenumber, char *errmsg)
1719 {
1720   fputs("# STOCKHOLM 1.0\n", ofp);
1721   fputs("#=GS seq1 WT 2.0 cruft \n", ofp);
1722   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1723   fputs("//\n", ofp);
1724 
1725   *ret_linenumber = 2;
1726   strcpy(errmsg, "#=GS <seqname> WT line should have only one field, the weight");
1727 }
1728 
1729 static void
utest_write_badformat27(FILE * ofp,int * ret_linenumber,char * errmsg)1730 utest_write_badformat27(FILE *ofp, int *ret_linenumber, char *errmsg)
1731 {
1732   fputs("# STOCKHOLM 1.0\n", ofp);
1733   fputs("#=GS seq1 WT cruft \n", ofp);
1734   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1735   fputs("//\n", ofp);
1736 
1737   *ret_linenumber = 2;
1738   strcpy(errmsg, "value on #=GS <seqname> WT line isn't a real number");
1739 }
1740 
1741 static void
utest_write_badformat28(FILE * ofp,int * ret_linenumber,char * errmsg)1742 utest_write_badformat28(FILE *ofp, int *ret_linenumber, char *errmsg)
1743 {
1744   fputs("# STOCKHOLM 1.0\n", ofp);
1745   fputs("#=GS seq1 AC   \n", ofp);
1746   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1747   fputs("//\n", ofp);
1748 
1749   *ret_linenumber = 2;
1750   strcpy(errmsg, "no accession found on #=GS <seqname> AC line");
1751 }
1752 
1753 static void
utest_write_badformat29(FILE * ofp,int * ret_linenumber,char * errmsg)1754 utest_write_badformat29(FILE *ofp, int *ret_linenumber, char *errmsg)
1755 {
1756   fputs("# STOCKHOLM 1.0\n", ofp);
1757   fputs("#=GS seq1 AC xxxxxx  \n", ofp);
1758   fputs("#=GS seq1 AC yyyyyy  \n", ofp);
1759   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1760   fputs("//\n", ofp);
1761 
1762   *ret_linenumber = 3;
1763   strcpy(errmsg, "sequence has more than one #=GS <seqname> AC accession line");
1764 }
1765 
1766 static void
utest_write_badformat30(FILE * ofp,int * ret_linenumber,char * errmsg)1767 utest_write_badformat30(FILE *ofp, int *ret_linenumber, char *errmsg)
1768 {
1769   fputs("# STOCKHOLM 1.0\n", ofp);
1770   fputs("#=GS seq1 AC xxxxxx  cruft \n", ofp);
1771   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1772   fputs("//\n", ofp);
1773 
1774   *ret_linenumber = 2;
1775   strcpy(errmsg, "#=GS <seqname> AC line should have only one field, the accession");
1776 }
1777 
1778 static void
utest_write_badformat31(FILE * ofp,int * ret_linenumber,char * errmsg)1779 utest_write_badformat31(FILE *ofp, int *ret_linenumber, char *errmsg)
1780 {
1781   fputs("# STOCKHOLM 1.0\n", ofp);
1782   fputs("#=GS seq1 DE a one line description \n", ofp);
1783   fputs("#=GS seq1 DE oops, a second line \n", ofp);
1784   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1785   fputs("//\n", ofp);
1786 
1787   *ret_linenumber = 3;
1788   strcpy(errmsg, "sequence has more than one #=GS <seqname> DE accession line");
1789 }
1790 
1791 static void
utest_write_badformat32(FILE * ofp,int * ret_linenumber,char * errmsg)1792 utest_write_badformat32(FILE *ofp, int *ret_linenumber, char *errmsg)
1793 {
1794   fputs("# STOCKHOLM 1.0\n", ofp);
1795   fputs("#=GC \n", ofp);
1796   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1797   fputs("//\n", ofp);
1798 
1799   *ret_linenumber = 2;
1800   strcpy(errmsg, "#=GC line missing <tag>, annotation");
1801 }
1802 
1803 static void
utest_write_badformat33(FILE * ofp,int * ret_linenumber,char * errmsg)1804 utest_write_badformat33(FILE *ofp, int *ret_linenumber, char *errmsg)
1805 {
1806   fputs("# STOCKHOLM 1.0\n", ofp);
1807   fputs("#=GCX FOO xxxxxxxxxxxxxxxxxxxx \n", ofp);
1808   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1809   fputs("//\n", ofp);
1810 
1811   *ret_linenumber = 2;
1812   strcpy(errmsg, "faux #=GC line");
1813 }
1814 
1815 static void
utest_write_badformat34(FILE * ofp,int * ret_linenumber,char * errmsg)1816 utest_write_badformat34(FILE *ofp, int *ret_linenumber, char *errmsg)
1817 {
1818   fputs("# STOCKHOLM 1.0\n", ofp);
1819   fputs("#=GC FOO  \n", ofp);
1820   fputs("seq1      ACDEFGHIKLMNPQRSTVWY\n", ofp);
1821   fputs("//\n", ofp);
1822 
1823   *ret_linenumber = 2;
1824   strcpy(errmsg, "#=GC line missing annotation");
1825 }
1826 
1827 static void
utest_write_badformat35(FILE * ofp,int * ret_linenumber,char * errmsg)1828 utest_write_badformat35(FILE *ofp, int *ret_linenumber, char *errmsg)
1829 {
1830   fputs("# STOCKHOLM 1.0\n", ofp);
1831   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1832   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1833   fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1834   fputs("\n", ofp);
1835   fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1836   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1837   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1838   fputs("//\n", ofp);
1839 
1840   *ret_linenumber = 6;
1841   strcpy(errmsg, "unexpected #=GC SS_cons");
1842 }
1843 
1844 static void
utest_write_badformat36(FILE * ofp,int * ret_linenumber,char * errmsg)1845 utest_write_badformat36(FILE *ofp, int *ret_linenumber, char *errmsg)
1846 {
1847   fputs("# STOCKHOLM 1.0\n", ofp);
1848   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1849   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1850   fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1851   fputs("\n", ofp);
1852   fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1853   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1854   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1855   fputs("//\n", ofp);
1856 
1857   *ret_linenumber = 6;
1858   strcpy(errmsg, "unexpected #=GC SA_cons");
1859 }
1860 
1861 static void
utest_write_badformat37(FILE * ofp,int * ret_linenumber,char * errmsg)1862 utest_write_badformat37(FILE *ofp, int *ret_linenumber, char *errmsg)
1863 {
1864   fputs("# STOCKHOLM 1.0\n", ofp);
1865   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1866   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1867   fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1868   fputs("\n", ofp);
1869   fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1870   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1871   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1872   fputs("//\n", ofp);
1873 
1874   *ret_linenumber = 6;
1875   strcpy(errmsg, "unexpected #=GC PP_cons");
1876 }
1877 
1878 static void
utest_write_badformat38(FILE * ofp,int * ret_linenumber,char * errmsg)1879 utest_write_badformat38(FILE *ofp, int *ret_linenumber, char *errmsg)
1880 {
1881   fputs("# STOCKHOLM 1.0\n", ofp);
1882   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1883   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1884   fputs("#=GC RF      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1885   fputs("\n", ofp);
1886   fputs("#=GC RF      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1887   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1888   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1889   fputs("//\n", ofp);
1890 
1891   *ret_linenumber = 6;
1892   strcpy(errmsg, "unexpected #=GC RF");
1893 }
1894 
1895 static void
utest_write_badformat39(FILE * ofp,int * ret_linenumber,char * errmsg)1896 utest_write_badformat39(FILE *ofp, int *ret_linenumber, char *errmsg)
1897 {
1898   fputs("# STOCKHOLM 1.0\n", ofp);
1899   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1900   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1901   fputs("#=GC XX      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1902   fputs("\n", ofp);
1903   fputs("#=GC XX      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1904   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1905   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1906   fputs("//\n", ofp);
1907 
1908   *ret_linenumber = 6;
1909   strcpy(errmsg, "unexpected #=GC line");
1910 }
1911 
1912 static void
utest_write_badformat40(FILE * ofp,int * ret_linenumber,char * errmsg)1913 utest_write_badformat40(FILE *ofp, int *ret_linenumber, char *errmsg)
1914 {
1915   fputs("# STOCKHOLM 1.0\n", ofp);
1916   fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1917   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1918   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1919   fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1920   fputs("//\n", ofp);
1921 
1922   *ret_linenumber = 5;
1923   strcpy(errmsg, "more than one #=GC SS_cons line in block");
1924 }
1925 
1926 static void
utest_write_badformat41(FILE * ofp,int * ret_linenumber,char * errmsg)1927 utest_write_badformat41(FILE *ofp, int *ret_linenumber, char *errmsg)
1928 {
1929   fputs("# STOCKHOLM 1.0\n", ofp);
1930   fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1931   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1932   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1933   fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1934   fputs("//\n", ofp);
1935 
1936   *ret_linenumber = 5;
1937   strcpy(errmsg, "more than one #=GC SA_cons line in block");
1938 }
1939 
1940 static void
utest_write_badformat42(FILE * ofp,int * ret_linenumber,char * errmsg)1941 utest_write_badformat42(FILE *ofp, int *ret_linenumber, char *errmsg)
1942 {
1943   fputs("# STOCKHOLM 1.0\n", ofp);
1944   fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1945   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1946   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1947   fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1948   fputs("//\n", ofp);
1949 
1950   *ret_linenumber = 5;
1951   strcpy(errmsg, "more than one #=GC PP_cons line in block");
1952 }
1953 
1954 static void
utest_write_badformat43(FILE * ofp,int * ret_linenumber,char * errmsg)1955 utest_write_badformat43(FILE *ofp, int *ret_linenumber, char *errmsg)
1956 {
1957   fputs("# STOCKHOLM 1.0\n", ofp);
1958   fputs("#=GC RF      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1959   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1960   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1961   fputs("#=GC RF      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1962   fputs("//\n", ofp);
1963 
1964   *ret_linenumber = 5;
1965   strcpy(errmsg, "more than one #=GC RF line in block");
1966 }
1967 
1968 static void
utest_write_badformat44(FILE * ofp,int * ret_linenumber,char * errmsg)1969 utest_write_badformat44(FILE *ofp, int *ret_linenumber, char *errmsg)
1970 {
1971   fputs("# STOCKHOLM 1.0\n", ofp);
1972   fputs("#=GC XX      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1973   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1974   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1975   fputs("#=GC XX      xxxxxxxxxxxxxxxxxxxx\n", ofp);
1976   fputs("//\n", ofp);
1977 
1978   *ret_linenumber = 5;
1979   strcpy(errmsg, "more than one #=GC XX line in block");
1980 }
1981 
1982 static void
utest_write_badformat45(FILE * ofp,int * ret_linenumber,char * errmsg)1983 utest_write_badformat45(FILE *ofp, int *ret_linenumber, char *errmsg)
1984 {
1985   fputs("# STOCKHOLM 1.0\n", ofp);
1986   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1987   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
1988   fputs("#=GC XX        xxxxxxxxxxxxxxxxxx\n", ofp);
1989   fputs("//\n", ofp);
1990 
1991   *ret_linenumber = 4;
1992   strcpy(errmsg, "unexpected # of aligned annotation in #=GC XX line");
1993 }
1994 
1995 static void
utest_write_badformat46(FILE * ofp,int * ret_linenumber,char * errmsg)1996 utest_write_badformat46(FILE *ofp, int *ret_linenumber, char *errmsg)
1997 {
1998   fputs("# STOCKHOLM 1.0\n", ofp);
1999   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2000   fputs("#=GR \n", ofp);
2001   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2002   fputs("//\n", ofp);
2003 
2004   *ret_linenumber = 3;
2005   strcpy(errmsg, "#=GR line missing <seqname>, <tag>, annotation");
2006 }
2007 
2008 static void
utest_write_badformat47(FILE * ofp,int * ret_linenumber,char * errmsg)2009 utest_write_badformat47(FILE *ofp, int *ret_linenumber, char *errmsg)
2010 {
2011   fputs("# STOCKHOLM 1.0\n", ofp);
2012   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2013   fputs("#=GR seq1  \n", ofp);
2014   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2015   fputs("//\n", ofp);
2016 
2017   *ret_linenumber = 3;
2018   strcpy(errmsg, "#=GR line missing <tag>, annotation");
2019 }
2020 
2021 static void
utest_write_badformat48(FILE * ofp,int * ret_linenumber,char * errmsg)2022 utest_write_badformat48(FILE *ofp, int *ret_linenumber, char *errmsg)
2023 {
2024   fputs("# STOCKHOLM 1.0\n", ofp);
2025   fputs("seq1          ACDEFGHIKLMNPQRSTVWY\n", ofp);
2026   fputs("#=GRX seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2027   fputs("seq2          ACDEFGHIKLMNPQRSTVWY\n", ofp);
2028   fputs("//\n", ofp);
2029 
2030   *ret_linenumber = 3;
2031   strcpy(errmsg, "faux #=GR line");
2032 }
2033 
2034 static void
utest_write_badformat49(FILE * ofp,int * ret_linenumber,char * errmsg)2035 utest_write_badformat49(FILE *ofp, int *ret_linenumber, char *errmsg)
2036 {
2037   fputs("# STOCKHOLM 1.0\n", ofp);
2038   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2039   fputs("#=GR seq1 XX  \n", ofp);
2040   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2041   fputs("//\n", ofp);
2042 
2043   *ret_linenumber = 3;
2044   strcpy(errmsg, "#=GR line missing annotation");
2045 }
2046 
2047 static void
utest_write_badformat50(FILE * ofp,int * ret_linenumber,char * errmsg)2048 utest_write_badformat50(FILE *ofp, int *ret_linenumber, char *errmsg)
2049 {
2050   fputs("# STOCKHOLM 1.0\n", ofp);
2051   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2052   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2053   fputs("#=GR seq2 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2054   fputs("\n", ofp);
2055   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2056   fputs("#=GR seq2 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2057   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2058   fputs("//\n", ofp);
2059 
2060   *ret_linenumber = 7;
2061   strcpy(errmsg, "unexpected #=GR <seqname> SS");
2062 }
2063 
2064 static void
utest_write_badformat51(FILE * ofp,int * ret_linenumber,char * errmsg)2065 utest_write_badformat51(FILE *ofp, int *ret_linenumber, char *errmsg)
2066 {
2067   fputs("# STOCKHOLM 1.0\n", ofp);
2068   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2069   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2070   fputs("#=GR seq2 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2071   fputs("\n", ofp);
2072   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2073   fputs("#=GR seq2 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2074   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2075   fputs("//\n", ofp);
2076 
2077   *ret_linenumber = 7;
2078   strcpy(errmsg, "unexpected #=GR <seqname> SA");
2079 }
2080 
2081 static void
utest_write_badformat52(FILE * ofp,int * ret_linenumber,char * errmsg)2082 utest_write_badformat52(FILE *ofp, int *ret_linenumber, char *errmsg)
2083 {
2084   fputs("# STOCKHOLM 1.0\n", ofp);
2085   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2086   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2087   fputs("#=GR seq2 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2088   fputs("\n", ofp);
2089   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2090   fputs("#=GR seq2 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2091   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2092   fputs("//\n", ofp);
2093 
2094   *ret_linenumber = 7;
2095   strcpy(errmsg, "unexpected #=GR <seqname> PP");
2096 }
2097 
2098 static void
utest_write_badformat53(FILE * ofp,int * ret_linenumber,char * errmsg)2099 utest_write_badformat53(FILE *ofp, int *ret_linenumber, char *errmsg)
2100 {
2101   fputs("# STOCKHOLM 1.0\n", ofp);
2102   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2103   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2104   fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2105   fputs("\n", ofp);
2106   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2107   fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2108   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2109   fputs("//\n", ofp);
2110 
2111   *ret_linenumber = 7;
2112   strcpy(errmsg, "unexpected #=GR line");
2113 }
2114 
2115 static void
utest_write_badformat54(FILE * ofp,int * ret_linenumber,char * errmsg)2116 utest_write_badformat54(FILE *ofp, int *ret_linenumber, char *errmsg)
2117 {
2118   fputs("# STOCKHOLM 1.0\n", ofp);
2119   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2120   fputs("#=GR seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2121   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2122   fputs("\n", ofp);
2123   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2124   fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2125   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2126   fputs("//\n", ofp);
2127 
2128   *ret_linenumber = 7;
2129   strcpy(errmsg, "unexpected seqname seq2; expected seq1 from prev blocks");
2130 }
2131 
2132 static void
utest_write_badformat55(FILE * ofp,int * ret_linenumber,char * errmsg)2133 utest_write_badformat55(FILE *ofp, int *ret_linenumber, char *errmsg)
2134 {
2135   fputs("# STOCKHOLM 1.0\n", ofp);
2136   fputs("#=GR seq1 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2137   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2138   fputs("#=GR seq1 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2139   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2140   fputs("//\n", ofp);
2141 
2142   *ret_linenumber = 4;
2143   strcpy(errmsg, "more than one #=GR seq1 SS line in block");
2144 }
2145 
2146 static void
utest_write_badformat56(FILE * ofp,int * ret_linenumber,char * errmsg)2147 utest_write_badformat56(FILE *ofp, int *ret_linenumber, char *errmsg)
2148 {
2149   fputs("# STOCKHOLM 1.0\n", ofp);
2150   fputs("#=GR seq1 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2151   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2152   fputs("#=GR seq1 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2153   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2154   fputs("//\n", ofp);
2155 
2156   *ret_linenumber = 4;
2157   strcpy(errmsg, "more than one #=GR seq1 PP line in block");
2158 }
2159 
2160 static void
utest_write_badformat57(FILE * ofp,int * ret_linenumber,char * errmsg)2161 utest_write_badformat57(FILE *ofp, int *ret_linenumber, char *errmsg)
2162 {
2163   fputs("# STOCKHOLM 1.0\n", ofp);
2164   fputs("#=GR seq1 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2165   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2166   fputs("#=GR seq1 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2167   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2168   fputs("//\n", ofp);
2169 
2170   *ret_linenumber = 4;
2171   strcpy(errmsg, "more than one #=GR seq1 SA line in block");
2172 }
2173 
2174 static void
utest_write_badformat58(FILE * ofp,int * ret_linenumber,char * errmsg)2175 utest_write_badformat58(FILE *ofp, int *ret_linenumber, char *errmsg)
2176 {
2177   fputs("# STOCKHOLM 1.0\n", ofp);
2178   fputs("#=GR seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2179   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2180   fputs("#=GR seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2181   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2182   fputs("//\n", ofp);
2183 
2184   *ret_linenumber = 4;
2185   strcpy(errmsg, "more than one #=GR seq1 XX line in block");
2186 }
2187 
2188 static void
utest_write_badformat59(FILE * ofp,int * ret_linenumber,char * errmsg)2189 utest_write_badformat59(FILE *ofp, int *ret_linenumber, char *errmsg)
2190 {
2191   fputs("# STOCKHOLM 1.0\n", ofp);
2192   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2193   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2194   fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxx\n", ofp);
2195   fputs("//\n", ofp);
2196 
2197   *ret_linenumber = 4;
2198   strcpy(errmsg, "unexpected # of aligned annotation in #=GR seq2 XX line");
2199 }
2200 
2201 static void
utest_write_badformat60(FILE * ofp,int * ret_linenumber,char * errmsg)2202 utest_write_badformat60(FILE *ofp, int *ret_linenumber, char *errmsg)
2203 {
2204   fputs("# STOCKHOLM 1.0\n", ofp);
2205   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2206   fputs("seq2 \n", ofp);
2207   fputs("//\n", ofp);
2208 
2209   *ret_linenumber = 3;
2210   strcpy(errmsg, "sequence line with no sequence");
2211 }
2212 
2213 static void
utest_write_badformat61(FILE * ofp,int * ret_linenumber,char * errmsg)2214 utest_write_badformat61(FILE *ofp, int *ret_linenumber, char *errmsg)
2215 {
2216   fputs("# STOCKHOLM 1.0\n", ofp);
2217   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
2218   fputs("seq2  ACDEFGHIKLMNPQRSTVWY\n", ofp);
2219   fputs("\n", ofp);
2220   fputs("seq1  ACDEFGHIKLMNPQRSTVWY\n", ofp);
2221   fputs("seq2  ACDEFGHIKLMNPQRSTVWY\n", ofp);
2222   fputs("seq2  ACDEFGHIKLMNPQRSTVWY\n", ofp);
2223   fputs("//\n", ofp);
2224 
2225   *ret_linenumber = 7;
2226   strcpy(errmsg, "more lines than expected; earlier blocks had fewer");
2227 }
2228 
2229 static void
utest_write_badformat62(FILE * ofp,int * ret_linenumber,char * errmsg)2230 utest_write_badformat62(FILE *ofp, int *ret_linenumber, char *errmsg)
2231 {
2232   fputs("# STOCKHOLM 1.0\n", ofp);
2233   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2234   fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2235   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2236   fputs("\n", ofp);
2237   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2238   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2239   fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2240   fputs("//\n", ofp);
2241 
2242   *ret_linenumber = 7;
2243   strcpy(errmsg, "unexpected seq line; earlier block(s) in different order");
2244 }
2245 
2246 static void
utest_write_badformat63(FILE * ofp,int * ret_linenumber,char * errmsg)2247 utest_write_badformat63(FILE *ofp, int *ret_linenumber, char *errmsg)
2248 {
2249   fputs("# STOCKHOLM 1.0\n", ofp);
2250   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2251   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2252   fputs("seq3         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2253   fputs("\n", ofp);
2254   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2255   fputs("seq3         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2256   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2257   fputs("//\n", ofp);
2258 
2259   *ret_linenumber = 7;
2260   strcpy(errmsg, "unexpected seq name seq3; expected seq2 from prev block order");
2261 }
2262 
2263 static void
utest_write_badformat64(FILE * ofp,int * ret_linenumber,char * errmsg)2264 utest_write_badformat64(FILE *ofp, int *ret_linenumber, char *errmsg)
2265 {
2266   fputs("# STOCKHOLM 1.0\n", ofp);
2267   fputs("seq1         ACDEFGHIKL^NPQRSTVWY\n", ofp);
2268   fputs("seq2         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2269   fputs("//\n", ofp);
2270 
2271   *ret_linenumber = 2;
2272   strcpy(errmsg, "invalid sequence character(s) on line");
2273 }
2274 
2275 static void
utest_write_badformat65(FILE * ofp,int * ret_linenumber,char * errmsg)2276 utest_write_badformat65(FILE *ofp, int *ret_linenumber, char *errmsg)
2277 {
2278   fputs("# STOCKHOLM 1.0\n", ofp);
2279   fputs("seq1         ACDEFGHIKLMNPQRSTVWY\n", ofp);
2280   fputs("seq2         ACDEFGHIKLMNPQRSTVWYZ\n", ofp);
2281   fputs("//\n", ofp);
2282 
2283   *ret_linenumber = 3;
2284   strcpy(errmsg, "unexpected number of aligned residues parsed on line");
2285 }
2286 
2287 
2288 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)2289 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
2290 {
2291   ESL_ALPHABET        *abc          = NULL;
2292   ESL_MSAFILE        *afp          = NULL;
2293   ESL_MSA             *msa1         = NULL;
2294   ESL_MSA             *msa2         = NULL;
2295   char                 tmpfile1[32] = "esltmpXXXXXX";
2296   char                 tmpfile2[32] = "esltmpXXXXXX";
2297   FILE                *ofp          = NULL;
2298   int                  status;
2299 
2300   /* guessing both the format and the alphabet should work: this is a digital open */
2301   if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("stockholm good file test %d failed: digital open", testnumber);
2302   if (abc->type   != expected_alphatype)                                                           esl_fatal("stockholm good file test %d failed: alphabet autodetection", testnumber);
2303   if (afp->format != eslMSAFILE_STOCKHOLM)                                                         esl_fatal("stockholm good file test %d failed: format autodetection",   testnumber); /* eslMSAFILE_PFAM is autodetected as STOCKHOLM, and that's fine */
2304 
2305   /* This is a digital read, using <abc>. */
2306   if ( (status = esl_msafile_stockholm_Read(afp, &msa1))   != eslOK) esl_fatal("stockholm good file test %d failed: msa read, digital", testnumber);
2307   if (msa1->nseq != expected_nseq || msa1->alen != expected_alen)    esl_fatal("stockholm good file test %d failed: nseq/alen",         testnumber);
2308   if (esl_msa_Validate(msa1, NULL) != eslOK)                         esl_fatal("stockholm good file test %d failed: msa1 invalid",      testnumber);
2309   esl_msafile_Close(afp);
2310 
2311   /* write it back out to a new tmpfile (digital write) */
2312   if ( (status = esl_tmpfile_named(tmpfile1, &ofp))                            != eslOK) esl_fatal("stockholm good file test %d failed: tmpfile creation",   testnumber);
2313   if ( (status = esl_msafile_stockholm_Write(ofp, msa1, eslMSAFILE_STOCKHOLM)) != eslOK) esl_fatal("stockholm good file test %d failed: msa write, digital", testnumber);
2314   fclose(ofp);
2315 
2316   /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
2317   if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp)) != eslOK) esl_fatal("stockholm good file test %d failed: text mode open", testnumber);
2318   if ( (status = esl_msafile_stockholm_Read(afp, &msa2))                                   != eslOK) esl_fatal("stockholm good file test %d failed: msa read, text", testnumber);
2319   if (msa2->nseq != expected_nseq || msa2->alen != expected_alen)                                    esl_fatal("stockholm good file test %d failed: nseq/alen",      testnumber);
2320   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                         esl_fatal("stockholm good file test %d failed: msa2 invalid",   testnumber);
2321   esl_msafile_Close(afp);
2322 
2323   /* write it back out to a new tmpfile (text write) */
2324   if ( (status = esl_tmpfile_named(tmpfile2, &ofp))                        != eslOK) esl_fatal("stockholm good file test %d failed: tmpfile creation", testnumber);
2325   if ( (status = esl_msafile_stockholm_Write(ofp, msa2, eslMSAFILE_PFAM))  != eslOK) esl_fatal("stockholm good file test %d failed: msa write, text",  testnumber);
2326   fclose(ofp);
2327   esl_msa_Destroy(msa2);
2328 
2329   /* open and read it in digital mode */
2330   if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_PFAM, NULL, &afp)) != eslOK) esl_fatal("stockholm good file test %d failed: 2nd digital mode open", testnumber);
2331   if ( (status = esl_msafile_stockholm_Read(afp, &msa2))                              != eslOK) esl_fatal("stockholm good file test %d failed: 2nd digital msa read",  testnumber);
2332   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                    esl_fatal("stockholm good file test %d failed: msa2 invalid",          testnumber);
2333   esl_msafile_Close(afp);
2334 
2335   /* this msa <msa2> should be identical to <msa1> */
2336   if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("stockholm good file test %d failed: msa compare", testnumber);
2337 
2338   remove(tmpfile1);
2339   remove(tmpfile2);
2340   esl_msa_Destroy(msa1);
2341   esl_msa_Destroy(msa2);
2342   esl_alphabet_Destroy(abc);
2343 }
2344 
2345 static void
utest_bad_format(char * filename,int testnumber,int expected_linenumber,char * expected_errmsg)2346 utest_bad_format(char *filename, int testnumber, int expected_linenumber, char *expected_errmsg)
2347 {
2348   ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
2349   ESL_MSAFILE *afp = NULL;
2350   int           fmt = eslMSAFILE_STOCKHOLM;
2351   ESL_MSA      *msa = NULL;
2352   int           status;
2353 
2354   if ( (status = esl_msafile_Open(&abc, filename, NULL, fmt, NULL, &afp)) != eslOK)  esl_fatal("stockholm bad format test %d failed: unexpected open failure", testnumber);
2355   if ( (status = esl_msafile_stockholm_Read(afp, &msa)) != eslEFORMAT)               esl_fatal("stockholm bad format test %d failed: unexpected error code",   testnumber);
2356   if (strstr(afp->errmsg, expected_errmsg) == NULL)                                  esl_fatal("stockholm bad format test %d failed: unexpected errmsg",       testnumber);
2357   if (afp->linenumber != expected_linenumber)                                        esl_fatal("stockholm bad format test %d failed: unexpected linenumber",   testnumber);
2358   esl_msafile_Close(afp);
2359   esl_alphabet_Destroy(abc);
2360   esl_msa_Destroy(msa);
2361 }
2362 
2363 
2364 static void
utest_good_format(ESL_ALPHABET ** byp_abc,int fmt,int expected_nseq,int64_t expected_alen,char * buf)2365 utest_good_format(ESL_ALPHABET **byp_abc, int fmt, int expected_nseq, int64_t expected_alen, char *buf)
2366 {
2367   char          msg[] = "good format test failed";
2368   ESL_MSAFILE *afp = NULL;
2369   ESL_MSA      *msa = NULL;
2370 
2371   if (esl_msafile_OpenMem(byp_abc, buf, strlen(buf), fmt, NULL, &afp) != eslOK) esl_fatal(msg);
2372   if (esl_msafile_stockholm_Read(afp, &msa)                           != eslOK) esl_fatal(msg);
2373   if (msa->nseq != expected_nseq)                                               esl_fatal(msg);
2374   if (msa->alen != expected_alen)                                               esl_fatal(msg);
2375 
2376   esl_msa_Destroy(msa);
2377   esl_msafile_Close(afp);
2378 }
2379 
2380 static void
utest_identical_io(ESL_ALPHABET ** byp_abc,int fmt,char * buf)2381 utest_identical_io(ESL_ALPHABET **byp_abc, int fmt, char *buf)
2382 {
2383   char   msg[]        = "identical io test failed";
2384   char   tmpfile1[32] = "esltmpXXXXXX";
2385   char   tmpfile2[32] = "esltmpXXXXXX";
2386   FILE  *fp = NULL;
2387   ESL_MSAFILE *afp = NULL;
2388   ESL_MSA *msa1 = NULL;
2389   ESL_MSA *msa2 = NULL;
2390 
2391   if (esl_tmpfile_named(tmpfile1, &fp) != eslOK) esl_fatal(msg);
2392   fputs(buf, fp);
2393   fclose(fp);
2394 
2395   if (esl_msafile_Open(byp_abc, tmpfile1, NULL, fmt, NULL, &afp) != eslOK) esl_fatal(msg);
2396   if (esl_msafile_stockholm_Read(afp, &msa1)                     != eslOK) esl_fatal(msg);
2397   esl_msafile_Close(afp);
2398 
2399   if (esl_tmpfile_named(tmpfile2, &fp) != eslOK) esl_fatal(msg);
2400   if (esl_msafile_stockholm_Write(fp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
2401   fclose(fp);
2402 
2403   if (esl_msafile_Open(byp_abc, tmpfile2, NULL, fmt, NULL, &afp) != eslOK) esl_fatal(msg);
2404   if (esl_msafile_stockholm_Read(afp, &msa2)                     != eslOK) esl_fatal(msg);
2405   esl_msafile_Close(afp);
2406 
2407   if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
2408 
2409   remove(tmpfile1);
2410   remove(tmpfile2);
2411   esl_msa_Destroy(msa1);
2412   esl_msa_Destroy(msa2);
2413 }
2414 
2415 static void
utest_bad_open(ESL_ALPHABET ** byp_abc,int fmt,int expected_status,char * buf)2416 utest_bad_open(ESL_ALPHABET **byp_abc, int fmt, int expected_status, char *buf)
2417 {
2418   char          msg[] = "bad open test failed";
2419   ESL_MSAFILE *afp   = NULL;
2420 
2421   if (esl_msafile_OpenMem(byp_abc, buf, strlen(buf), fmt, NULL, &afp) != expected_status) esl_fatal(msg);
2422   esl_msafile_Close(afp);
2423 }
2424 
2425 static void
utest_bad_read(ESL_ALPHABET ** byp_abc,int fmt,char * expected_errmsg,int expected_line,char * buf)2426 utest_bad_read(ESL_ALPHABET **byp_abc, int fmt, char *expected_errmsg, int expected_line, char *buf)
2427 {
2428   char          msg[] = "bad format test failed";
2429   ESL_MSAFILE *afp   = NULL;
2430   ESL_MSA      *msa   = NULL;
2431 
2432   if (esl_msafile_OpenMem(byp_abc, buf, strlen(buf), fmt, NULL, &afp) != eslOK)      esl_fatal(msg);
2433   if (esl_msafile_stockholm_Read(afp, &msa)                           != eslEFORMAT) esl_fatal(msg);
2434   if (strstr(afp->errmsg, expected_errmsg)                            == NULL)       esl_fatal(msg);
2435   if (afp->linenumber != expected_line)                                              esl_fatal(msg);
2436 
2437   esl_msa_Destroy(msa);
2438   esl_msafile_Close(afp);
2439 }
2440 #endif /*eslMSAFILE_STOCKHOLM_TESTDRIVE*/
2441 /*----------------- end, unit tests -----------------------------*/
2442 
2443 
2444 
2445 /*****************************************************************
2446  * 7. Test driver.
2447  *****************************************************************/
2448 #ifdef eslMSAFILE_STOCKHOLM_TESTDRIVE
2449 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_stockholm_utest -DeslMSAFILE_STOCKHOLM_TESTDRIVE esl_msafile_stockholm.c -leasel -lm
2450  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_stockholm_utest -DeslMSAFILE_STOCKHOLM_TESTDRIVE esl_msafile_stockholm.c -leasel -lm
2451  * run:     ./esl_msafile_stockholm_utest
2452  */
2453 #include "esl_config.h"
2454 
2455 #include <stdio.h>
2456 
2457 #include "easel.h"
2458 #include "esl_getopts.h"
2459 #include "esl_random.h"
2460 #include "esl_msafile.h"
2461 #include "esl_msafile_stockholm.h"
2462 
2463 static ESL_OPTIONS options[] = {
2464    /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
2465   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
2466   { 0,0,0,0,0,0,0,0,0,0},
2467 };
2468 static char usage[]  = "[-options]";
2469 static char banner[] = "test driver for Stockholm/Xfam MSA format module";
2470 
2471 int
main(int argc,char ** argv)2472 main(int argc, char **argv)
2473 {
2474   char            msg[]       = "Stockholm MSA i/o module test driver failed";
2475   ESL_GETOPTS    *go          = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2476   int             ngoodtests  = 1;
2477   int             nbadtests   = 65;
2478   char            tmpfile[32];
2479   FILE           *ofp;
2480   int             testnumber;
2481   int             expected_alphatype;
2482   int             expected_nseq;
2483   int             expected_alen;
2484   int             expected_linenumber;
2485   char            expected_errmsg[eslERRBUFSIZE];
2486 
2487   utest_bad_open(NULL, eslMSAFILE_UNKNOWN, eslENOFORMAT, "");
2488 
2489   utest_bad_read(NULL, eslMSAFILE_UNKNOWN, "missing // terminator", 1,  "# STOCKHOLM 1.0\n");
2490   utest_bad_read(NULL, eslMSAFILE_UNKNOWN, "no alignment data",     2,  "# STOCKHOLM 1.0\n//\n");
2491 
2492   utest_good_format(NULL, eslMSAFILE_UNKNOWN, 2, 10, "\n# STOCKHOLM 1.0\n\nseq1 ACDEFGHIKL\nseq2 ACDEFGHIKL\n\n//\n\n");
2493 
2494   utest_identical_io(NULL, eslMSAFILE_UNKNOWN, "# STOCKHOLM 1.0\n\nseq1 ACDEFGHIKL\nseq2 ACDEFGHIKL\n//\n");
2495 
2496   /* Various "good" files, that should be parsed correctly. */
2497   for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
2498     {
2499       strcpy(tmpfile, "esltmpXXXXXX");
2500       if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
2501       switch (testnumber) {
2502       case  1:  utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
2503       }
2504       fclose(ofp);
2505       utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
2506       remove(tmpfile);
2507     }
2508 
2509   /* Tests for all possible EFORMAT errors */
2510    for (testnumber = 1; testnumber <= nbadtests; testnumber++)
2511     {
2512       strcpy(tmpfile, "esltmpXXXXXX");
2513       if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
2514 
2515       switch (testnumber) {
2516       case  1:  utest_write_badformat1 (ofp, &expected_linenumber, expected_errmsg); break;
2517       case  2:  utest_write_badformat2 (ofp, &expected_linenumber, expected_errmsg); break;
2518       case  3:  utest_write_badformat3 (ofp, &expected_linenumber, expected_errmsg); break;
2519       case  4:  utest_write_badformat4 (ofp, &expected_linenumber, expected_errmsg); break;
2520       case  5:  utest_write_badformat5 (ofp, &expected_linenumber, expected_errmsg); break;
2521       case  6:  utest_write_badformat6 (ofp, &expected_linenumber, expected_errmsg); break;
2522       case  7:  utest_write_badformat7 (ofp, &expected_linenumber, expected_errmsg); break;
2523       case  8:  utest_write_badformat8 (ofp, &expected_linenumber, expected_errmsg); break;
2524       case  9:  utest_write_badformat9 (ofp, &expected_linenumber, expected_errmsg); break;
2525       case 10:  utest_write_badformat10(ofp, &expected_linenumber, expected_errmsg); break;
2526       case 11:  utest_write_badformat11(ofp, &expected_linenumber, expected_errmsg); break;
2527       case 12:  utest_write_badformat12(ofp, &expected_linenumber, expected_errmsg); break;
2528       case 13:  utest_write_badformat13(ofp, &expected_linenumber, expected_errmsg); break;
2529       case 14:  utest_write_badformat14(ofp, &expected_linenumber, expected_errmsg); break;
2530       case 15:  utest_write_badformat15(ofp, &expected_linenumber, expected_errmsg); break;
2531       case 16:  utest_write_badformat16(ofp, &expected_linenumber, expected_errmsg); break;
2532       case 17:  utest_write_badformat17(ofp, &expected_linenumber, expected_errmsg); break;
2533       case 18:  utest_write_badformat18(ofp, &expected_linenumber, expected_errmsg); break;
2534       case 19:  utest_write_badformat19(ofp, &expected_linenumber, expected_errmsg); break;
2535       case 20:  utest_write_badformat20(ofp, &expected_linenumber, expected_errmsg); break;
2536       case 21:  utest_write_badformat21(ofp, &expected_linenumber, expected_errmsg); break;
2537       case 22:  utest_write_badformat22(ofp, &expected_linenumber, expected_errmsg); break;
2538       case 23:  utest_write_badformat23(ofp, &expected_linenumber, expected_errmsg); break;
2539       case 24:  utest_write_badformat24(ofp, &expected_linenumber, expected_errmsg); break;
2540       case 25:  utest_write_badformat25(ofp, &expected_linenumber, expected_errmsg); break;
2541       case 26:  utest_write_badformat26(ofp, &expected_linenumber, expected_errmsg); break;
2542       case 27:  utest_write_badformat27(ofp, &expected_linenumber, expected_errmsg); break;
2543       case 28:  utest_write_badformat28(ofp, &expected_linenumber, expected_errmsg); break;
2544       case 29:  utest_write_badformat29(ofp, &expected_linenumber, expected_errmsg); break;
2545       case 30:  utest_write_badformat30(ofp, &expected_linenumber, expected_errmsg); break;
2546       case 31:  utest_write_badformat31(ofp, &expected_linenumber, expected_errmsg); break;
2547       case 32:  utest_write_badformat32(ofp, &expected_linenumber, expected_errmsg); break;
2548       case 33:  utest_write_badformat33(ofp, &expected_linenumber, expected_errmsg); break;
2549       case 34:  utest_write_badformat34(ofp, &expected_linenumber, expected_errmsg); break;
2550       case 35:  utest_write_badformat35(ofp, &expected_linenumber, expected_errmsg); break;
2551       case 36:  utest_write_badformat36(ofp, &expected_linenumber, expected_errmsg); break;
2552       case 37:  utest_write_badformat37(ofp, &expected_linenumber, expected_errmsg); break;
2553       case 38:  utest_write_badformat38(ofp, &expected_linenumber, expected_errmsg); break;
2554       case 39:  utest_write_badformat39(ofp, &expected_linenumber, expected_errmsg); break;
2555       case 40:  utest_write_badformat40(ofp, &expected_linenumber, expected_errmsg); break;
2556       case 41:  utest_write_badformat41(ofp, &expected_linenumber, expected_errmsg); break;
2557       case 42:  utest_write_badformat42(ofp, &expected_linenumber, expected_errmsg); break;
2558       case 43:  utest_write_badformat43(ofp, &expected_linenumber, expected_errmsg); break;
2559       case 44:  utest_write_badformat44(ofp, &expected_linenumber, expected_errmsg); break;
2560       case 45:  utest_write_badformat45(ofp, &expected_linenumber, expected_errmsg); break;
2561       case 46:  utest_write_badformat46(ofp, &expected_linenumber, expected_errmsg); break;
2562       case 47:  utest_write_badformat47(ofp, &expected_linenumber, expected_errmsg); break;
2563       case 48:  utest_write_badformat48(ofp, &expected_linenumber, expected_errmsg); break;
2564       case 49:  utest_write_badformat49(ofp, &expected_linenumber, expected_errmsg); break;
2565       case 50:  utest_write_badformat50(ofp, &expected_linenumber, expected_errmsg); break;
2566       case 51:  utest_write_badformat51(ofp, &expected_linenumber, expected_errmsg); break;
2567       case 52:  utest_write_badformat52(ofp, &expected_linenumber, expected_errmsg); break;
2568       case 53:  utest_write_badformat53(ofp, &expected_linenumber, expected_errmsg); break;
2569       case 54:  utest_write_badformat54(ofp, &expected_linenumber, expected_errmsg); break;
2570       case 55:  utest_write_badformat55(ofp, &expected_linenumber, expected_errmsg); break;
2571       case 56:  utest_write_badformat56(ofp, &expected_linenumber, expected_errmsg); break;
2572       case 57:  utest_write_badformat57(ofp, &expected_linenumber, expected_errmsg); break;
2573       case 58:  utest_write_badformat58(ofp, &expected_linenumber, expected_errmsg); break;
2574       case 59:  utest_write_badformat59(ofp, &expected_linenumber, expected_errmsg); break;
2575       case 60:  utest_write_badformat60(ofp, &expected_linenumber, expected_errmsg); break;
2576       case 61:  utest_write_badformat61(ofp, &expected_linenumber, expected_errmsg); break;
2577       case 62:  utest_write_badformat62(ofp, &expected_linenumber, expected_errmsg); break;
2578       case 63:  utest_write_badformat63(ofp, &expected_linenumber, expected_errmsg); break;
2579       case 64:  utest_write_badformat64(ofp, &expected_linenumber, expected_errmsg); break;
2580       case 65:  utest_write_badformat65(ofp, &expected_linenumber, expected_errmsg); break;
2581       }
2582       fclose(ofp);
2583 
2584       utest_bad_format(tmpfile, testnumber, expected_linenumber, expected_errmsg);
2585       remove(tmpfile);
2586     }
2587 
2588   esl_getopts_Destroy(go);
2589   return 0;
2590 }
2591 #endif /*eslMSAFILE_STOCKHOLM_TESTDRIVE*/
2592 /*---------------- end, test driver -----------------------------*/
2593 
2594 
2595 /*****************************************************************
2596  * 8. Examples.
2597  *****************************************************************/
2598 
2599 #ifdef eslMSAFILE_STOCKHOLM_EXAMPLE
2600 /* A full-featured example of reading/writing MSA(s) in Stockholm format.
2601    gcc -g -Wall -o esl_msafile_stockholm_example -I. -L. -DeslMSAFILE_STOCKHOLM_EXAMPLE esl_msafile_stockholm.c -leasel -lm
2602    ./esl_msafile_stockholm_example <msafile>
2603  */
2604 /*::cexcerpt::msafile_stockholm_example::begin::*/
2605 #include <stdio.h>
2606 
2607 #include "easel.h"
2608 #include "esl_alphabet.h"
2609 #include "esl_getopts.h"
2610 #include "esl_msa.h"
2611 #include "esl_msafile.h"
2612 #include "esl_msafile_stockholm.h"
2613 
2614 static ESL_OPTIONS options[] = {
2615   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
2616   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",            0 },
2617   { "-1",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "override autodetection; force Stockholm format",  0 },
2618   { "-q",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "quieter: don't write msa back, just summary",     0 },
2619   { "-t",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",              0 },
2620   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is DNA",                    0 },
2621   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is RNA",                    0 },
2622   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is protein",                0 },
2623   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2624 };
2625 static char usage[]  = "[-options] <msafile>";
2626 static char banner[] = "example of guessing, reading, writing Stockholm format";
2627 
2628 int
main(int argc,char ** argv)2629 main(int argc, char **argv)
2630 {
2631   ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
2632   char               *filename    = esl_opt_GetArg(go, 1);
2633   int                 infmt       = eslMSAFILE_UNKNOWN;
2634   ESL_ALPHABET       *abc         = NULL;
2635   ESL_MSAFILE        *afp         = NULL;
2636   ESL_MSA            *msa         = NULL;
2637   int                 status;
2638 
2639   if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_STOCKHOLM;  /* override format autodetection */
2640 
2641   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
2642   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
2643   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
2644 
2645   /* Text mode: pass NULL for alphabet.
2646    * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
2647    */
2648   if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
2649   else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
2650   if (status != eslOK) esl_msafile_OpenFailure(afp, status);
2651 
2652   while ( (status = esl_msafile_stockholm_Read(afp, &msa)) == eslOK)
2653     {
2654       printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
2655       printf("# of seqs:      %d\n", msa->nseq);
2656       printf("# of cols:      %d\n", (int) msa->alen);
2657       printf("\n");
2658 
2659       if (! esl_opt_GetBoolean(go, "-q"))
2660 	esl_msafile_stockholm_Write(stdout, msa, eslMSAFILE_STOCKHOLM);
2661 
2662       esl_msa_Destroy(msa);
2663     }
2664   if (status != eslEOF) esl_msafile_ReadFailure(afp, status);
2665 
2666   esl_msafile_Close(afp);
2667   if (abc) esl_alphabet_Destroy(abc);
2668   esl_getopts_Destroy(go);
2669   exit(0);
2670 }
2671 /*::cexcerpt::msafile_stockholm_example::end::*/
2672 #endif /*eslMSAFILE_STOCKHOLM_EXAMPLE*/
2673 
2674 
2675 
2676 #ifdef eslMSAFILE_STOCKHOLM_EXAMPLE2
2677 /* A minimal example. Read Stockholm MSAs, in text mode.
2678    gcc -g -Wall -o esl_msafile_stockholm_example2 -I. -L. -DeslMSAFILE_STOCKHOLM_EXAMPLE2 esl_msafile_stockholm.c -leasel -lm
2679    ./esl_msafile_stockholm_example2 <msafile>
2680  */
2681 
2682 /*::cexcerpt::msafile_stockholm_example::begin::*/
2683 #include <stdio.h>
2684 
2685 #include "easel.h"
2686 #include "esl_msa.h"
2687 #include "esl_msafile.h"
2688 #include "esl_msafile_stockholm.h"
2689 
2690 int
main(int argc,char ** argv)2691 main(int argc, char **argv)
2692 {
2693   char        *filename = argv[1];
2694   int          infmt    = eslMSAFILE_STOCKHOLM;
2695   ESL_MSAFILE  *afp     = NULL;
2696   ESL_MSA      *msa     = NULL;
2697   int           status;
2698 
2699   if ( (status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp)) != eslOK)
2700     esl_msafile_OpenFailure(afp, status);
2701 
2702   while  ( (status = esl_msafile_stockholm_Read(afp, &msa)) == eslOK)
2703     {
2704       printf("%15s: %6d seqs, %5d columns\n\n", msa->name, msa->nseq, (int) msa->alen);
2705       esl_msafile_stockholm_Write(stdout, msa, eslMSAFILE_STOCKHOLM);
2706       esl_msa_Destroy(msa);
2707     }
2708   if (status != eslEOF)  esl_msafile_ReadFailure(afp, status);
2709 
2710   esl_msafile_Close(afp);
2711   exit(0);
2712 }
2713 /*::cexcerpt::msafile_stockholm_example2::end::*/
2714 #endif /*eslMSAFILE_STOCKHOLM_EXAMPLE2*/
2715 /*--------------------- end of example --------------------------*/
2716