1 /* I/O of multiple sequence alignment files in Stockholm/Pfam format
2 * (Pfam format = always single block; Stockholm = multiblock allowed)
3 *
4 * Contents:
5 * 1. API for reading/writing Stockholm/Pfam input.
6 * 2. Internal: ESL_STOCKHOLM_PARSEDATA auxiliary structure.
7 * 3. Internal: parsing Stockholm line types.
8 * 4. Internal: looking up seq, tag indices.
9 * 5. Internal: writing Stockholm/Pfam formats
10 * 6. Unit tests.
11 * 7. Test driver.
12 * 8. Example.
13 */
14 #include "esl_config.h"
15
16 #include <string.h>
17 #include <ctype.h>
18
19 #include "easel.h"
20 #include "esl_alphabet.h"
21 #include "esl_mem.h"
22 #include "esl_msa.h"
23 #include "esl_msafile.h"
24
25 #include "esl_msafile_stockholm.h"
26
27 /* Valid line types in an alignment block */
28 #define eslSTOCKHOLM_LINE_SQ 1
29 #define eslSTOCKHOLM_LINE_GC_SSCONS 2
30 #define eslSTOCKHOLM_LINE_GC_SACONS 3
31 #define eslSTOCKHOLM_LINE_GC_PPCONS 4
32 #define eslSTOCKHOLM_LINE_GC_RF 5
33 #define eslSTOCKHOLM_LINE_GC_OTHER 6
34 #define eslSTOCKHOLM_LINE_GR_SS 7
35 #define eslSTOCKHOLM_LINE_GR_SA 8
36 #define eslSTOCKHOLM_LINE_GR_PP 9
37 #define eslSTOCKHOLM_LINE_GR_OTHER 10
38 #define eslSTOCKHOLM_LINE_GC_MM 11
39
40 typedef struct {
41 /* information about the size of the growing alignment parse */
42 int nseq; /* # of sqnames currently stored, sqname[0..nseq-1]. Copy of msa->nseq */
43 int64_t alen; /* alignment length not including current block being parsed. Becomes msa->alen when done */
44
45 /* Having to do with the expected order of lines in each Stockholm block: */
46 int in_block; /* TRUE if we're in a block (GC, GR, or sequence lines) */
47 char *blinetype; /* blinetype[bi=0..npb-1] = code for linetype on parsed block line [bi]: GC, GR, or seq */
48 int *bidx; /* bidx[bi=0.npb-1] = seq index si=0..nseq-1 of seq or GR on parsed block line [bi]; or -1 for GC lines */
49 int npb; /* number of lines per block. Set by bi in 1st block; checked against bi thereafter */
50 int bi; /* index of current line in a block, 0..npb-1 */
51 int si; /* current (next expected) sequence index, 0..nseq */
52 int balloc; /* number of lines per block currently allocated for. */
53
54 /* Other information kept per block */
55 int nblock; /* current block number (starting at 0 while in first block) */
56 int nseq_b; /* number of sequences seen in this block so far */
57 int64_t alen_b; /* residues added by each seq field in curr block */
58
59 /* Having to do with the growing lengths (and numbers) of sequences and annotations in <msa>: */
60 /* yes, needed: used to catch dup lines in a block, such as seq1 xxx, seq1 xxx. */
61 int64_t ssconslen; /* current length of #=GC SS_cons annotation */
62 int64_t saconslen; /* current length of #=GC SA_cons annotation */
63 int64_t ppconslen; /* current length of #=GC PP_cons annotation */
64 int64_t rflen; /* current length of #=GC RF annotation */
65 int64_t mmasklen; /* current length of #=GC MM annotation */
66 int64_t *sqlen; /* current lengths of ax[0..nseq-1] or aseq[0..nseq-1] */
67 int64_t *sslen; /* current lengths of ss[0..nseq-1] */
68 int64_t *salen; /* current lengths of sa[0..nseq-1] */
69 int64_t *pplen; /* current lengths of pp[0..nseq-1] */
70 int64_t *ogc_len; /* current lengths of unparsed gc[0..ngc-1] */
71 int64_t **ogr_len; /* current lengths of unparsed gr[0..ngr-1][0..nseq-1] */
72 int salloc; /* # of sqnames currently allocated for (synced to msa->sqalloc) */
73 } ESL_STOCKHOLM_PARSEDATA;
74
75 static ESL_STOCKHOLM_PARSEDATA *stockholm_parsedata_Create(ESL_MSA *msa);
76 static int stockholm_parsedata_ExpandSeq (ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa);
77 static int stockholm_parsedata_ExpandBlock(ESL_STOCKHOLM_PARSEDATA *pd);
78 static void stockholm_parsedata_Destroy (ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa);
79
80 static int stockholm_parse_gf(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
81 static int stockholm_parse_gs(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
82 static int stockholm_parse_gc(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
83 static int stockholm_parse_gr(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
84 static int stockholm_parse_sq(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n);
85 static int stockholm_parse_comment(ESL_MSA *msa, char *p, esl_pos_t n);
86
87 static int stockholm_get_seqidx (ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *name, esl_pos_t n, int *ret_idx);
88 static int stockholm_get_gr_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag, esl_pos_t taglen, int *ret_tagidx);
89 static int stockholm_get_gc_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag, esl_pos_t taglen, int *ret_tagidx);
90
91 static int stockholm_write(FILE *fp, const ESL_MSA *msa, int64_t cpl);
92
93
94 /*****************************************************************
95 *# 1. API for reading/writing Stockholm input.
96 *****************************************************************/
97
98 /* Function: esl_msafile_stockholm_SetInmap()
99 * Synopsis: Configure the input map for Stockholm format.
100 *
101 * Purpose: Configure <afp->inmap> for Stockholm format.
102 *
103 * Text mode accepts any <isgraph()> character.
104 * Digital mode enforces the usual Easel alphabets.
105 *
106 * No characters may be ignored in the input. We cannot
107 * skip whitespace in the inmap, because we'd misalign
108 * relative to the text-mode annotation lines (GR, GC),
109 * where we don't do mapped input.)
110 */
111 int
esl_msafile_stockholm_SetInmap(ESL_MSAFILE * afp)112 esl_msafile_stockholm_SetInmap(ESL_MSAFILE *afp)
113 {
114 int sym;
115
116 if (afp->abc)
117 {
118 for (sym = 0; sym < 128; sym++)
119 afp->inmap[sym] = afp->abc->inmap[sym];
120 afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
121 }
122
123 if (! afp->abc)
124 {
125 for (sym = 1; sym < 128; sym++)
126 afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
127 afp->inmap[0] = '?';
128 }
129 return eslOK;
130 }
131
132
133 /* Function: esl_msafile_stockholm_GuessAlphabet()
134 * Synopsis: Guess the alphabet of an open Stockholm MSA file.
135 *
136 * Purpose: Guess the alphabet of the sequences in open
137 * Stockholm-format MSA file <afp>.
138 *
139 * On a normal return, <*ret_type> is set to <eslDNA>,
140 * <eslRNA>, or <eslAMINO>, and <afp> is reset to its
141 * original position.
142 *
143 * Args: afp - open Stockholm-format MSA file
144 * ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
145 *
146 * Returns: <eslOK> on success.
147 * <eslENOALPHABET> if alphabet type can't be determined.
148 * In either case, <afp> is rewound to the position it
149 * started at.
150 */
151 int
esl_msafile_stockholm_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)152 esl_msafile_stockholm_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
153 {
154 int alphatype = eslUNKNOWN;
155 esl_pos_t anchor = -1;
156 int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
157 int nsteps = 3;
158 int step = 0;
159 int nres = 0;
160 int x;
161 int64_t ct[26];
162 char *p, *tok;
163 esl_pos_t n, toklen, pos;
164 int status;
165
166 for (x = 0; x < 26; x++) ct[x] = 0;
167
168 anchor = esl_buffer_GetOffset(afp->bf);
169 if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
170
171 while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
172 {
173 if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK || *tok == '#') continue; /* blank lines, annotation, comments */
174 /* p now points to the rest of the sequence line */
175
176 /* count characters into ct[] array */
177 for (pos = 0; pos < n; pos++)
178 if (isalpha(p[pos])) {
179 x = toupper(p[pos]) - 'A';
180 ct[x]++;
181 nres++;
182 }
183
184 /* try to stop early, checking after 500, 5000, and 50000 residues: */
185 if (step < nsteps && nres > threshold[step]) {
186 if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
187 step++;
188 }
189 }
190 if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
191 status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
192
193 DONE:
194 esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
195 esl_buffer_RaiseAnchor(afp->bf, anchor);
196 *ret_type = alphatype;
197 return status;
198
199 ERROR:
200 if (anchor != -1) {
201 esl_buffer_SetOffset(afp->bf, anchor);
202 esl_buffer_RaiseAnchor(afp->bf, anchor);
203 }
204 *ret_type = eslUNKNOWN;
205 return status;
206 }
207
208
209 /* Function: esl_msafile_stockholm_Read()
210 * Synopsis: Read an alignment in Stockholm format.
211 *
212 * Purpose: Read an MSA from open <ESL_MSAFILE> <afp>,
213 * parsing for Stockholm format. Create a new
214 * MSA, and return it by reference through
215 * <*ret_msa>. Caller is responsible for freeing
216 * this <ESL_MSA>.
217 *
218 * Args: <afp> - open <ESL_MSAFILE> to read from
219 * <ret_msa> - RETURN: newly parsed, created <ESL_MSA>
220 *
221 * Returns: <eslOK> on success. <*ret_msa> contains the newly
222 * allocated MSA. <afp> is poised at start of next
223 * alignment record, or is at EOF.
224 *
225 * <eslEOF> if no (more) alignment data are found in
226 * <afp>, and <afp> is returned at EOF.
227 *
228 * <eslEFORMAT> on a parse error. <*ret_msa> is set to
229 * <NULL>. <afp> contains information sufficient for
230 * constructing useful diagnostic output:
231 * | <afp->errmsg> | user-directed error message |
232 * | <afp->linenumber> | line # where error was detected |
233 * | <afp->line> | offending line (not NUL-term) |
234 * | <afp->n> | length of offending line |
235 * | <afp->bf->filename> | name of the file |
236 * and <afp> is poised at the start of the following line,
237 * so (in principle) the caller could try to resume
238 * parsing.
239 *
240 * Throws: <eslEMEM> on allocation error.
241 * <eslESYS> if a system call fails, such as fread().
242 * <*ret_msa> is returned <NULL>.
243 */
244 int
esl_msafile_stockholm_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)245 esl_msafile_stockholm_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
246 {
247 ESL_MSA *msa = NULL;
248 ESL_STOCKHOLM_PARSEDATA *pd = NULL;
249 char *p;
250 esl_pos_t n;
251 int idx;
252 int status;
253
254 ESL_DASSERT1( (afp->format == eslMSAFILE_PFAM || afp->format == eslMSAFILE_STOCKHOLM) );
255
256 afp->errmsg[0] = '\0';
257
258 /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */
259 if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
260 if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
261 if ( (pd = stockholm_parsedata_Create(msa)) == NULL) { status = eslEMEM; goto ERROR; }
262
263 /* Skip leading blank lines in file. EOF here is a normal EOF return. */
264 do {
265 if ( ( status = esl_msafile_GetLine(afp, &p, &n)) != eslOK) goto ERROR; /* eslEOF is OK here - end of input (eslEOF) [eslEMEM|eslESYS] */
266 } while (esl_memspn(afp->line, afp->n, " \t") == afp->n || /* skip blank lines */
267 (esl_memstrpfx(afp->line, afp->n, "#") /* and skip comment lines */
268 && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM"))); /* but stop on Stockholm header */
269
270 /* Check for the magic Stockholm header */
271 if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing Stockholm header");
272
273 while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK) /* (eslEOF) [eslEMEM|eslESYS] */
274 {
275 while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
276
277 if (!n || esl_memstrpfx(p, n, "//"))
278 { /* blank lines and the Stockholm end-of-record // trigger end-of-block logic */
279 if (pd->in_block) {
280 if (pd->nblock) { if (pd->nseq_b != pd->nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "number of seqs in block did not match number in earlier block(s)"); }
281 else { if (pd->nseq_b < pd->nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "number of seqs in block did not match number annotated by #=GS lines"); };
282 if (pd->nblock) { if (pd->bi != pd->npb) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of lines in alignment block"); }
283
284 pd->nseq = msa->nseq = pd->nseq_b;
285 pd->alen += pd->alen_b;
286 pd->in_block = FALSE;
287 pd->npb = pd->bi;
288 pd->bi = 0;
289 pd->si = 0;
290 pd->nblock += 1;
291 pd->nseq_b = 0;
292 pd->alen_b = 0;
293 }
294 if (esl_memstrpfx(p, n, "//")) break; /* Stockholm end-of-record marker */
295 else continue; /* else, on to next block */
296 }
297
298 if (*p == '#')
299 {
300 if (esl_memstrpfx(p, n, "#=GF")) { if ((status = stockholm_parse_gf (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
301 else if (esl_memstrpfx(p, n, "#=GS")) { if ((status = stockholm_parse_gs (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
302 else if (esl_memstrpfx(p, n, "#=GC")) { if ((status = stockholm_parse_gc (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
303 else if (esl_memstrpfx(p, n, "#=GR")) { if ((status = stockholm_parse_gr (afp, pd, msa, p, n)) != eslOK) goto ERROR; }
304 else if (esl_memstrcmp(p, n, "# STOCKHOLM 1.0")) ESL_XFAIL(eslEFORMAT, afp->errmsg, "two # STOCKHOLM 1.0 headers in a row?");
305 else { if ((status = stockholm_parse_comment( msa, p, n)) != eslOK) goto ERROR; }
306 }
307 else if ( (status = stockholm_parse_sq (afp, pd, msa, p, n)) != eslOK) goto ERROR;
308 }
309 if (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing // terminator after MSA");
310 else if (status != eslOK) goto ERROR;
311 if (pd->nblock == 0) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no alignment data followed Stockholm header");
312
313 msa->alen = pd->alen;
314
315 /* Stockholm file can set weights. If eslMSA_HASWGTS flag is up, at least one was set: then all must be. */
316 if (msa->flags & eslMSA_HASWGTS)
317 {
318 for (idx = 0; idx < msa->nseq; idx++)
319 if (msa->wgt[idx] == -1.0) ESL_XFAIL(eslEFORMAT, afp->errmsg, "stockholm record ended without a weight for %s", msa->sqname[idx]);
320 }
321 else if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
322
323 stockholm_parsedata_Destroy(pd, msa);
324 *ret_msa = msa;
325 return eslOK;
326
327 ERROR:
328 if (pd) stockholm_parsedata_Destroy(pd, msa);
329 if (msa) esl_msa_Destroy(msa);
330 *ret_msa = NULL;
331 return status;
332 }
333
334
335 /* Function: esl_msafile_stockholm_Write()
336 * Synopsis: Write a Stockholm format alignment to a stream.
337 *
338 * Purpose: Write alignment <msa> to output stream <fp>, in Stockholm
339 * format. <fmt> may either be <eslMSAFILE_STOCKHOLM> or
340 * <eslMSAFILE_PFAM>. <eslMSAFILE_PFAM> puts the alignment
341 * into a single block, one alignment line per sequence.
342 * <eslMSAFILE_STOCKHOLM> is a multiple block format, with
343 * a width of 200 aligned residues per line.
344 *
345 * Args: fp - open output stream, writable
346 * msa - alignment to write
347 * fmt - eslMSAFILE_STOCKHOLM | eslMSAFILE_PFAM
348 *
349 * Returns: <eslOK> on success.
350 *
351 * Throws: <eslEMEM> on allocation error.
352 * <eslEWRITE> on any system write error, such as filled disk.
353 */
354 int
esl_msafile_stockholm_Write(FILE * fp,const ESL_MSA * msa,int fmt)355 esl_msafile_stockholm_Write(FILE *fp, const ESL_MSA *msa, int fmt)
356 {
357 switch (fmt) {
358 case eslMSAFILE_PFAM: return stockholm_write(fp, msa, msa->alen);
359 case eslMSAFILE_STOCKHOLM: return stockholm_write(fp, msa, 200);
360 }
361 return eslEINCONCEIVABLE;
362 }
363 /*--------------- end, api for stockholm i/o --------------------*/
364
365
366 /*****************************************************************
367 * 2. Internal: ESL_STOCKHOLM_PARSEDATA auxiliary structure
368 *****************************************************************/
369
370 /* The auxiliary parse data is sufficient to validate each line as we
371 * see it. Our design requires that we immediately report any errors
372 * and the line number they occur on. We do not want to detect errors
373 * in some later validation step, after we've lost track of original
374 * line numbers of the input.
375 */
376
377 static ESL_STOCKHOLM_PARSEDATA *
stockholm_parsedata_Create(ESL_MSA * msa)378 stockholm_parsedata_Create(ESL_MSA *msa)
379 {
380 ESL_STOCKHOLM_PARSEDATA *pd = NULL;
381 int z;
382 int status;
383
384 ESL_ALLOC(pd, sizeof(ESL_STOCKHOLM_PARSEDATA));
385 pd->nseq = 0;
386 pd->alen = 0;
387
388 pd->in_block = FALSE;
389 pd->blinetype = NULL;
390 pd->bidx = NULL;
391 pd->npb = 0;
392 pd->bi = 0;
393 pd->si = 0;
394 pd->balloc = 0;
395
396 pd->nblock = 0;
397 pd->nseq_b = 0;
398 pd->alen_b = 0;
399
400 pd->ssconslen = 0;
401 pd->saconslen = 0;
402 pd->ppconslen = 0;
403 pd->rflen = 0;
404 pd->mmasklen = 0;
405 pd->sqlen = NULL;
406 pd->sslen = NULL;
407 pd->salen = NULL;
408 pd->pplen = NULL;
409 pd->ogc_len = NULL;
410 pd->ogr_len = NULL;
411 pd->salloc = 0;
412
413 ESL_ALLOC(pd->blinetype, sizeof(char) * 16);
414 ESL_ALLOC(pd->bidx, sizeof(int) * 16);
415 pd->balloc = 16;
416
417 ESL_ALLOC(pd->sqlen, sizeof(int64_t) * msa->sqalloc);
418 for (z = 0; z < msa->sqalloc; z++)
419 pd->sqlen[z] = 0;
420 pd->salloc = msa->sqalloc;
421 return pd;
422
423 ERROR:
424 stockholm_parsedata_Destroy(pd, msa);
425 return NULL;
426 }
427
428 static int
stockholm_parsedata_ExpandSeq(ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa)429 stockholm_parsedata_ExpandSeq(ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa)
430 {
431 int tagidx;
432 int z;
433 int status;
434
435 ESL_REALLOC(pd->sqlen, sizeof(int64_t) * msa->sqalloc);
436 for (z = pd->salloc; z < msa->sqalloc; z++) pd->sqlen[z] = 0;
437
438 if (pd->sslen) {
439 ESL_REALLOC(pd->sslen, sizeof(int64_t) * msa->sqalloc);
440 for (z = pd->salloc; z < msa->sqalloc; z++) pd->sslen[z] = 0;
441 }
442
443 if (pd->salen) {
444 ESL_REALLOC(pd->salen, sizeof(int64_t) * msa->sqalloc);
445 for (z = pd->salloc; z < msa->sqalloc; z++) pd->salen[z] = 0;
446 }
447
448 if (pd->pplen) {
449 ESL_REALLOC(pd->pplen, sizeof(int64_t) * msa->sqalloc);
450 for (z = pd->salloc; z < msa->sqalloc; z++) pd->pplen[z] = 0;
451 }
452
453 /* don't need to reallocate ogc_len here: it's [0..ngc-1], not by seq */
454
455 if (pd->ogr_len) {
456 for (tagidx = 0; tagidx < msa->ngr; tagidx++)
457 if (pd->ogr_len[tagidx]) {
458 ESL_REALLOC(pd->ogr_len[tagidx], sizeof(int64_t) * msa->sqalloc);
459 for (z = pd->salloc; z < msa->sqalloc; z++) pd->ogr_len[tagidx][z] = 0;
460 }
461 }
462
463 pd->salloc = msa->sqalloc;
464 return eslOK;
465
466 ERROR:
467 return status;
468 }
469
470
471 static int
stockholm_parsedata_ExpandBlock(ESL_STOCKHOLM_PARSEDATA * pd)472 stockholm_parsedata_ExpandBlock(ESL_STOCKHOLM_PARSEDATA *pd)
473 {
474 int status;
475
476 ESL_REALLOC(pd->blinetype, sizeof(char) * (pd->balloc * 2));
477 ESL_REALLOC(pd->bidx, sizeof(int) * (pd->balloc * 2));
478 pd->balloc *= 2;
479 return eslOK;
480
481 ERROR:
482 return status;
483 }
484
485
486 static void
stockholm_parsedata_Destroy(ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa)487 stockholm_parsedata_Destroy(ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa)
488 {
489 int i;
490 if (! pd) return;
491
492 if (pd->blinetype) free(pd->blinetype);
493 if (pd->bidx) free(pd->bidx);
494
495 if (pd->sqlen) free(pd->sqlen);
496 if (pd->sslen) free(pd->sslen);
497 if (pd->salen) free(pd->salen);
498 if (pd->pplen) free(pd->pplen);
499 if (pd->ogc_len) free(pd->ogc_len);
500 if (pd->ogr_len) {
501 for (i = 0; i < msa->ngr; i++)
502 if (pd->ogr_len[i]) free(pd->ogr_len[i]);
503 free(pd->ogr_len);
504 }
505 free(pd);
506 return;
507 }
508 /*------------------ end, ESL_STOCKHOLM_PARSEDATA auxiliary structure -------------*/
509
510
511
512
513 /*****************************************************************
514 * 3. Internal: parsing Stockholm line types
515 *****************************************************************/
516
517 /* stockholm_parse_gf()
518 * Line format is:
519 * #=GF <tag> <text>
520 * recognized featurenames: { ID | AC | DE | AU | GA | NC | TC }
521 */
522 static int
stockholm_parse_gf(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)523 stockholm_parse_gf(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
524 {
525 char *gf, *tag, *tok;
526 esl_pos_t gflen, taglen, toklen;
527 int status;
528
529 if ( (status = esl_memtok(&p, &n, " \t", &gf, &gflen)) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
530 if ( (status = esl_memtok(&p, &n, " \t", &tag, &taglen)) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GF line is missing <tag>, annotation");
531 if (! esl_memstrcmp(gf, gflen, "#=GF")) ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GF line?");
532
533 if (esl_memstrcmp(tag, taglen, "ID"))
534 {
535 if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "No name found on #=GF ID line");
536 if (n) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GF ID line should have only one name (no whitespace allowed)");
537 if ( (status = esl_msa_SetName (msa, tok, toklen)) != eslOK) return status; /* [eslEMEM] */
538 }
539 else if (esl_memstrcmp(tag, taglen, "AC"))
540 {
541 if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "No accession found on #=GF AC line");
542 if (n) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GF AC line should have only one accession (no whitespace allowed)");
543 if ((status = esl_msa_SetAccession(msa, tok, toklen)) != eslOK) return status; /* [eslEMEM] */
544 }
545 else if (esl_memstrcmp(tag, taglen, "DE"))
546 {
547 if ((status = esl_msa_SetDesc (msa, p, n)) != eslOK) return status; /* [eslEMEM] */
548 }
549 else if (esl_memstrcmp(tag, taglen, "AU"))
550 {
551 if ((status = esl_msa_SetAuthor (msa, p, n)) != eslOK) return status; /* [eslEMEM] */
552 }
553 else if (esl_memstrcmp(tag, taglen, "GA"))
554 {
555 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
556 if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for GA1 value on #=GF GA line");
557 if ( esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_GA1])) != eslOK) return status; /* [eslEMEM] */
558 msa->cutset[eslMSA_GA1] = TRUE;
559 } else ESL_FAIL(eslEFORMAT, afp->errmsg, "No GA threshold value found on #=GF GA line");
560 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
561 if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for GA2 value on #=GF GA line");
562 if ( esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_GA2])) != eslOK) return status; /* [eslEMEM] */
563 msa->cutset[eslMSA_GA2] = TRUE;
564 }
565 }
566 else if (esl_memstrcmp(tag, taglen, "NC"))
567 {
568 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
569 if ( ! esl_memstrcmp(tok, toklen, "undefined")) /* workaround for a problem in Rfam10. ignore NC's that are set to "undefined". */
570 {
571 if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for NC1 value on #=GF NC line");
572 if ( esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_NC1])) != eslOK) return status; /* [eslEMEM] */
573 msa->cutset[eslMSA_NC1] = TRUE;
574 }
575 } else ESL_FAIL(eslEFORMAT, afp->errmsg, "No NC threshold value found on #=GF NC line");
576 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
577 if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for NC2 value on #=GF NC line");
578 if ( esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_NC2])) != eslOK) return status; /* [eslEMEM] */
579 msa->cutset[eslMSA_NC2] = TRUE;
580 }
581 }
582 else if (esl_memstrcmp(tag, taglen, "TC"))
583 {
584 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
585 if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for TC1 value on #=GF TC line");
586 if ( esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_TC1])) != eslOK) return status; /* [eslEMEM] */
587 msa->cutset[eslMSA_TC1] = TRUE;
588 } else ESL_FAIL(eslEFORMAT, afp->errmsg, "No TC threshold value found on #=GF TC line");
589 if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) == eslOK) {
590 if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "Expected a real number for TC2 value on #=GF TC line");
591 if ( esl_memtof(tok, toklen, &(msa->cutoff[eslMSA_TC2])) != eslOK) return status; /* [eslEMEM] */
592 msa->cutset[eslMSA_TC2] = TRUE;
593 }
594 }
595 else
596 {
597 if ((status = esl_msa_AddGF(msa, tag, taglen, p, n)) != eslOK) return status;
598 }
599
600 return eslOK;
601 }
602
603
604 /* stockholm_parse_gs()
605 * Format:
606 * #=GS <seqname> <tag> <text>
607 * recognized featurenames: { WT | AC | DE }
608 */
609 static int
stockholm_parse_gs(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)610 stockholm_parse_gs(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
611 {
612 char *gs, *seqname, *tag, *tok;
613 esl_pos_t gslen, seqnamelen, taglen, toklen;
614 int seqidx;
615 int status;
616
617 if (esl_memtok(&p, &n, " \t", &gs, &gslen) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
618 if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS line missing <seqname>, <tag>, annotation");
619 if (esl_memtok(&p, &n, " \t", &tag, &taglen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS line missing <tag>, annotation");
620 if (! esl_memstrcmp(gs, gslen, "#=GS")) ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GS line?");
621
622 seqidx = pd->si;
623 if (seqidx == pd->nseq || ! esl_memstrcmp(seqname, seqnamelen, msa->sqname[seqidx])) {
624 stockholm_get_seqidx(msa, pd, seqname, seqnamelen, &seqidx);
625 }
626
627 if (esl_memstrcmp(tag, taglen, "WT"))
628 {
629 if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "no weight value found on #=GS <seqname> WT line");
630 if (msa->wgt[seqidx] != -1.0) ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence has more than one #=GS <seqname> WT line");
631 if (n) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS <seqname> WT line should have only one field, the weight");
632 if (! esl_mem_IsReal(tok, toklen)) ESL_FAIL(eslEFORMAT, afp->errmsg, "value on #=GS <seqname> WT line isn't a real number");
633 if ((status = esl_memtod(tok, toklen, &(msa->wgt[seqidx]))) != eslOK) return status; /* eslEMEM */
634 msa->flags |= eslMSA_HASWGTS;
635 }
636 else if (esl_memstrcmp(tag, taglen, "AC"))
637 {
638 if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "no accession found on #=GS <seqname> AC line");
639 if (msa->sqacc && msa->sqacc[seqidx]) ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence has more than one #=GS <seqname> AC accession line");
640 if (n) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GS <seqname> AC line should have only one field, the accession");
641 if ((status = esl_msa_SetSeqAccession(msa, seqidx, tok, toklen)) != eslOK) return status; /* eslEMEM */
642 }
643 else if (esl_memstrcmp(tag, taglen, "DE"))
644 {
645 if (msa->sqdesc && msa->sqdesc[seqidx]) ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence has more than one #=GS <seqname> DE accession line");
646 if ((status = esl_msa_SetSeqDescription(msa, seqidx, p, n)) != eslOK) return status; /* eslEMEM */
647 }
648 else
649 {
650 if ((status = esl_msa_AddGS(msa, tag, taglen, seqidx, p, n)) != eslOK) return status;
651 }
652
653 pd->si = seqidx+1; /* set guess for next sequence index */
654 return eslOK;
655 }
656
657 /* stockholm_parse_gc()
658 * Format of line is:
659 * #=GC <tag> <aligned text>
660 * recognized featurenames: { SS_cons | SA_cons | PP_cons | RF }
661 */
662 static int
stockholm_parse_gc(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)663 stockholm_parse_gc(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
664 {
665 char *gc, *tag;
666 esl_pos_t gclen, taglen;
667 int tagidx;
668 int status;
669
670 if (esl_memtok(&p, &n, " \t", &gc, &gclen) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
671 if (esl_memtok(&p, &n, " \t", &tag, &taglen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GC line missing <tag>, annotation");
672 while (n && strchr(" \t", p[n-1])) n--; /* skip backwards from eol, to delimit aligned text without going through it */
673
674 if (! esl_memstrcmp(gc, gclen, "#=GC")) ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GC line?");
675 if (! n) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GC line missing annotation?");
676
677 if (pd->nblock) /* Subsequent blocks */
678 {
679 if (esl_memstrcmp(tag, taglen, "SS_cons")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_SSCONS) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC SS_cons; earlier block(s) in different order?"); }
680 else if (esl_memstrcmp(tag, taglen, "SA_cons")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_SACONS) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC SA_cons; earlier block(s) in different order?"); }
681 else if (esl_memstrcmp(tag, taglen, "PP_cons")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_PPCONS) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC PP_cons; earlier block(s) in different order?"); }
682 else if (esl_memstrcmp(tag, taglen, "RF")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_RF) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC RF; earlier block(s) in different order?"); }
683 else if (esl_memstrcmp(tag, taglen, "MM")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_MM) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC MM; earlier block(s) in different order?"); }
684 else if ( pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GC_OTHER) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GC line; earlier block(s) in different order?");
685 }
686 else /* First block */
687 {
688 if (pd->bi == pd->balloc && (status = stockholm_parsedata_ExpandBlock(pd)) != eslOK) return status;
689
690 if (esl_memstrcmp(tag, taglen, "SS_cons")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_SSCONS;
691 else if (esl_memstrcmp(tag, taglen, "SA_cons")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_SACONS;
692 else if (esl_memstrcmp(tag, taglen, "PP_cons")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_PPCONS;
693 else if (esl_memstrcmp(tag, taglen, "RF")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_RF;
694 else if (esl_memstrcmp(tag, taglen, "MM")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_MM;
695 else pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GC_OTHER;
696 pd->bidx[pd->bi] = -1;
697 }
698
699 if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_SSCONS)
700 {
701 if (pd->ssconslen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC SS_cons line in block");
702 if ( (status = esl_strcat(&(msa->ss_cons), pd->ssconslen, p, n)) != eslOK) return status; /* [eslEMEM] */
703 pd->ssconslen += n;
704 }
705 else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_SACONS)
706 {
707 if (pd->saconslen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC SA_cons line in block");
708 if ((status = esl_strcat(&(msa->sa_cons), pd->saconslen, p, n)) != eslOK) return status; /* [eslEMEM] */
709 pd->saconslen += n;
710 }
711 else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_PPCONS)
712 {
713 if (pd->ppconslen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC PP_cons line in block");
714 if ((status = esl_strcat(&(msa->pp_cons), pd->ppconslen, p, n)) != eslOK) return status; /* [eslEMEM] */
715 pd->ppconslen += n;
716 }
717 else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_RF)
718 {
719 if (pd->rflen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC RF line in block");
720 if ((status = esl_strcat(&(msa->rf), pd->rflen, p, n)) != eslOK) return status; /* [eslEMEM] */
721 pd->rflen += n;
722 }
723 else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GC_MM)
724 {
725 if (pd->mmasklen != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC MM line in block");
726 if ((status = esl_strcat(&(msa->mm), pd->mmasklen, p, n)) != eslOK) return status; /* [eslEMEM] */
727 pd->mmasklen += n;
728 }
729 else
730 {
731 if ((status = stockholm_get_gc_tagidx(msa, pd, tag, taglen, &tagidx)) != eslOK) return status;
732
733 if (pd->ogc_len[tagidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GC %.*s line in block", (int) taglen, tag);
734 if ((status = esl_strcat(&(msa->gc[tagidx]), pd->ogc_len[tagidx], p, n)) != eslOK) return status; /* [eslEMEM] */
735 pd->ogc_len[tagidx] += n;
736 }
737
738 if (pd->bi && n != pd->alen_b) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected # of aligned annotation in #=GC %.*s line", (int) taglen, tag);
739 pd->alen_b = n;
740 pd->in_block = TRUE;
741 pd->bi++;
742 return eslOK;
743 }
744
745 /* A GR line is
746 * #=GR <seqname> <featurename> <text>
747 * recognized featurenames: { SS | SA | PP }
748 *
749 */
750 static int
stockholm_parse_gr(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)751 stockholm_parse_gr(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
752 {
753 char *gr, *name, *tag;
754 esl_pos_t grlen, namelen, taglen;
755 int seqidx, tagidx;
756 int z;
757 int status;
758
759 if (esl_memtok(&p, &n, " \t", &gr, &grlen) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
760 if (esl_memtok(&p, &n, " \t", &name, &namelen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GR line missing <seqname>, <tag>, annotation");
761 if (esl_memtok(&p, &n, " \t", &tag, &taglen) != eslOK) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GR line missing <tag>, annotation");
762 while (n && strchr(" \t", p[n-1])) n--; /* skip backwards from eol, to delimit aligned text without going through it */
763
764 if (! esl_memstrcmp(gr, grlen, "#=GR")) ESL_FAIL(eslEFORMAT, afp->errmsg, "faux #=GR line?");
765 if (! n) ESL_FAIL(eslEFORMAT, afp->errmsg, "#=GR line missing annotation?");
766
767 /* Which seqidx is this? likely to be either pd->si-1 (#=GR following a seq) or
768 * pd->si (#=GR preceding a seq)
769 */
770 if (! pd->nblock) /* First block: we're setting bidx[], blinetype[] as we see them */
771 {
772 if (pd->si >= 1 && esl_memstrcmp(name, namelen, msa->sqname[pd->si-1])) seqidx = pd->si-1;
773 else if (pd->si < pd->nseq && esl_memstrcmp(name, namelen, msa->sqname[pd->si])) seqidx = pd->si;
774 else if ((status = stockholm_get_seqidx(msa, pd, name, namelen, &seqidx)) != eslOK) goto ERROR;
775
776 if (pd->bi == pd->balloc && (status = stockholm_parsedata_ExpandBlock(pd)) != eslOK) return status;
777
778 if (esl_memstrcmp(tag, taglen, "SS")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_SS;
779 else if (esl_memstrcmp(tag, taglen, "SA")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_SA;
780 else if (esl_memstrcmp(tag, taglen, "PP")) pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_PP;
781 else pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_GR_OTHER;
782 pd->bidx[pd->bi] = seqidx;
783 }
784 else
785 { /* subsequent block(s) */
786 if (pd->bi >= pd->npb) ESL_FAIL(eslEFORMAT, afp->errmsg, "more lines than expected in this alignment block; earlier blocks had fewer");
787
788 if (esl_memstrcmp(tag, taglen, "SS")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_SS) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR <seqname> SS; earlier block(s) in different order?"); }
789 else if (esl_memstrcmp(tag, taglen, "SA")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_SA) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR <seqname> SA; earlier block(s) in different order?"); }
790 else if (esl_memstrcmp(tag, taglen, "PP")) { if (pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_PP) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR <seqname> PP; earlier block(s) in different order?"); }
791 else if ( pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_GR_OTHER) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected #=GR line; earlier block(s) in different order?");
792
793 seqidx = pd->bidx[pd->bi];
794 if (! esl_memstrcmp(name, namelen, msa->sqname[seqidx])) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected seqname %.*s; expected %s from prev blocks", (int) namelen, name, msa->sqname[seqidx]);
795 }
796
797 /* Append the annotation where it belongs */
798 if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GR_SS)
799 {
800 if (! msa->ss) {
801 ESL_ALLOC(msa->ss, sizeof(char *) * msa->sqalloc);
802 ESL_ALLOC(pd->sslen, sizeof(int64_t) * msa->sqalloc);
803 for (z = 0; z < msa->sqalloc; z++) { msa->ss[z] = NULL; pd->sslen[z] = 0; }
804 }
805 if (pd->sslen[seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s SS line in block", (int) namelen, name);
806 if (( status = esl_strcat(&(msa->ss[seqidx]), pd->sslen[seqidx], p, n)) != eslOK) return status; /* [eslEMEM] */
807 pd->sslen[seqidx] += n;
808 }
809 else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GR_PP)
810 {
811 if (! msa->pp) {
812 ESL_ALLOC(msa->pp, sizeof(char *) * msa->sqalloc);
813 ESL_ALLOC(pd->pplen, sizeof(int64_t) * msa->sqalloc);
814 for (z = 0; z < msa->sqalloc; z++) { msa->pp[z] = NULL; pd->pplen[z] = 0; }
815 }
816 if (pd->pplen[seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s PP line in block", (int) namelen, name);
817 if ((status = esl_strcat(&(msa->pp[seqidx]), pd->pplen[seqidx], p, n)) != eslOK) return status; /* [eslEMEM] */
818 pd->pplen[seqidx] += n;
819 }
820 else if (pd->blinetype[pd->bi] == eslSTOCKHOLM_LINE_GR_SA)
821 {
822 if (! msa->sa) {
823 ESL_ALLOC(msa->sa, sizeof(char *) * msa->sqalloc);
824 ESL_ALLOC(pd->salen, sizeof(int64_t) * msa->sqalloc);
825 for (z = 0; z < msa->sqalloc; z++) { msa->sa[z] = NULL; pd->salen[z] = 0; }
826 }
827 if (pd->salen[seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s SA line in block", (int) namelen, name);
828 if ((status = esl_strcat(&(msa->sa[seqidx]), pd->salen[seqidx], p, n)) != eslOK) return status;
829 pd->salen[seqidx] += n;
830 }
831 else
832 {
833 if ((status = stockholm_get_gr_tagidx(msa, pd, tag, taglen, &tagidx)) != eslOK) return status; /* [eslEMEM] */
834
835 if (pd->ogr_len[tagidx][seqidx] != pd->alen) ESL_FAIL(eslEFORMAT, afp->errmsg, "more than one #=GR %.*s %.*s line in block", (int) namelen, name, (int) taglen, tag);
836 if ((status = esl_strcat(&(msa->gr[tagidx][seqidx]), pd->ogr_len[tagidx][seqidx], p, n)) != eslOK) return status;
837 pd->ogr_len[tagidx][seqidx] += n;
838 }
839
840 if (pd->bi && n != pd->alen_b) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected # of aligned annotation in #=GR %.*s %.*s line", (int) namelen, name, (int) taglen, tag);
841 pd->alen_b = n;
842 pd->in_block = TRUE;
843 pd->bi++;
844 return eslOK;
845
846 ERROR:
847 return status;
848 }
849
850
851 /* stockholm_parse_sq():
852 * Format of line is:
853 * <seqname> <aligned text>
854 */
855 static int
stockholm_parse_sq(ESL_MSAFILE * afp,ESL_STOCKHOLM_PARSEDATA * pd,ESL_MSA * msa,char * p,esl_pos_t n)856 stockholm_parse_sq(ESL_MSAFILE *afp, ESL_STOCKHOLM_PARSEDATA *pd, ESL_MSA *msa, char *p, esl_pos_t n)
857 {
858 char *seqname;
859 esl_pos_t seqnamelen;
860 int seqidx = pd->si;
861 int status;
862
863 if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) ESL_EXCEPTION(eslEINCONCEIVABLE, "EOL can't happen here.");
864 while (n && strchr(" \t", p[n-1])) n--; /* skip backwards from eol, to delimit aligned text without going through it */
865
866 if (! n) ESL_FAIL(eslEFORMAT, afp->errmsg, "sequence line with no sequence?");
867
868 /* Which seqidx is this?
869 * In first block:
870 * 1. If #=GS lines set sqname[] completely, then it's pd->si.
871 * 2. If #=GS lines set sqname[] partially or out of order, then name may be in the keyhash.
872 * 3. If we haven't seen name before, then we'll add it: seqidx = pd->nseq, add name to keyhash, possibly reallocate.
873 * In subsequent blocks, use recorded indices and linetypes:
874 * 4. seqidx = saved bidx[]; should be expecting a SQ line; name should match expected name.
875 */
876 if (! pd->nblock) /* First block: we're setting npb, bidx[], and blinetype[] as we see them */
877 {
878 if (pd->si < pd->nseq && esl_memstrcmp(seqname, seqnamelen, msa->sqname[seqidx])) seqidx = pd->si;
879 else if ((status = stockholm_get_seqidx(msa, pd, seqname, seqnamelen, &seqidx)) != eslOK) return status; /* [eslEMEM] */
880
881 if (pd->bi == pd->balloc && (status = stockholm_parsedata_ExpandBlock(pd)) != eslOK) return status;
882
883 pd->blinetype[pd->bi] = eslSTOCKHOLM_LINE_SQ;
884 pd->bidx[pd->bi] = seqidx;
885 }
886 else
887 { /* subsequent block(s) */
888 if (pd->bi >= pd->npb) ESL_FAIL(eslEFORMAT, afp->errmsg, "more lines than expected; earlier blocks had fewer");
889 if ( pd->blinetype[pd->bi] != eslSTOCKHOLM_LINE_SQ) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected seq line; earlier block(s) in different order?");
890 seqidx = pd->bidx[pd->bi];
891
892 if (! esl_memstrcmp(seqname, seqnamelen, msa->sqname[seqidx])) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected seq name %.*s; expected %s from prev block order", (int) seqnamelen, seqname, msa->sqname[seqidx]);
893 }
894
895 if ( pd->bi > 0 && pd->sqlen[seqidx] == pd->alen + pd->alen_b) ESL_FAIL(eslEFORMAT, afp->errmsg, "duplicate seq name %.*s", (int) seqnamelen, seqname);
896
897 if ( afp->abc ) {
898 status = esl_abc_dsqcat(afp->inmap, &(msa->ax[seqidx]), &(pd->sqlen[seqidx]), p, n);
899 if (status == eslEINVAL) ESL_FAIL(eslEFORMAT, afp->errmsg, "invalid sequence character(s) on line");
900 else if (status != eslOK) return status;
901 }
902
903 if (! afp->abc) {
904 status = esl_strmapcat (afp->inmap, &(msa->aseq[seqidx]), &(pd->sqlen[seqidx]), p, n);
905 if (status == eslEINVAL) ESL_FAIL(eslEFORMAT, afp->errmsg, "invalid sequence character(s) on line");
906 else if (status != eslOK) return status;
907 }
908
909 if (pd->bi && n != pd->alen_b) ESL_FAIL(eslEFORMAT, afp->errmsg, "unexpected number of aligned residues parsed on line");
910 if (pd->sqlen[seqidx] - pd->alen != n) ESL_EXCEPTION(eslEINCONCEIVABLE, "implementation assumes that no symbols are ignored in inmap; else GR, GC text annotations are messed up");
911 pd->alen_b = n;
912 pd->in_block = TRUE;
913 pd->nseq_b++;
914 pd->bi++;
915 pd->si = seqidx+1;
916 return eslOK;
917 }
918
919
920
921 static int
stockholm_parse_comment(ESL_MSA * msa,char * p,esl_pos_t n)922 stockholm_parse_comment(ESL_MSA *msa, char *p, esl_pos_t n)
923 {
924 if (n && *p == '#') { p++; n--; }
925 while (n && isspace(*p)) { p++; n--; }
926
927 return esl_msa_AddComment(msa, p, n);
928 }
929 /*------------- end, parsing Stockholm line types ---------------*/
930
931
932
933 /*****************************************************************
934 * 4. Internal: looking up seq, tag indices
935 *****************************************************************/
936
937
938 /* stockholm_get_seqidx()
939 *
940 * Find the index of a given sequence <name>,<n> in a growing <msa>
941 * with associated parse data <pdat>. If caller has a good guess (for
942 * instance, the sequences are coming in a previously seen order in a
943 * block of seqs or annotation), the caller can pass this information
944 * in <guess>, or -1 if it has no guess.
945 *
946 * If the name does not already exist in the MSA, then it
947 * is assumed to be a new sequence name that we need to store.
948 * seqidx is set to pdat->nseq, the MSA is Expand()'ed if necessary
949 * to make room, the name is stored in msa->sqname[pdat->nseq],
950 * and pdat->nseq is incremented.
951 *
952 * Returns: <eslOK> on success, and the seqidx is
953 * passed back via <ret_idx>. If <name> is new
954 * in the <msa>, the <name> is stored and the <msa>
955 * may be internally reallocated if needed.
956 *
957 * Throws: <eslEMEM> on allocation failure
958 * <eslEINVAL> if we try to add a name to a non-growable MSA.
959 * <eslEINCONCEIVABLE> on internal coding errors
960 */
961 static int
stockholm_get_seqidx(ESL_MSA * msa,ESL_STOCKHOLM_PARSEDATA * pd,char * name,esl_pos_t n,int * ret_idx)962 stockholm_get_seqidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *name, esl_pos_t n, int *ret_idx)
963 {
964 int seqidx;
965 int status;
966
967 status = esl_keyhash_Store(msa->index, name, n, &seqidx);
968 if (status == eslEDUP) { *ret_idx = seqidx; return eslOK; }
969 if (status != eslOK) goto ERROR;
970
971 /* if we get here, this is a new name we're adding */
972 if (seqidx >= msa->sqalloc) {
973 if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
974 if ( (status = stockholm_parsedata_ExpandSeq(pd, msa)) != eslOK) goto ERROR;
975 }
976
977 if ( (status = esl_msa_SetSeqName(msa, seqidx, name, n)) != eslOK) goto ERROR;
978 pd->nseq++;
979 msa->nseq = pd->nseq; /* pd->nseq and msa->nseq must stay in lockstep */
980
981 *ret_idx = seqidx;
982 return eslOK;
983
984 ERROR:
985 *ret_idx = -1;
986 return status;
987 }
988
989 static int
stockholm_get_gr_tagidx(ESL_MSA * msa,ESL_STOCKHOLM_PARSEDATA * pd,char * tag,esl_pos_t taglen,int * ret_tagidx)990 stockholm_get_gr_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag, esl_pos_t taglen, int *ret_tagidx)
991 {
992 int tagidx;
993 int z;
994 int status;
995
996 /* Find the tag, if we have it; else, add it, at tagidx = msa->ngr */
997 if (!msa->gr_idx && (msa->gr_idx = esl_keyhash_CreateCustom(8,8,128)) == NULL) { status = eslEMEM; goto ERROR; }
998 status = esl_keyhash_Store(msa->gr_idx, tag, taglen, &tagidx);
999 if (status == eslEDUP) { *ret_tagidx = tagidx; return eslOK; }
1000 else if (status != eslOK) goto ERROR;
1001
1002 /* if we get here, this is a new tag we're adding. */
1003 ESL_REALLOC(msa->gr_tag, sizeof(char *) * (msa->ngr+1)); /* +1, we're allocated one new tag at a time, as needed */
1004 ESL_REALLOC(msa->gr, sizeof(char **) * (msa->ngr+1));
1005 ESL_REALLOC(pd->ogr_len, sizeof(int64_t *) * (msa->ngr+1));
1006 msa->gr_tag[tagidx] = NULL;
1007 msa->gr[tagidx] = NULL;
1008 pd->ogr_len[tagidx] = NULL;
1009 ESL_ALLOC(msa->gr[tagidx], sizeof(char *) * msa->sqalloc);
1010 ESL_ALLOC(pd->ogr_len[tagidx], sizeof(int64_t) * msa->sqalloc);
1011 for (z = 0; z < msa->sqalloc; z++) {
1012 msa->gr[tagidx][z] = NULL;
1013 pd->ogr_len[tagidx][z] = 0;
1014 }
1015
1016 if ( (status = esl_memstrdup(tag, taglen, &(msa->gr_tag[tagidx]))) != eslOK) goto ERROR; /* eslEMEM */
1017 msa->ngr++;
1018 *ret_tagidx = tagidx;
1019 return eslOK;
1020
1021 ERROR:
1022 *ret_tagidx = -1;
1023 return status;
1024 }
1025
1026 static int
stockholm_get_gc_tagidx(ESL_MSA * msa,ESL_STOCKHOLM_PARSEDATA * pd,char * tag,esl_pos_t taglen,int * ret_tagidx)1027 stockholm_get_gc_tagidx(ESL_MSA *msa, ESL_STOCKHOLM_PARSEDATA *pd, char *tag, esl_pos_t taglen, int *ret_tagidx)
1028 {
1029 int tagidx;
1030 int status;
1031
1032 /* Find the tag, if we have it; else, add it, at tagidx = msa->ngc */
1033 if (!msa->gc_idx && (msa->gc_idx = esl_keyhash_CreateCustom(8,8,128)) == NULL) { status = eslEMEM; goto ERROR; }
1034 status = esl_keyhash_Store(msa->gc_idx, tag, taglen, &tagidx);
1035 if (status == eslEDUP) { *ret_tagidx = tagidx; return eslOK; }
1036 else if (status != eslOK) goto ERROR; /* eslEMEM */
1037
1038
1039 /* if we get here, this is a new tag we're adding. */
1040 ESL_REALLOC(msa->gc_tag, sizeof(char *) * (msa->ngc+1)); /* +1, we're allocated one new tag at a time, as needed */
1041 ESL_REALLOC(msa->gc, sizeof(char *) * (msa->ngc+1));
1042 ESL_REALLOC(pd->ogc_len, sizeof(int64_t) * (msa->ngc+1));
1043 msa->gc_tag[tagidx] = NULL;
1044 msa->gc[tagidx] = NULL;
1045 pd->ogc_len[tagidx] = 0;
1046
1047 if ( (status = esl_memstrdup(tag, taglen, &(msa->gc_tag[tagidx]))) != eslOK) return status; /* eslEMEM */
1048 msa->ngc++;
1049 *ret_tagidx = tagidx;
1050 return eslOK;
1051
1052 ERROR:
1053 *ret_tagidx = -1;
1054 return status;
1055 }
1056 /*------------ end, looking up seq, tag indices -----------------*/
1057
1058
1059 /*****************************************************************
1060 * 5. Internal: writing Stockholm/Pfam format
1061 *****************************************************************/
1062
1063 /* stockholm_write()
1064 * Returns: <eslOK> on success.
1065 * Throws: <eslEMEM> on allocation error.
1066 * <eslEWRITE> on any system write error.
1067 */
1068 static int
stockholm_write(FILE * fp,const ESL_MSA * msa,int64_t cpl)1069 stockholm_write(FILE *fp, const ESL_MSA *msa, int64_t cpl)
1070 {
1071 int i, j;
1072 int maxname; /* maximum name length */
1073 int maxgf; /* max #=GF tag length */
1074 int maxgc; /* max #=GC tag length */
1075 int maxgr; /* max #=GR tag length */
1076 int margin; /* total left margin width */
1077 int gslen; /* length of a #=GS tag */
1078 char *buf = NULL;
1079 int currpos;
1080 char *s, *tok;
1081 int acpl; /* actual number of character per line */
1082 int make_uniquenames = FALSE; /* TRUE if we force names to be unique */
1083 int uniqwidth = 0;
1084 int tmpnseq;
1085 int status;
1086
1087 /* Stockholm files require unique seqnames. Don't allow the writer
1088 * to create an invalid file.
1089 */
1090 status = esl_msa_CheckUniqueNames(msa);
1091 if (status == eslFAIL) {
1092 make_uniquenames = TRUE;
1093 for (tmpnseq = msa->nseq; tmpnseq; tmpnseq /= 10) uniqwidth++; /* how wide the uniqizing numbers need to be */
1094 uniqwidth++; /* uniqwidth includes the '|' */
1095 } else if (status != eslOK) goto ERROR;
1096
1097 /* Figure out how much space we need for name + markup
1098 * to keep the alignment in register.
1099 *
1100 * The left margin of an alignment block can be composed of:
1101 *
1102 * <seqname> max length: uniqwidth + maxname + 1
1103 * #=GC <gc_tag> max length: 4 + 1 + maxgc + 1
1104 * #=GR <seqname> <gr_tag> max length: 4 + 1 + uniqwidth + maxname + 1 + maxgr + 1
1105 *
1106 * <margin> is the max of these. It is the total length of the
1107 * left margin that we need to leave, inclusive of the last space.
1108 *
1109 * Then when we output, we do:
1110 * name: <leftmargin-uniqwidth-1>
1111 * gc: #=GC <leftmargin-6>
1112 * gr: #=GR <uniqwidth><maxname> <leftmargin-maxname-uniqwidth-7>
1113 *
1114 * because uniqwidth includes the |, field widths for uniqizing must be uniqwidth-1
1115 *
1116 * xref STL9/p17
1117 */
1118 maxname = esl_str_GetMaxWidth(msa->sqname, msa->nseq);
1119
1120 maxgf = esl_str_GetMaxWidth(msa->gf_tag, msa->ngf);
1121 if (maxgf < 2) maxgf = 2;
1122
1123 maxgc = esl_str_GetMaxWidth(msa->gc_tag, msa->ngc);
1124 if (msa->rf && maxgc < 2) maxgc = 2;
1125 if (msa->mm && maxgc < 2) maxgc = 2;
1126 if (msa->ss_cons && maxgc < 7) maxgc = 7;
1127 if (msa->sa_cons && maxgc < 7) maxgc = 7;
1128 if (msa->pp_cons && maxgc < 7) maxgc = 7;
1129
1130 maxgr = esl_str_GetMaxWidth(msa->gr_tag, msa->ngr);
1131 if (msa->ss && maxgr < 2) maxgr = 2;
1132 if (msa->sa && maxgr < 2) maxgr = 2;
1133 if (msa->pp && maxgr < 2) maxgr = 2;
1134
1135 margin = uniqwidth + maxname + 1;
1136 if (maxgc > 0 && maxgc+6 > margin) margin = maxgc+6;
1137 if (maxgr > 0 && uniqwidth+maxname+maxgr+7 > margin) margin = uniqwidth+maxname+maxgr+7;
1138
1139 /* Allocate a tmp buffer to hold sequence chunks in */
1140 ESL_ALLOC(buf, sizeof(char) * (cpl+1));
1141
1142 /* Magic Stockholm header */
1143 if (fprintf(fp, "# STOCKHOLM 1.0\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1144
1145 /* Warning about uniqization */
1146 if (make_uniquenames && fprintf(fp, "# WARNING: seq names have been made unique by adding a prefix of \"<seq#>|\"\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1147
1148 /* Free text comment section */
1149 for (i = 0; i < msa->ncomment; i++)
1150 if (fprintf(fp, "#%s\n", msa->comment[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1151 if (msa->ncomment > 0 && fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1152
1153 /* GF section: per-file annotation */
1154 if (msa->name && fprintf(fp, "#=GF %-*s %s\n", maxgf, "ID", msa->name) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1155 if (msa->acc && fprintf(fp, "#=GF %-*s %s\n", maxgf, "AC", msa->acc) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1156 if (msa->desc && fprintf(fp, "#=GF %-*s %s\n", maxgf, "DE", msa->desc) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1157 if (msa->au && fprintf(fp, "#=GF %-*s %s\n", maxgf, "AU", msa->au) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1158
1159 /* Thresholds are hacky. Pfam has two. Rfam has one. */
1160 if (msa->cutset[eslMSA_GA1] && msa->cutset[eslMSA_GA2]) { if (fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "GA", msa->cutoff[eslMSA_GA1], msa->cutoff[eslMSA_GA2]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1161 else if (msa->cutset[eslMSA_GA1]) { if (fprintf(fp, "#=GF %-*s %.1f\n", maxgf, "GA", msa->cutoff[eslMSA_GA1]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1162
1163 if (msa->cutset[eslMSA_NC1] && msa->cutset[eslMSA_NC2]) { if (fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "NC", msa->cutoff[eslMSA_NC1], msa->cutoff[eslMSA_NC2]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1164 else if (msa->cutset[eslMSA_NC1]) { if (fprintf(fp, "#=GF %-*s %.1f\n", maxgf, "NC", msa->cutoff[eslMSA_NC1]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1165
1166 if (msa->cutset[eslMSA_TC1] && msa->cutset[eslMSA_TC2]) { if (fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "TC", msa->cutoff[eslMSA_TC1], msa->cutoff[eslMSA_TC2]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1167 else if (msa->cutset[eslMSA_TC1]) { if (fprintf(fp, "#=GF %-*s %.1f\n", maxgf, "TC", msa->cutoff[eslMSA_TC1]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1168
1169 for (i = 0; i < msa->ngf; i++)
1170 if (fprintf(fp, "#=GF %-*s %s\n", maxgf, msa->gf_tag[i], msa->gf[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1171 if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1172
1173
1174 /* GS section: per-sequence annotation */
1175 if (msa->flags & eslMSA_HASWGTS) {
1176 for (i = 0; i < msa->nseq; i++)
1177 if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s WT %.2f\n", uniqwidth-1, i, maxname, msa->sqname[i], msa->wgt[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1178 else { if (fprintf(fp, "#=GS %-*s WT %.2f\n", maxname, msa->sqname[i], msa->wgt[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1179 if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1180 }
1181
1182 if (msa->sqacc) {
1183 for (i = 0; i < msa->nseq; i++)
1184 if (msa->sqacc[i]) {
1185 if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s AC %s\n", uniqwidth-1, i, maxname, msa->sqname[i], msa->sqacc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1186 else { if (fprintf(fp, "#=GS %-*s AC %s\n", maxname, msa->sqname[i], msa->sqacc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1187 }
1188 if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1189 }
1190
1191 if (msa->sqdesc) {
1192 for (i = 0; i < msa->nseq; i++)
1193 if (msa->sqdesc[i]) {
1194 if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s DE %s\n", uniqwidth-1, i, maxname, msa->sqname[i], msa->sqdesc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1195 else { if (fprintf(fp, "#=GS %-*s DE %s\n", maxname, msa->sqname[i], msa->sqdesc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1196 }
1197 if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1198 }
1199
1200 /* Multiannotated GS tags are possible; for example,
1201 * #=GS foo DR PDB; 1xxx;
1202 * #=GS foo DR PDB; 2yyy;
1203 * These are stored, for example, as:
1204 * msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
1205 * and must be decomposed.
1206 */
1207 for (i = 0; i < msa->ngs; i++)
1208 {
1209 gslen = strlen(msa->gs_tag[i]);
1210 for (j = 0; j < msa->nseq; j++)
1211 if (msa->gs[i][j]) {
1212 s = msa->gs[i][j];
1213 while (esl_strtok(&s, "\n", &tok) == eslOK)
1214 if (make_uniquenames) { if (fprintf(fp, "#=GS %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[j], gslen, msa->gs_tag[i], tok) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1215 else { if (fprintf(fp, "#=GS %-*s %-*s %s\n", maxname, msa->sqname[j], gslen, msa->gs_tag[i], tok) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1216 }
1217 if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1218 }
1219
1220 /* Alignment section:
1221 * contains aligned sequence, #=GR annotation, and #=GC annotation
1222 */
1223 for (currpos = 0; currpos < msa->alen; currpos += cpl)
1224 {
1225 acpl = (msa->alen - currpos > cpl)? cpl : msa->alen - currpos;
1226 buf[acpl] = '\0'; /* this suffices to terminate for all uses of buf[] in this block */
1227 if (currpos > 0 && fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1228
1229 for (i = 0; i < msa->nseq; i++)
1230 {
1231 if (msa->abc) esl_abc_TextizeN(msa->abc, msa->ax[i] + currpos + 1, acpl, buf);
1232 if (! msa->abc) strncpy(buf, msa->aseq[i] + currpos, acpl);
1233 if (make_uniquenames) { if (fprintf(fp, "%0*d|%-*s %s\n", uniqwidth-1, i, margin-uniqwidth-1, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1234 else { if (fprintf(fp, "%-*s %s\n", margin-1, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1235
1236 if (msa->ss && msa->ss[i]) {
1237 strncpy(buf, msa->ss[i] + currpos, acpl);
1238 if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, "SS", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1239 else { if (fprintf(fp, "#=GR %-*s %-*s %s\n", maxname, msa->sqname[i], margin-maxname-7, "SS", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1240 }
1241 if (msa->sa && msa->sa[i]) {
1242 strncpy(buf, msa->sa[i] + currpos, acpl);
1243 if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, "SA", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1244 else { if (fprintf(fp, "#=GR %-*s %-*s %s\n", maxname, msa->sqname[i], margin-maxname-7, "SA", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1245 }
1246 if (msa->pp && msa->pp[i]) {
1247 strncpy(buf, msa->pp[i] + currpos, acpl);
1248 if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, "PP", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1249 else { if (fprintf(fp, "#=GR %-*s %-*s %s\n", maxname, msa->sqname[i], margin-maxname-7, "PP", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1250 }
1251 for (j = 0; j < msa->ngr; j++)
1252 if (msa->gr[j][i]) {
1253 strncpy(buf, msa->gr[j][i] + currpos, acpl);
1254 if (make_uniquenames) { if (fprintf(fp, "#=GR %0*d|%-*s %-*s %s\n", uniqwidth-1, i, maxname, msa->sqname[i], margin-maxname-uniqwidth-7, msa->gr_tag[j], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1255 else { if (fprintf(fp, "#=GR %-*s %-*s %s\n", maxname, msa->sqname[i], margin-maxname-7, msa->gr_tag[j], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed"); }
1256 }
1257 }
1258
1259 if (msa->ss_cons) {
1260 strncpy(buf, msa->ss_cons + currpos, acpl);
1261 if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "SS_cons", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1262 }
1263 if (msa->sa_cons) {
1264 strncpy(buf, msa->sa_cons + currpos, acpl);
1265 if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "SA_cons", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1266 }
1267 if (msa->pp_cons) {
1268 strncpy(buf, msa->pp_cons + currpos, acpl);
1269 if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "PP_cons", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1270 }
1271 if (msa->rf) {
1272 strncpy(buf, msa->rf + currpos, acpl);
1273 if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "RF", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1274 }
1275 if (msa->mm) {
1276 strncpy(buf, msa->mm + currpos, acpl);
1277 if (fprintf(fp, "#=GC %-*s %s\n", margin-6, "MM", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1278 }
1279 for (j = 0; j < msa->ngc; j++) {
1280 strncpy(buf, msa->gc[j] + currpos, acpl);
1281 if (fprintf(fp, "#=GC %-*s %s\n", margin-6, msa->gc_tag[j], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1282 }
1283 }
1284 if (fprintf(fp, "//\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "stockholm msa write failed");
1285 free(buf);
1286 return eslOK;
1287
1288 ERROR:
1289 if (buf != NULL) free(buf);
1290 return status;
1291 }
1292 /*----------------- end, writing Stockholm/Pfam -----------------*/
1293
1294
1295
1296 /*****************************************************************
1297 * 6. Unit tests
1298 *****************************************************************/
1299 #ifdef eslMSAFILE_STOCKHOLM_TESTDRIVE
1300
1301 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)1302 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
1303 {
1304 fputs("# STOCKHOLM 1.0\n", ofp);
1305 fputs("#\n", ofp);
1306 fputs("# This is an example of a Stockholm multiple sequence alignment\n", ofp);
1307 fputs("# file. It is deliberately designed to be weird, to exercise many of the\n", ofp);
1308 fputs("# features of Stockholm format, in order to test a parser.\n", ofp);
1309 fputs("#\n", ofp);
1310 fputs("#=GF ID 14-3-3\n", ofp);
1311 fputs("#=GF AC PF00244\n", ofp);
1312 fputs("#=GF DE 14-3-3 proteins\n", ofp);
1313 fputs("#=GF AU Finn RD\n", ofp);
1314 fputs("#=GF AL Clustalw\n", ofp);
1315 fputs("#=GF SE Prosite\n", ofp);
1316 fputs("#=GF GA 25 25\n", ofp);
1317 fputs("#=GF TC 35.40 35.40\n", ofp);
1318 fputs("#=GF NC 8.80 8.80\n", ofp);
1319 fputs("#=GF BM hmmbuild -f HMM SEED\n", ofp);
1320 fputs("#=GF BM hmmcalibrate --seed 0 HMM\n", ofp);
1321 fputs("#=GF RN [1]\n", ofp);
1322 fputs("#=GF RM 95327195\n", ofp);
1323 fputs("#=GF RT Structure of a 14-3-3 protein and implications for\n", ofp);
1324 fputs("#=GF RT coordination of multiple signalling pathways. \n", ofp);
1325 fputs("#=GF RA Xiao B, Smerdon SJ, Jones DH, Dodson GG, Soneji Y, Aitken\n", ofp);
1326 fputs("#=GF RA A, Gamblin SJ; \n", ofp);
1327 fputs("#=GF RL Nature 1995;376:188-191.\n", ofp);
1328 fputs("#=GF RN [2]\n", ofp);
1329 fputs("#=GF RM 95327196\n", ofp);
1330 fputs("#=GF RT Crystal structure of the zeta isoform of the 14-3-3\n", ofp);
1331 fputs("#=GF RT protein. \n", ofp);
1332 fputs("#=GF RA Liu D, Bienkowska J, Petosa C, Collier RJ, Fu H, Liddington\n", ofp);
1333 fputs("#=GF RA R; \n", ofp);
1334 fputs("#=GF RL Nature 1995;376:191-194.\n", ofp);
1335 fputs("#=GF DR PROSITE; PDOC00633;\n", ofp);
1336 fputs("#=GF DR SMART; 14_3_3;\n", ofp);
1337 fputs("#=GF DR PRINTS; PR00305;\n", ofp);
1338 fputs("#=GF SQ 119\n", ofp);
1339 fputs(" \n", ofp);
1340 fputs("#=GS 1431_ENTHI/4-239 WT 0.42\n", ofp);
1341 fputs("#=GS seq1 WT 0.40\n", ofp);
1342 fputs("#=GS seq2 WT 0.41\n", ofp);
1343 fputs("#=GS seq3 WT 0.43\n", ofp);
1344 fputs("#=GS seq4 WT 0.44\n", ofp);
1345 fputs("#=GS seq5 WT 0.45\n", ofp);
1346 fputs("#=GS seq6 WT 0.46\n", ofp);
1347 fputs("\n", ofp);
1348 fputs("#=GS seq4 AC PF00001\n", ofp);
1349 fputs("#=GS seq4 DE A description of seq4.\n", ofp);
1350 fputs("\n", ofp);
1351 fputs("#=GS seq1 NEWTAG foo\n", ofp);
1352 fputs("#=GS seq2 NEWTAG bar\n", ofp);
1353 fputs("#=GS seq3 NEWTAG baz \n", ofp);
1354 fputs("\n", ofp);
1355 fputs("#=GS seq3 TAG2 foo2\n", ofp);
1356 fputs("#=GS seq4 TAG2 foo3\n", ofp);
1357 fputs("#=GS seq5 TAG2 foo4\n", ofp);
1358 fputs("\n", ofp);
1359 fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxx\n", ofp);
1360 fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxx \n", ofp);
1361 fputs("#=GC New_long_tag_thingie xxxxxxxxxxxxxxxxxxx\n", ofp);
1362 fputs("1431_ENTHI/4-239 ACDEFGHKLMNPQRSTVWY \n", ofp);
1363 fputs("#=GR seq1 SS ................... \n", ofp);
1364 fputs("#=GR seq1 SA 0000000000000000000\n", ofp);
1365 fputs("seq1 ACDEFGHKLMNPQRSTVWY\n", ofp);
1366 fputs("seq2 ACDEFGHKLMNPQRSTVWY\n", ofp);
1367 fputs("#=GR seq3 PP 0000000000000000000 \n", ofp);
1368 fputs("seq3 ACDEFGHKLMNPQRSTVWY\n", ofp);
1369 fputs("seq4 ACDEFGHKLMNPQRSTVWY\n", ofp);
1370 fputs("seq5 ACDEFGHKLMNPQRSTVWY\n", ofp);
1371 fputs("seq6 ACDEFGHKLMNPQRSTVWY\n", ofp);
1372 fputs("#=GR seq6 SS ...................\n", ofp);
1373 fputs("#=GR seq6 SA 9999999999999999999\n", ofp);
1374 fputs("#=GR seq6 Invented_tag *******************\n", ofp);
1375 fputs("#=GR seq6 Another_tag -------------------\n", ofp);
1376 fputs("#=GC PP_cons ******************* \n", ofp);
1377 fputs(" \n", ofp);
1378 fputs(" \n", ofp);
1379 fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxx\n", ofp);
1380 fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxx\n", ofp);
1381 fputs("#=GC New_long_tag_thingie xxxxxxxxxxxxxxxxxxx \n", ofp);
1382 fputs("1431_ENTHI/4-239 ACDEFGHKLMNPQRSTVWY \n", ofp);
1383 fputs("#=GR seq1 SS ...................\n", ofp);
1384 fputs("#=GR seq1 SA 0000000000000000000\n", ofp);
1385 fputs("seq1 ACDEFGHKLMNPQRSTVWY\n", ofp);
1386 fputs("seq2 ACDEFGHKLMNPQRSTVWY \n", ofp);
1387 fputs("#=GR seq3 PP 0000000000000000000 \n", ofp);
1388 fputs("seq3 ACDEFGHKLMNPQRSTVWY\n", ofp);
1389 fputs("seq4 ACDEFGHKLMNPQRSTVWY\n", ofp);
1390 fputs("seq5 ACDEFGHKLMNPQRSTVWY\n", ofp);
1391 fputs("seq6 ACDEFGHKLMNPQRSTVWY\n", ofp);
1392 fputs("#=GR seq6 SS ...................\n", ofp);
1393 fputs("#=GR seq6 SA 9999999999999999999\n", ofp);
1394 fputs("#=GR seq6 Invented_tag ******************* \n", ofp);
1395 fputs("#=GR seq6 Another_tag -------------------\n", ofp);
1396 fputs("#=GC PP_cons ******************* \n", ofp);
1397 fputs("\n", ofp);
1398 fputs("#\n", ofp);
1399 fputs("# And here's some trailing comments, just to\n", ofp);
1400 fputs("# try to confuse a parser.\n", ofp);
1401 fputs("#\n", ofp);
1402 fputs("\n", ofp);
1403 fputs("//\n", ofp);
1404
1405 *ret_alphatype = eslAMINO;
1406 *ret_nseq = 7;
1407 *ret_alen = 38;
1408 }
1409
1410 static void
utest_write_badformat1(FILE * ofp,int * ret_linenumber,char * errmsg)1411 utest_write_badformat1(FILE *ofp, int *ret_linenumber, char *errmsg)
1412 {
1413 fputs("missing Stockholm header\n", ofp);
1414
1415 *ret_linenumber = 1;
1416 strcpy(errmsg, "missing Stockholm header");
1417 }
1418
1419 static void
utest_write_badformat2(FILE * ofp,int * ret_linenumber,char * errmsg)1420 utest_write_badformat2(FILE *ofp, int *ret_linenumber, char *errmsg)
1421 {
1422 fputs("# STOCKHOLM 1.0\n", ofp);
1423 fputs("\n", ofp);
1424 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1425 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1426 fputs("\n", ofp);
1427 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1428 fputs("//\n", ofp);
1429
1430 *ret_linenumber = 7;
1431 strcpy(errmsg, "number of seqs in block did not match number in earlier block(s)");
1432 }
1433
1434 static void
utest_write_badformat3(FILE * ofp,int * ret_linenumber,char * errmsg)1435 utest_write_badformat3(FILE *ofp, int *ret_linenumber, char *errmsg)
1436 {
1437 fputs("# STOCKHOLM 1.0\n", ofp);
1438 fputs("\n", ofp);
1439 fputs("#=GS seq1 FOO baz\n", ofp);
1440 fputs("#=GS seq2 FOO boz\n", ofp);
1441 fputs("\n", ofp);
1442 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1443 fputs("//\n", ofp);
1444
1445 *ret_linenumber = 7;
1446 strcpy(errmsg, "number of seqs in block did not match number annotated by #=GS lines");
1447 }
1448
1449 static void
utest_write_badformat4(FILE * ofp,int * ret_linenumber,char * errmsg)1450 utest_write_badformat4(FILE *ofp, int *ret_linenumber, char *errmsg)
1451 {
1452 fputs("# STOCKHOLM 1.0\n", ofp);
1453 fputs("\n", ofp);
1454 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1455 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1456
1457 *ret_linenumber = 4;
1458 strcpy(errmsg, "missing // terminator after MSA");
1459 }
1460
1461 static void
utest_write_badformat5(FILE * ofp,int * ret_linenumber,char * errmsg)1462 utest_write_badformat5(FILE *ofp, int *ret_linenumber, char *errmsg)
1463 {
1464 fputs("# STOCKHOLM 1.0\n", ofp);
1465 fputs("\n", ofp);
1466 fputs("# No data\n", ofp);
1467 fputs("//\n", ofp);
1468
1469 *ret_linenumber = 4;
1470 strcpy(errmsg, "no alignment data followed Stockholm header");
1471 }
1472
1473 static void
utest_write_badformat6(FILE * ofp,int * ret_linenumber,char * errmsg)1474 utest_write_badformat6(FILE *ofp, int *ret_linenumber, char *errmsg)
1475 {
1476 fputs("# STOCKHOLM 1.0\n", ofp);
1477 fputs("\n", ofp);
1478 fputs("#=GF \n", ofp);
1479 fputs("\n", ofp);
1480 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1481 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1482 fputs("//\n", ofp);
1483
1484 *ret_linenumber = 3;
1485 strcpy(errmsg, "#=GF line is missing <tag>, annotation");
1486 }
1487
1488 static void
utest_write_badformat7(FILE * ofp,int * ret_linenumber,char * errmsg)1489 utest_write_badformat7(FILE *ofp, int *ret_linenumber, char *errmsg)
1490 {
1491 fputs("# STOCKHOLM 1.0\n", ofp);
1492 fputs("#=GFX tag\n", ofp);
1493 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1494 fputs("//\n", ofp);
1495
1496 *ret_linenumber = 2;
1497 strcpy(errmsg, "faux #=GF line");
1498 }
1499
1500 static void
utest_write_badformat8(FILE * ofp,int * ret_linenumber,char * errmsg)1501 utest_write_badformat8(FILE *ofp, int *ret_linenumber, char *errmsg)
1502 {
1503 fputs("# STOCKHOLM 1.0\n", ofp);
1504 fputs("#=GF ID\n", ofp);
1505 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1506 fputs("//\n", ofp);
1507
1508 *ret_linenumber = 2;
1509 strcpy(errmsg, "No name found on #=GF ID line");
1510 }
1511
1512 static void
utest_write_badformat9(FILE * ofp,int * ret_linenumber,char * errmsg)1513 utest_write_badformat9(FILE *ofp, int *ret_linenumber, char *errmsg)
1514 {
1515 fputs("# STOCKHOLM 1.0\n", ofp);
1516 fputs("#=GF ID name with cruft\n", ofp);
1517 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1518 fputs("//\n", ofp);
1519
1520 *ret_linenumber = 2;
1521 strcpy(errmsg, "#=GF ID line should have only one name (no whitespace allowed)");
1522 }
1523
1524 static void
utest_write_badformat10(FILE * ofp,int * ret_linenumber,char * errmsg)1525 utest_write_badformat10(FILE *ofp, int *ret_linenumber, char *errmsg)
1526 {
1527 fputs("# STOCKHOLM 1.0\n", ofp);
1528 fputs("#=GF AC \n", ofp);
1529 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1530 fputs("//\n", ofp);
1531
1532 *ret_linenumber = 2;
1533 strcpy(errmsg, "No accession found on #=GF AC line");
1534 }
1535
1536 static void
utest_write_badformat11(FILE * ofp,int * ret_linenumber,char * errmsg)1537 utest_write_badformat11(FILE *ofp, int *ret_linenumber, char *errmsg)
1538 {
1539 fputs("# STOCKHOLM 1.0\n", ofp);
1540 fputs("#=GF AC accession with cruft \n", ofp);
1541 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1542 fputs("//\n", ofp);
1543
1544 *ret_linenumber = 2;
1545 strcpy(errmsg, "#=GF AC line should have only one accession (no whitespace allowed)");
1546 }
1547
1548 static void
utest_write_badformat12(FILE * ofp,int * ret_linenumber,char * errmsg)1549 utest_write_badformat12(FILE *ofp, int *ret_linenumber, char *errmsg)
1550 {
1551 fputs("# STOCKHOLM 1.0\n", ofp);
1552 fputs("#=GF GA not_a_number \n", ofp);
1553 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1554 fputs("//\n", ofp);
1555
1556 *ret_linenumber = 2;
1557 strcpy(errmsg, "Expected a real number for GA1 value on #=GF GA line");
1558 }
1559
1560 static void
utest_write_badformat13(FILE * ofp,int * ret_linenumber,char * errmsg)1561 utest_write_badformat13(FILE *ofp, int *ret_linenumber, char *errmsg)
1562 {
1563 fputs("# STOCKHOLM 1.0\n", ofp);
1564 fputs("#=GF GA 10.0 not_a_number \n", ofp);
1565 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1566 fputs("//\n", ofp);
1567
1568 *ret_linenumber = 2;
1569 strcpy(errmsg, "Expected a real number for GA2 value on #=GF GA line");
1570 }
1571
1572 static void
utest_write_badformat14(FILE * ofp,int * ret_linenumber,char * errmsg)1573 utest_write_badformat14(FILE *ofp, int *ret_linenumber, char *errmsg)
1574 {
1575 fputs("# STOCKHOLM 1.0\n", ofp);
1576 fputs("#=GF GA \n", ofp);
1577 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1578 fputs("//\n", ofp);
1579
1580 *ret_linenumber = 2;
1581 strcpy(errmsg, "No GA threshold value found on #=GF GA line");
1582 }
1583
1584 static void
utest_write_badformat15(FILE * ofp,int * ret_linenumber,char * errmsg)1585 utest_write_badformat15(FILE *ofp, int *ret_linenumber, char *errmsg)
1586 {
1587 fputs("# STOCKHOLM 1.0\n", ofp);
1588 fputs("#=GF NC not_a_number \n", ofp);
1589 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1590 fputs("//\n", ofp);
1591
1592 *ret_linenumber = 2;
1593 strcpy(errmsg, "Expected a real number for NC1 value on #=GF NC line");
1594 }
1595
1596 static void
utest_write_badformat16(FILE * ofp,int * ret_linenumber,char * errmsg)1597 utest_write_badformat16(FILE *ofp, int *ret_linenumber, char *errmsg)
1598 {
1599 fputs("# STOCKHOLM 1.0\n", ofp);
1600 fputs("#=GF NC 10.0 not_a_number \n", ofp);
1601 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1602 fputs("//\n", ofp);
1603
1604 *ret_linenumber = 2;
1605 strcpy(errmsg, "Expected a real number for NC2 value on #=GF NC line");
1606 }
1607
1608 static void
utest_write_badformat17(FILE * ofp,int * ret_linenumber,char * errmsg)1609 utest_write_badformat17(FILE *ofp, int *ret_linenumber, char *errmsg)
1610 {
1611 fputs("# STOCKHOLM 1.0\n", ofp);
1612 fputs("#=GF NC \n", ofp);
1613 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1614 fputs("//\n", ofp);
1615
1616 *ret_linenumber = 2;
1617 strcpy(errmsg, "No NC threshold value found on #=GF NC line");
1618 }
1619
1620 static void
utest_write_badformat18(FILE * ofp,int * ret_linenumber,char * errmsg)1621 utest_write_badformat18(FILE *ofp, int *ret_linenumber, char *errmsg)
1622 {
1623 fputs("# STOCKHOLM 1.0\n", ofp);
1624 fputs("#=GF TC not_a_number \n", ofp);
1625 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1626 fputs("//\n", ofp);
1627
1628 *ret_linenumber = 2;
1629 strcpy(errmsg, "Expected a real number for TC1 value on #=GF TC line");
1630 }
1631
1632 static void
utest_write_badformat19(FILE * ofp,int * ret_linenumber,char * errmsg)1633 utest_write_badformat19(FILE *ofp, int *ret_linenumber, char *errmsg)
1634 {
1635 fputs("# STOCKHOLM 1.0\n", ofp);
1636 fputs("#=GF TC 10.0 not_a_number \n", ofp);
1637 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1638 fputs("//\n", ofp);
1639
1640 *ret_linenumber = 2;
1641 strcpy(errmsg, "Expected a real number for TC2 value on #=GF TC line");
1642 }
1643
1644 static void
utest_write_badformat20(FILE * ofp,int * ret_linenumber,char * errmsg)1645 utest_write_badformat20(FILE *ofp, int *ret_linenumber, char *errmsg)
1646 {
1647 fputs("# STOCKHOLM 1.0\n", ofp);
1648 fputs("#=GF TC \n", ofp);
1649 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1650 fputs("//\n", ofp);
1651
1652 *ret_linenumber = 2;
1653 strcpy(errmsg, "No TC threshold value found on #=GF TC line");
1654 }
1655
1656 static void
utest_write_badformat21(FILE * ofp,int * ret_linenumber,char * errmsg)1657 utest_write_badformat21(FILE *ofp, int *ret_linenumber, char *errmsg)
1658 {
1659 fputs("# STOCKHOLM 1.0\n", ofp);
1660 fputs("#=GS \n", ofp);
1661 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1662 fputs("//\n", ofp);
1663
1664 *ret_linenumber = 2;
1665 strcpy(errmsg, "#=GS line missing <seqname>, <tag>, annotation");
1666 }
1667
1668 static void
utest_write_badformat22(FILE * ofp,int * ret_linenumber,char * errmsg)1669 utest_write_badformat22(FILE *ofp, int *ret_linenumber, char *errmsg)
1670 {
1671 fputs("# STOCKHOLM 1.0\n", ofp);
1672 fputs("#=GS seq1 \n", ofp);
1673 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1674 fputs("//\n", ofp);
1675
1676 *ret_linenumber = 2;
1677 strcpy(errmsg, "#=GS line missing <tag>, annotation");
1678 }
1679
1680 static void
utest_write_badformat23(FILE * ofp,int * ret_linenumber,char * errmsg)1681 utest_write_badformat23(FILE *ofp, int *ret_linenumber, char *errmsg)
1682 {
1683 fputs("# STOCKHOLM 1.0\n", ofp);
1684 fputs("#=GSX seq1 foo \n", ofp);
1685 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1686 fputs("//\n", ofp);
1687
1688 *ret_linenumber = 2;
1689 strcpy(errmsg, "faux #=GS line");
1690 }
1691
1692 static void
utest_write_badformat24(FILE * ofp,int * ret_linenumber,char * errmsg)1693 utest_write_badformat24(FILE *ofp, int *ret_linenumber, char *errmsg)
1694 {
1695 fputs("# STOCKHOLM 1.0\n", ofp);
1696 fputs("#=GS seq1 WT \n", ofp);
1697 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1698 fputs("//\n", ofp);
1699
1700 *ret_linenumber = 2;
1701 strcpy(errmsg, "no weight value found on #=GS <seqname> WT line");
1702 }
1703
1704 static void
utest_write_badformat25(FILE * ofp,int * ret_linenumber,char * errmsg)1705 utest_write_badformat25(FILE *ofp, int *ret_linenumber, char *errmsg)
1706 {
1707 fputs("# STOCKHOLM 1.0\n", ofp);
1708 fputs("#=GS seq1 WT 2.0 \n", ofp);
1709 fputs("#=GS seq1 WT 2.0 \n", ofp);
1710 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1711 fputs("//\n", ofp);
1712
1713 *ret_linenumber = 3;
1714 strcpy(errmsg, "sequence has more than one #=GS <seqname> WT line");
1715 }
1716
1717 static void
utest_write_badformat26(FILE * ofp,int * ret_linenumber,char * errmsg)1718 utest_write_badformat26(FILE *ofp, int *ret_linenumber, char *errmsg)
1719 {
1720 fputs("# STOCKHOLM 1.0\n", ofp);
1721 fputs("#=GS seq1 WT 2.0 cruft \n", ofp);
1722 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1723 fputs("//\n", ofp);
1724
1725 *ret_linenumber = 2;
1726 strcpy(errmsg, "#=GS <seqname> WT line should have only one field, the weight");
1727 }
1728
1729 static void
utest_write_badformat27(FILE * ofp,int * ret_linenumber,char * errmsg)1730 utest_write_badformat27(FILE *ofp, int *ret_linenumber, char *errmsg)
1731 {
1732 fputs("# STOCKHOLM 1.0\n", ofp);
1733 fputs("#=GS seq1 WT cruft \n", ofp);
1734 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1735 fputs("//\n", ofp);
1736
1737 *ret_linenumber = 2;
1738 strcpy(errmsg, "value on #=GS <seqname> WT line isn't a real number");
1739 }
1740
1741 static void
utest_write_badformat28(FILE * ofp,int * ret_linenumber,char * errmsg)1742 utest_write_badformat28(FILE *ofp, int *ret_linenumber, char *errmsg)
1743 {
1744 fputs("# STOCKHOLM 1.0\n", ofp);
1745 fputs("#=GS seq1 AC \n", ofp);
1746 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1747 fputs("//\n", ofp);
1748
1749 *ret_linenumber = 2;
1750 strcpy(errmsg, "no accession found on #=GS <seqname> AC line");
1751 }
1752
1753 static void
utest_write_badformat29(FILE * ofp,int * ret_linenumber,char * errmsg)1754 utest_write_badformat29(FILE *ofp, int *ret_linenumber, char *errmsg)
1755 {
1756 fputs("# STOCKHOLM 1.0\n", ofp);
1757 fputs("#=GS seq1 AC xxxxxx \n", ofp);
1758 fputs("#=GS seq1 AC yyyyyy \n", ofp);
1759 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1760 fputs("//\n", ofp);
1761
1762 *ret_linenumber = 3;
1763 strcpy(errmsg, "sequence has more than one #=GS <seqname> AC accession line");
1764 }
1765
1766 static void
utest_write_badformat30(FILE * ofp,int * ret_linenumber,char * errmsg)1767 utest_write_badformat30(FILE *ofp, int *ret_linenumber, char *errmsg)
1768 {
1769 fputs("# STOCKHOLM 1.0\n", ofp);
1770 fputs("#=GS seq1 AC xxxxxx cruft \n", ofp);
1771 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1772 fputs("//\n", ofp);
1773
1774 *ret_linenumber = 2;
1775 strcpy(errmsg, "#=GS <seqname> AC line should have only one field, the accession");
1776 }
1777
1778 static void
utest_write_badformat31(FILE * ofp,int * ret_linenumber,char * errmsg)1779 utest_write_badformat31(FILE *ofp, int *ret_linenumber, char *errmsg)
1780 {
1781 fputs("# STOCKHOLM 1.0\n", ofp);
1782 fputs("#=GS seq1 DE a one line description \n", ofp);
1783 fputs("#=GS seq1 DE oops, a second line \n", ofp);
1784 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1785 fputs("//\n", ofp);
1786
1787 *ret_linenumber = 3;
1788 strcpy(errmsg, "sequence has more than one #=GS <seqname> DE accession line");
1789 }
1790
1791 static void
utest_write_badformat32(FILE * ofp,int * ret_linenumber,char * errmsg)1792 utest_write_badformat32(FILE *ofp, int *ret_linenumber, char *errmsg)
1793 {
1794 fputs("# STOCKHOLM 1.0\n", ofp);
1795 fputs("#=GC \n", ofp);
1796 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1797 fputs("//\n", ofp);
1798
1799 *ret_linenumber = 2;
1800 strcpy(errmsg, "#=GC line missing <tag>, annotation");
1801 }
1802
1803 static void
utest_write_badformat33(FILE * ofp,int * ret_linenumber,char * errmsg)1804 utest_write_badformat33(FILE *ofp, int *ret_linenumber, char *errmsg)
1805 {
1806 fputs("# STOCKHOLM 1.0\n", ofp);
1807 fputs("#=GCX FOO xxxxxxxxxxxxxxxxxxxx \n", ofp);
1808 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1809 fputs("//\n", ofp);
1810
1811 *ret_linenumber = 2;
1812 strcpy(errmsg, "faux #=GC line");
1813 }
1814
1815 static void
utest_write_badformat34(FILE * ofp,int * ret_linenumber,char * errmsg)1816 utest_write_badformat34(FILE *ofp, int *ret_linenumber, char *errmsg)
1817 {
1818 fputs("# STOCKHOLM 1.0\n", ofp);
1819 fputs("#=GC FOO \n", ofp);
1820 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1821 fputs("//\n", ofp);
1822
1823 *ret_linenumber = 2;
1824 strcpy(errmsg, "#=GC line missing annotation");
1825 }
1826
1827 static void
utest_write_badformat35(FILE * ofp,int * ret_linenumber,char * errmsg)1828 utest_write_badformat35(FILE *ofp, int *ret_linenumber, char *errmsg)
1829 {
1830 fputs("# STOCKHOLM 1.0\n", ofp);
1831 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1832 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1833 fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1834 fputs("\n", ofp);
1835 fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1836 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1837 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1838 fputs("//\n", ofp);
1839
1840 *ret_linenumber = 6;
1841 strcpy(errmsg, "unexpected #=GC SS_cons");
1842 }
1843
1844 static void
utest_write_badformat36(FILE * ofp,int * ret_linenumber,char * errmsg)1845 utest_write_badformat36(FILE *ofp, int *ret_linenumber, char *errmsg)
1846 {
1847 fputs("# STOCKHOLM 1.0\n", ofp);
1848 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1849 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1850 fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1851 fputs("\n", ofp);
1852 fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1853 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1854 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1855 fputs("//\n", ofp);
1856
1857 *ret_linenumber = 6;
1858 strcpy(errmsg, "unexpected #=GC SA_cons");
1859 }
1860
1861 static void
utest_write_badformat37(FILE * ofp,int * ret_linenumber,char * errmsg)1862 utest_write_badformat37(FILE *ofp, int *ret_linenumber, char *errmsg)
1863 {
1864 fputs("# STOCKHOLM 1.0\n", ofp);
1865 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1866 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1867 fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1868 fputs("\n", ofp);
1869 fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1870 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1871 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1872 fputs("//\n", ofp);
1873
1874 *ret_linenumber = 6;
1875 strcpy(errmsg, "unexpected #=GC PP_cons");
1876 }
1877
1878 static void
utest_write_badformat38(FILE * ofp,int * ret_linenumber,char * errmsg)1879 utest_write_badformat38(FILE *ofp, int *ret_linenumber, char *errmsg)
1880 {
1881 fputs("# STOCKHOLM 1.0\n", ofp);
1882 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1883 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1884 fputs("#=GC RF xxxxxxxxxxxxxxxxxxxx\n", ofp);
1885 fputs("\n", ofp);
1886 fputs("#=GC RF xxxxxxxxxxxxxxxxxxxx\n", ofp);
1887 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1888 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1889 fputs("//\n", ofp);
1890
1891 *ret_linenumber = 6;
1892 strcpy(errmsg, "unexpected #=GC RF");
1893 }
1894
1895 static void
utest_write_badformat39(FILE * ofp,int * ret_linenumber,char * errmsg)1896 utest_write_badformat39(FILE *ofp, int *ret_linenumber, char *errmsg)
1897 {
1898 fputs("# STOCKHOLM 1.0\n", ofp);
1899 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1900 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1901 fputs("#=GC XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
1902 fputs("\n", ofp);
1903 fputs("#=GC XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
1904 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1905 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1906 fputs("//\n", ofp);
1907
1908 *ret_linenumber = 6;
1909 strcpy(errmsg, "unexpected #=GC line");
1910 }
1911
1912 static void
utest_write_badformat40(FILE * ofp,int * ret_linenumber,char * errmsg)1913 utest_write_badformat40(FILE *ofp, int *ret_linenumber, char *errmsg)
1914 {
1915 fputs("# STOCKHOLM 1.0\n", ofp);
1916 fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1917 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1918 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1919 fputs("#=GC SS_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1920 fputs("//\n", ofp);
1921
1922 *ret_linenumber = 5;
1923 strcpy(errmsg, "more than one #=GC SS_cons line in block");
1924 }
1925
1926 static void
utest_write_badformat41(FILE * ofp,int * ret_linenumber,char * errmsg)1927 utest_write_badformat41(FILE *ofp, int *ret_linenumber, char *errmsg)
1928 {
1929 fputs("# STOCKHOLM 1.0\n", ofp);
1930 fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1931 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1932 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1933 fputs("#=GC SA_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1934 fputs("//\n", ofp);
1935
1936 *ret_linenumber = 5;
1937 strcpy(errmsg, "more than one #=GC SA_cons line in block");
1938 }
1939
1940 static void
utest_write_badformat42(FILE * ofp,int * ret_linenumber,char * errmsg)1941 utest_write_badformat42(FILE *ofp, int *ret_linenumber, char *errmsg)
1942 {
1943 fputs("# STOCKHOLM 1.0\n", ofp);
1944 fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1945 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1946 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1947 fputs("#=GC PP_cons xxxxxxxxxxxxxxxxxxxx\n", ofp);
1948 fputs("//\n", ofp);
1949
1950 *ret_linenumber = 5;
1951 strcpy(errmsg, "more than one #=GC PP_cons line in block");
1952 }
1953
1954 static void
utest_write_badformat43(FILE * ofp,int * ret_linenumber,char * errmsg)1955 utest_write_badformat43(FILE *ofp, int *ret_linenumber, char *errmsg)
1956 {
1957 fputs("# STOCKHOLM 1.0\n", ofp);
1958 fputs("#=GC RF xxxxxxxxxxxxxxxxxxxx\n", ofp);
1959 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1960 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1961 fputs("#=GC RF xxxxxxxxxxxxxxxxxxxx\n", ofp);
1962 fputs("//\n", ofp);
1963
1964 *ret_linenumber = 5;
1965 strcpy(errmsg, "more than one #=GC RF line in block");
1966 }
1967
1968 static void
utest_write_badformat44(FILE * ofp,int * ret_linenumber,char * errmsg)1969 utest_write_badformat44(FILE *ofp, int *ret_linenumber, char *errmsg)
1970 {
1971 fputs("# STOCKHOLM 1.0\n", ofp);
1972 fputs("#=GC XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
1973 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1974 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1975 fputs("#=GC XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
1976 fputs("//\n", ofp);
1977
1978 *ret_linenumber = 5;
1979 strcpy(errmsg, "more than one #=GC XX line in block");
1980 }
1981
1982 static void
utest_write_badformat45(FILE * ofp,int * ret_linenumber,char * errmsg)1983 utest_write_badformat45(FILE *ofp, int *ret_linenumber, char *errmsg)
1984 {
1985 fputs("# STOCKHOLM 1.0\n", ofp);
1986 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1987 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
1988 fputs("#=GC XX xxxxxxxxxxxxxxxxxx\n", ofp);
1989 fputs("//\n", ofp);
1990
1991 *ret_linenumber = 4;
1992 strcpy(errmsg, "unexpected # of aligned annotation in #=GC XX line");
1993 }
1994
1995 static void
utest_write_badformat46(FILE * ofp,int * ret_linenumber,char * errmsg)1996 utest_write_badformat46(FILE *ofp, int *ret_linenumber, char *errmsg)
1997 {
1998 fputs("# STOCKHOLM 1.0\n", ofp);
1999 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2000 fputs("#=GR \n", ofp);
2001 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2002 fputs("//\n", ofp);
2003
2004 *ret_linenumber = 3;
2005 strcpy(errmsg, "#=GR line missing <seqname>, <tag>, annotation");
2006 }
2007
2008 static void
utest_write_badformat47(FILE * ofp,int * ret_linenumber,char * errmsg)2009 utest_write_badformat47(FILE *ofp, int *ret_linenumber, char *errmsg)
2010 {
2011 fputs("# STOCKHOLM 1.0\n", ofp);
2012 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2013 fputs("#=GR seq1 \n", ofp);
2014 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2015 fputs("//\n", ofp);
2016
2017 *ret_linenumber = 3;
2018 strcpy(errmsg, "#=GR line missing <tag>, annotation");
2019 }
2020
2021 static void
utest_write_badformat48(FILE * ofp,int * ret_linenumber,char * errmsg)2022 utest_write_badformat48(FILE *ofp, int *ret_linenumber, char *errmsg)
2023 {
2024 fputs("# STOCKHOLM 1.0\n", ofp);
2025 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2026 fputs("#=GRX seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2027 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2028 fputs("//\n", ofp);
2029
2030 *ret_linenumber = 3;
2031 strcpy(errmsg, "faux #=GR line");
2032 }
2033
2034 static void
utest_write_badformat49(FILE * ofp,int * ret_linenumber,char * errmsg)2035 utest_write_badformat49(FILE *ofp, int *ret_linenumber, char *errmsg)
2036 {
2037 fputs("# STOCKHOLM 1.0\n", ofp);
2038 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2039 fputs("#=GR seq1 XX \n", ofp);
2040 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2041 fputs("//\n", ofp);
2042
2043 *ret_linenumber = 3;
2044 strcpy(errmsg, "#=GR line missing annotation");
2045 }
2046
2047 static void
utest_write_badformat50(FILE * ofp,int * ret_linenumber,char * errmsg)2048 utest_write_badformat50(FILE *ofp, int *ret_linenumber, char *errmsg)
2049 {
2050 fputs("# STOCKHOLM 1.0\n", ofp);
2051 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2052 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2053 fputs("#=GR seq2 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2054 fputs("\n", ofp);
2055 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2056 fputs("#=GR seq2 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2057 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2058 fputs("//\n", ofp);
2059
2060 *ret_linenumber = 7;
2061 strcpy(errmsg, "unexpected #=GR <seqname> SS");
2062 }
2063
2064 static void
utest_write_badformat51(FILE * ofp,int * ret_linenumber,char * errmsg)2065 utest_write_badformat51(FILE *ofp, int *ret_linenumber, char *errmsg)
2066 {
2067 fputs("# STOCKHOLM 1.0\n", ofp);
2068 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2069 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2070 fputs("#=GR seq2 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2071 fputs("\n", ofp);
2072 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2073 fputs("#=GR seq2 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2074 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2075 fputs("//\n", ofp);
2076
2077 *ret_linenumber = 7;
2078 strcpy(errmsg, "unexpected #=GR <seqname> SA");
2079 }
2080
2081 static void
utest_write_badformat52(FILE * ofp,int * ret_linenumber,char * errmsg)2082 utest_write_badformat52(FILE *ofp, int *ret_linenumber, char *errmsg)
2083 {
2084 fputs("# STOCKHOLM 1.0\n", ofp);
2085 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2086 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2087 fputs("#=GR seq2 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2088 fputs("\n", ofp);
2089 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2090 fputs("#=GR seq2 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2091 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2092 fputs("//\n", ofp);
2093
2094 *ret_linenumber = 7;
2095 strcpy(errmsg, "unexpected #=GR <seqname> PP");
2096 }
2097
2098 static void
utest_write_badformat53(FILE * ofp,int * ret_linenumber,char * errmsg)2099 utest_write_badformat53(FILE *ofp, int *ret_linenumber, char *errmsg)
2100 {
2101 fputs("# STOCKHOLM 1.0\n", ofp);
2102 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2103 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2104 fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2105 fputs("\n", ofp);
2106 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2107 fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2108 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2109 fputs("//\n", ofp);
2110
2111 *ret_linenumber = 7;
2112 strcpy(errmsg, "unexpected #=GR line");
2113 }
2114
2115 static void
utest_write_badformat54(FILE * ofp,int * ret_linenumber,char * errmsg)2116 utest_write_badformat54(FILE *ofp, int *ret_linenumber, char *errmsg)
2117 {
2118 fputs("# STOCKHOLM 1.0\n", ofp);
2119 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2120 fputs("#=GR seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2121 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2122 fputs("\n", ofp);
2123 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2124 fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2125 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2126 fputs("//\n", ofp);
2127
2128 *ret_linenumber = 7;
2129 strcpy(errmsg, "unexpected seqname seq2; expected seq1 from prev blocks");
2130 }
2131
2132 static void
utest_write_badformat55(FILE * ofp,int * ret_linenumber,char * errmsg)2133 utest_write_badformat55(FILE *ofp, int *ret_linenumber, char *errmsg)
2134 {
2135 fputs("# STOCKHOLM 1.0\n", ofp);
2136 fputs("#=GR seq1 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2137 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2138 fputs("#=GR seq1 SS xxxxxxxxxxxxxxxxxxxx\n", ofp);
2139 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2140 fputs("//\n", ofp);
2141
2142 *ret_linenumber = 4;
2143 strcpy(errmsg, "more than one #=GR seq1 SS line in block");
2144 }
2145
2146 static void
utest_write_badformat56(FILE * ofp,int * ret_linenumber,char * errmsg)2147 utest_write_badformat56(FILE *ofp, int *ret_linenumber, char *errmsg)
2148 {
2149 fputs("# STOCKHOLM 1.0\n", ofp);
2150 fputs("#=GR seq1 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2151 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2152 fputs("#=GR seq1 PP xxxxxxxxxxxxxxxxxxxx\n", ofp);
2153 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2154 fputs("//\n", ofp);
2155
2156 *ret_linenumber = 4;
2157 strcpy(errmsg, "more than one #=GR seq1 PP line in block");
2158 }
2159
2160 static void
utest_write_badformat57(FILE * ofp,int * ret_linenumber,char * errmsg)2161 utest_write_badformat57(FILE *ofp, int *ret_linenumber, char *errmsg)
2162 {
2163 fputs("# STOCKHOLM 1.0\n", ofp);
2164 fputs("#=GR seq1 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2165 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2166 fputs("#=GR seq1 SA xxxxxxxxxxxxxxxxxxxx\n", ofp);
2167 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2168 fputs("//\n", ofp);
2169
2170 *ret_linenumber = 4;
2171 strcpy(errmsg, "more than one #=GR seq1 SA line in block");
2172 }
2173
2174 static void
utest_write_badformat58(FILE * ofp,int * ret_linenumber,char * errmsg)2175 utest_write_badformat58(FILE *ofp, int *ret_linenumber, char *errmsg)
2176 {
2177 fputs("# STOCKHOLM 1.0\n", ofp);
2178 fputs("#=GR seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2179 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2180 fputs("#=GR seq1 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2181 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2182 fputs("//\n", ofp);
2183
2184 *ret_linenumber = 4;
2185 strcpy(errmsg, "more than one #=GR seq1 XX line in block");
2186 }
2187
2188 static void
utest_write_badformat59(FILE * ofp,int * ret_linenumber,char * errmsg)2189 utest_write_badformat59(FILE *ofp, int *ret_linenumber, char *errmsg)
2190 {
2191 fputs("# STOCKHOLM 1.0\n", ofp);
2192 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2193 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2194 fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxx\n", ofp);
2195 fputs("//\n", ofp);
2196
2197 *ret_linenumber = 4;
2198 strcpy(errmsg, "unexpected # of aligned annotation in #=GR seq2 XX line");
2199 }
2200
2201 static void
utest_write_badformat60(FILE * ofp,int * ret_linenumber,char * errmsg)2202 utest_write_badformat60(FILE *ofp, int *ret_linenumber, char *errmsg)
2203 {
2204 fputs("# STOCKHOLM 1.0\n", ofp);
2205 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2206 fputs("seq2 \n", ofp);
2207 fputs("//\n", ofp);
2208
2209 *ret_linenumber = 3;
2210 strcpy(errmsg, "sequence line with no sequence");
2211 }
2212
2213 static void
utest_write_badformat61(FILE * ofp,int * ret_linenumber,char * errmsg)2214 utest_write_badformat61(FILE *ofp, int *ret_linenumber, char *errmsg)
2215 {
2216 fputs("# STOCKHOLM 1.0\n", ofp);
2217 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2218 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2219 fputs("\n", ofp);
2220 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2221 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2222 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2223 fputs("//\n", ofp);
2224
2225 *ret_linenumber = 7;
2226 strcpy(errmsg, "more lines than expected; earlier blocks had fewer");
2227 }
2228
2229 static void
utest_write_badformat62(FILE * ofp,int * ret_linenumber,char * errmsg)2230 utest_write_badformat62(FILE *ofp, int *ret_linenumber, char *errmsg)
2231 {
2232 fputs("# STOCKHOLM 1.0\n", ofp);
2233 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2234 fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2235 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2236 fputs("\n", ofp);
2237 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2238 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2239 fputs("#=GR seq2 XX xxxxxxxxxxxxxxxxxxxx\n", ofp);
2240 fputs("//\n", ofp);
2241
2242 *ret_linenumber = 7;
2243 strcpy(errmsg, "unexpected seq line; earlier block(s) in different order");
2244 }
2245
2246 static void
utest_write_badformat63(FILE * ofp,int * ret_linenumber,char * errmsg)2247 utest_write_badformat63(FILE *ofp, int *ret_linenumber, char *errmsg)
2248 {
2249 fputs("# STOCKHOLM 1.0\n", ofp);
2250 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2251 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2252 fputs("seq3 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2253 fputs("\n", ofp);
2254 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2255 fputs("seq3 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2256 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2257 fputs("//\n", ofp);
2258
2259 *ret_linenumber = 7;
2260 strcpy(errmsg, "unexpected seq name seq3; expected seq2 from prev block order");
2261 }
2262
2263 static void
utest_write_badformat64(FILE * ofp,int * ret_linenumber,char * errmsg)2264 utest_write_badformat64(FILE *ofp, int *ret_linenumber, char *errmsg)
2265 {
2266 fputs("# STOCKHOLM 1.0\n", ofp);
2267 fputs("seq1 ACDEFGHIKL^NPQRSTVWY\n", ofp);
2268 fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2269 fputs("//\n", ofp);
2270
2271 *ret_linenumber = 2;
2272 strcpy(errmsg, "invalid sequence character(s) on line");
2273 }
2274
2275 static void
utest_write_badformat65(FILE * ofp,int * ret_linenumber,char * errmsg)2276 utest_write_badformat65(FILE *ofp, int *ret_linenumber, char *errmsg)
2277 {
2278 fputs("# STOCKHOLM 1.0\n", ofp);
2279 fputs("seq1 ACDEFGHIKLMNPQRSTVWY\n", ofp);
2280 fputs("seq2 ACDEFGHIKLMNPQRSTVWYZ\n", ofp);
2281 fputs("//\n", ofp);
2282
2283 *ret_linenumber = 3;
2284 strcpy(errmsg, "unexpected number of aligned residues parsed on line");
2285 }
2286
2287
2288 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)2289 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
2290 {
2291 ESL_ALPHABET *abc = NULL;
2292 ESL_MSAFILE *afp = NULL;
2293 ESL_MSA *msa1 = NULL;
2294 ESL_MSA *msa2 = NULL;
2295 char tmpfile1[32] = "esltmpXXXXXX";
2296 char tmpfile2[32] = "esltmpXXXXXX";
2297 FILE *ofp = NULL;
2298 int status;
2299
2300 /* guessing both the format and the alphabet should work: this is a digital open */
2301 if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("stockholm good file test %d failed: digital open", testnumber);
2302 if (abc->type != expected_alphatype) esl_fatal("stockholm good file test %d failed: alphabet autodetection", testnumber);
2303 if (afp->format != eslMSAFILE_STOCKHOLM) esl_fatal("stockholm good file test %d failed: format autodetection", testnumber); /* eslMSAFILE_PFAM is autodetected as STOCKHOLM, and that's fine */
2304
2305 /* This is a digital read, using <abc>. */
2306 if ( (status = esl_msafile_stockholm_Read(afp, &msa1)) != eslOK) esl_fatal("stockholm good file test %d failed: msa read, digital", testnumber);
2307 if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("stockholm good file test %d failed: nseq/alen", testnumber);
2308 if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("stockholm good file test %d failed: msa1 invalid", testnumber);
2309 esl_msafile_Close(afp);
2310
2311 /* write it back out to a new tmpfile (digital write) */
2312 if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("stockholm good file test %d failed: tmpfile creation", testnumber);
2313 if ( (status = esl_msafile_stockholm_Write(ofp, msa1, eslMSAFILE_STOCKHOLM)) != eslOK) esl_fatal("stockholm good file test %d failed: msa write, digital", testnumber);
2314 fclose(ofp);
2315
2316 /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
2317 if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp)) != eslOK) esl_fatal("stockholm good file test %d failed: text mode open", testnumber);
2318 if ( (status = esl_msafile_stockholm_Read(afp, &msa2)) != eslOK) esl_fatal("stockholm good file test %d failed: msa read, text", testnumber);
2319 if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("stockholm good file test %d failed: nseq/alen", testnumber);
2320 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("stockholm good file test %d failed: msa2 invalid", testnumber);
2321 esl_msafile_Close(afp);
2322
2323 /* write it back out to a new tmpfile (text write) */
2324 if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("stockholm good file test %d failed: tmpfile creation", testnumber);
2325 if ( (status = esl_msafile_stockholm_Write(ofp, msa2, eslMSAFILE_PFAM)) != eslOK) esl_fatal("stockholm good file test %d failed: msa write, text", testnumber);
2326 fclose(ofp);
2327 esl_msa_Destroy(msa2);
2328
2329 /* open and read it in digital mode */
2330 if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_PFAM, NULL, &afp)) != eslOK) esl_fatal("stockholm good file test %d failed: 2nd digital mode open", testnumber);
2331 if ( (status = esl_msafile_stockholm_Read(afp, &msa2)) != eslOK) esl_fatal("stockholm good file test %d failed: 2nd digital msa read", testnumber);
2332 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("stockholm good file test %d failed: msa2 invalid", testnumber);
2333 esl_msafile_Close(afp);
2334
2335 /* this msa <msa2> should be identical to <msa1> */
2336 if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("stockholm good file test %d failed: msa compare", testnumber);
2337
2338 remove(tmpfile1);
2339 remove(tmpfile2);
2340 esl_msa_Destroy(msa1);
2341 esl_msa_Destroy(msa2);
2342 esl_alphabet_Destroy(abc);
2343 }
2344
2345 static void
utest_bad_format(char * filename,int testnumber,int expected_linenumber,char * expected_errmsg)2346 utest_bad_format(char *filename, int testnumber, int expected_linenumber, char *expected_errmsg)
2347 {
2348 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
2349 ESL_MSAFILE *afp = NULL;
2350 int fmt = eslMSAFILE_STOCKHOLM;
2351 ESL_MSA *msa = NULL;
2352 int status;
2353
2354 if ( (status = esl_msafile_Open(&abc, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_fatal("stockholm bad format test %d failed: unexpected open failure", testnumber);
2355 if ( (status = esl_msafile_stockholm_Read(afp, &msa)) != eslEFORMAT) esl_fatal("stockholm bad format test %d failed: unexpected error code", testnumber);
2356 if (strstr(afp->errmsg, expected_errmsg) == NULL) esl_fatal("stockholm bad format test %d failed: unexpected errmsg", testnumber);
2357 if (afp->linenumber != expected_linenumber) esl_fatal("stockholm bad format test %d failed: unexpected linenumber", testnumber);
2358 esl_msafile_Close(afp);
2359 esl_alphabet_Destroy(abc);
2360 esl_msa_Destroy(msa);
2361 }
2362
2363
2364 static void
utest_good_format(ESL_ALPHABET ** byp_abc,int fmt,int expected_nseq,int64_t expected_alen,char * buf)2365 utest_good_format(ESL_ALPHABET **byp_abc, int fmt, int expected_nseq, int64_t expected_alen, char *buf)
2366 {
2367 char msg[] = "good format test failed";
2368 ESL_MSAFILE *afp = NULL;
2369 ESL_MSA *msa = NULL;
2370
2371 if (esl_msafile_OpenMem(byp_abc, buf, strlen(buf), fmt, NULL, &afp) != eslOK) esl_fatal(msg);
2372 if (esl_msafile_stockholm_Read(afp, &msa) != eslOK) esl_fatal(msg);
2373 if (msa->nseq != expected_nseq) esl_fatal(msg);
2374 if (msa->alen != expected_alen) esl_fatal(msg);
2375
2376 esl_msa_Destroy(msa);
2377 esl_msafile_Close(afp);
2378 }
2379
2380 static void
utest_identical_io(ESL_ALPHABET ** byp_abc,int fmt,char * buf)2381 utest_identical_io(ESL_ALPHABET **byp_abc, int fmt, char *buf)
2382 {
2383 char msg[] = "identical io test failed";
2384 char tmpfile1[32] = "esltmpXXXXXX";
2385 char tmpfile2[32] = "esltmpXXXXXX";
2386 FILE *fp = NULL;
2387 ESL_MSAFILE *afp = NULL;
2388 ESL_MSA *msa1 = NULL;
2389 ESL_MSA *msa2 = NULL;
2390
2391 if (esl_tmpfile_named(tmpfile1, &fp) != eslOK) esl_fatal(msg);
2392 fputs(buf, fp);
2393 fclose(fp);
2394
2395 if (esl_msafile_Open(byp_abc, tmpfile1, NULL, fmt, NULL, &afp) != eslOK) esl_fatal(msg);
2396 if (esl_msafile_stockholm_Read(afp, &msa1) != eslOK) esl_fatal(msg);
2397 esl_msafile_Close(afp);
2398
2399 if (esl_tmpfile_named(tmpfile2, &fp) != eslOK) esl_fatal(msg);
2400 if (esl_msafile_stockholm_Write(fp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
2401 fclose(fp);
2402
2403 if (esl_msafile_Open(byp_abc, tmpfile2, NULL, fmt, NULL, &afp) != eslOK) esl_fatal(msg);
2404 if (esl_msafile_stockholm_Read(afp, &msa2) != eslOK) esl_fatal(msg);
2405 esl_msafile_Close(afp);
2406
2407 if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
2408
2409 remove(tmpfile1);
2410 remove(tmpfile2);
2411 esl_msa_Destroy(msa1);
2412 esl_msa_Destroy(msa2);
2413 }
2414
2415 static void
utest_bad_open(ESL_ALPHABET ** byp_abc,int fmt,int expected_status,char * buf)2416 utest_bad_open(ESL_ALPHABET **byp_abc, int fmt, int expected_status, char *buf)
2417 {
2418 char msg[] = "bad open test failed";
2419 ESL_MSAFILE *afp = NULL;
2420
2421 if (esl_msafile_OpenMem(byp_abc, buf, strlen(buf), fmt, NULL, &afp) != expected_status) esl_fatal(msg);
2422 esl_msafile_Close(afp);
2423 }
2424
2425 static void
utest_bad_read(ESL_ALPHABET ** byp_abc,int fmt,char * expected_errmsg,int expected_line,char * buf)2426 utest_bad_read(ESL_ALPHABET **byp_abc, int fmt, char *expected_errmsg, int expected_line, char *buf)
2427 {
2428 char msg[] = "bad format test failed";
2429 ESL_MSAFILE *afp = NULL;
2430 ESL_MSA *msa = NULL;
2431
2432 if (esl_msafile_OpenMem(byp_abc, buf, strlen(buf), fmt, NULL, &afp) != eslOK) esl_fatal(msg);
2433 if (esl_msafile_stockholm_Read(afp, &msa) != eslEFORMAT) esl_fatal(msg);
2434 if (strstr(afp->errmsg, expected_errmsg) == NULL) esl_fatal(msg);
2435 if (afp->linenumber != expected_line) esl_fatal(msg);
2436
2437 esl_msa_Destroy(msa);
2438 esl_msafile_Close(afp);
2439 }
2440 #endif /*eslMSAFILE_STOCKHOLM_TESTDRIVE*/
2441 /*----------------- end, unit tests -----------------------------*/
2442
2443
2444
2445 /*****************************************************************
2446 * 7. Test driver.
2447 *****************************************************************/
2448 #ifdef eslMSAFILE_STOCKHOLM_TESTDRIVE
2449 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_stockholm_utest -DeslMSAFILE_STOCKHOLM_TESTDRIVE esl_msafile_stockholm.c -leasel -lm
2450 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_stockholm_utest -DeslMSAFILE_STOCKHOLM_TESTDRIVE esl_msafile_stockholm.c -leasel -lm
2451 * run: ./esl_msafile_stockholm_utest
2452 */
2453 #include "esl_config.h"
2454
2455 #include <stdio.h>
2456
2457 #include "easel.h"
2458 #include "esl_getopts.h"
2459 #include "esl_random.h"
2460 #include "esl_msafile.h"
2461 #include "esl_msafile_stockholm.h"
2462
2463 static ESL_OPTIONS options[] = {
2464 /* name type default env range togs reqs incomp help docgrp */
2465 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
2466 { 0,0,0,0,0,0,0,0,0,0},
2467 };
2468 static char usage[] = "[-options]";
2469 static char banner[] = "test driver for Stockholm/Xfam MSA format module";
2470
2471 int
main(int argc,char ** argv)2472 main(int argc, char **argv)
2473 {
2474 char msg[] = "Stockholm MSA i/o module test driver failed";
2475 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2476 int ngoodtests = 1;
2477 int nbadtests = 65;
2478 char tmpfile[32];
2479 FILE *ofp;
2480 int testnumber;
2481 int expected_alphatype;
2482 int expected_nseq;
2483 int expected_alen;
2484 int expected_linenumber;
2485 char expected_errmsg[eslERRBUFSIZE];
2486
2487 utest_bad_open(NULL, eslMSAFILE_UNKNOWN, eslENOFORMAT, "");
2488
2489 utest_bad_read(NULL, eslMSAFILE_UNKNOWN, "missing // terminator", 1, "# STOCKHOLM 1.0\n");
2490 utest_bad_read(NULL, eslMSAFILE_UNKNOWN, "no alignment data", 2, "# STOCKHOLM 1.0\n//\n");
2491
2492 utest_good_format(NULL, eslMSAFILE_UNKNOWN, 2, 10, "\n# STOCKHOLM 1.0\n\nseq1 ACDEFGHIKL\nseq2 ACDEFGHIKL\n\n//\n\n");
2493
2494 utest_identical_io(NULL, eslMSAFILE_UNKNOWN, "# STOCKHOLM 1.0\n\nseq1 ACDEFGHIKL\nseq2 ACDEFGHIKL\n//\n");
2495
2496 /* Various "good" files, that should be parsed correctly. */
2497 for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
2498 {
2499 strcpy(tmpfile, "esltmpXXXXXX");
2500 if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
2501 switch (testnumber) {
2502 case 1: utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
2503 }
2504 fclose(ofp);
2505 utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
2506 remove(tmpfile);
2507 }
2508
2509 /* Tests for all possible EFORMAT errors */
2510 for (testnumber = 1; testnumber <= nbadtests; testnumber++)
2511 {
2512 strcpy(tmpfile, "esltmpXXXXXX");
2513 if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
2514
2515 switch (testnumber) {
2516 case 1: utest_write_badformat1 (ofp, &expected_linenumber, expected_errmsg); break;
2517 case 2: utest_write_badformat2 (ofp, &expected_linenumber, expected_errmsg); break;
2518 case 3: utest_write_badformat3 (ofp, &expected_linenumber, expected_errmsg); break;
2519 case 4: utest_write_badformat4 (ofp, &expected_linenumber, expected_errmsg); break;
2520 case 5: utest_write_badformat5 (ofp, &expected_linenumber, expected_errmsg); break;
2521 case 6: utest_write_badformat6 (ofp, &expected_linenumber, expected_errmsg); break;
2522 case 7: utest_write_badformat7 (ofp, &expected_linenumber, expected_errmsg); break;
2523 case 8: utest_write_badformat8 (ofp, &expected_linenumber, expected_errmsg); break;
2524 case 9: utest_write_badformat9 (ofp, &expected_linenumber, expected_errmsg); break;
2525 case 10: utest_write_badformat10(ofp, &expected_linenumber, expected_errmsg); break;
2526 case 11: utest_write_badformat11(ofp, &expected_linenumber, expected_errmsg); break;
2527 case 12: utest_write_badformat12(ofp, &expected_linenumber, expected_errmsg); break;
2528 case 13: utest_write_badformat13(ofp, &expected_linenumber, expected_errmsg); break;
2529 case 14: utest_write_badformat14(ofp, &expected_linenumber, expected_errmsg); break;
2530 case 15: utest_write_badformat15(ofp, &expected_linenumber, expected_errmsg); break;
2531 case 16: utest_write_badformat16(ofp, &expected_linenumber, expected_errmsg); break;
2532 case 17: utest_write_badformat17(ofp, &expected_linenumber, expected_errmsg); break;
2533 case 18: utest_write_badformat18(ofp, &expected_linenumber, expected_errmsg); break;
2534 case 19: utest_write_badformat19(ofp, &expected_linenumber, expected_errmsg); break;
2535 case 20: utest_write_badformat20(ofp, &expected_linenumber, expected_errmsg); break;
2536 case 21: utest_write_badformat21(ofp, &expected_linenumber, expected_errmsg); break;
2537 case 22: utest_write_badformat22(ofp, &expected_linenumber, expected_errmsg); break;
2538 case 23: utest_write_badformat23(ofp, &expected_linenumber, expected_errmsg); break;
2539 case 24: utest_write_badformat24(ofp, &expected_linenumber, expected_errmsg); break;
2540 case 25: utest_write_badformat25(ofp, &expected_linenumber, expected_errmsg); break;
2541 case 26: utest_write_badformat26(ofp, &expected_linenumber, expected_errmsg); break;
2542 case 27: utest_write_badformat27(ofp, &expected_linenumber, expected_errmsg); break;
2543 case 28: utest_write_badformat28(ofp, &expected_linenumber, expected_errmsg); break;
2544 case 29: utest_write_badformat29(ofp, &expected_linenumber, expected_errmsg); break;
2545 case 30: utest_write_badformat30(ofp, &expected_linenumber, expected_errmsg); break;
2546 case 31: utest_write_badformat31(ofp, &expected_linenumber, expected_errmsg); break;
2547 case 32: utest_write_badformat32(ofp, &expected_linenumber, expected_errmsg); break;
2548 case 33: utest_write_badformat33(ofp, &expected_linenumber, expected_errmsg); break;
2549 case 34: utest_write_badformat34(ofp, &expected_linenumber, expected_errmsg); break;
2550 case 35: utest_write_badformat35(ofp, &expected_linenumber, expected_errmsg); break;
2551 case 36: utest_write_badformat36(ofp, &expected_linenumber, expected_errmsg); break;
2552 case 37: utest_write_badformat37(ofp, &expected_linenumber, expected_errmsg); break;
2553 case 38: utest_write_badformat38(ofp, &expected_linenumber, expected_errmsg); break;
2554 case 39: utest_write_badformat39(ofp, &expected_linenumber, expected_errmsg); break;
2555 case 40: utest_write_badformat40(ofp, &expected_linenumber, expected_errmsg); break;
2556 case 41: utest_write_badformat41(ofp, &expected_linenumber, expected_errmsg); break;
2557 case 42: utest_write_badformat42(ofp, &expected_linenumber, expected_errmsg); break;
2558 case 43: utest_write_badformat43(ofp, &expected_linenumber, expected_errmsg); break;
2559 case 44: utest_write_badformat44(ofp, &expected_linenumber, expected_errmsg); break;
2560 case 45: utest_write_badformat45(ofp, &expected_linenumber, expected_errmsg); break;
2561 case 46: utest_write_badformat46(ofp, &expected_linenumber, expected_errmsg); break;
2562 case 47: utest_write_badformat47(ofp, &expected_linenumber, expected_errmsg); break;
2563 case 48: utest_write_badformat48(ofp, &expected_linenumber, expected_errmsg); break;
2564 case 49: utest_write_badformat49(ofp, &expected_linenumber, expected_errmsg); break;
2565 case 50: utest_write_badformat50(ofp, &expected_linenumber, expected_errmsg); break;
2566 case 51: utest_write_badformat51(ofp, &expected_linenumber, expected_errmsg); break;
2567 case 52: utest_write_badformat52(ofp, &expected_linenumber, expected_errmsg); break;
2568 case 53: utest_write_badformat53(ofp, &expected_linenumber, expected_errmsg); break;
2569 case 54: utest_write_badformat54(ofp, &expected_linenumber, expected_errmsg); break;
2570 case 55: utest_write_badformat55(ofp, &expected_linenumber, expected_errmsg); break;
2571 case 56: utest_write_badformat56(ofp, &expected_linenumber, expected_errmsg); break;
2572 case 57: utest_write_badformat57(ofp, &expected_linenumber, expected_errmsg); break;
2573 case 58: utest_write_badformat58(ofp, &expected_linenumber, expected_errmsg); break;
2574 case 59: utest_write_badformat59(ofp, &expected_linenumber, expected_errmsg); break;
2575 case 60: utest_write_badformat60(ofp, &expected_linenumber, expected_errmsg); break;
2576 case 61: utest_write_badformat61(ofp, &expected_linenumber, expected_errmsg); break;
2577 case 62: utest_write_badformat62(ofp, &expected_linenumber, expected_errmsg); break;
2578 case 63: utest_write_badformat63(ofp, &expected_linenumber, expected_errmsg); break;
2579 case 64: utest_write_badformat64(ofp, &expected_linenumber, expected_errmsg); break;
2580 case 65: utest_write_badformat65(ofp, &expected_linenumber, expected_errmsg); break;
2581 }
2582 fclose(ofp);
2583
2584 utest_bad_format(tmpfile, testnumber, expected_linenumber, expected_errmsg);
2585 remove(tmpfile);
2586 }
2587
2588 esl_getopts_Destroy(go);
2589 return 0;
2590 }
2591 #endif /*eslMSAFILE_STOCKHOLM_TESTDRIVE*/
2592 /*---------------- end, test driver -----------------------------*/
2593
2594
2595 /*****************************************************************
2596 * 8. Examples.
2597 *****************************************************************/
2598
2599 #ifdef eslMSAFILE_STOCKHOLM_EXAMPLE
2600 /* A full-featured example of reading/writing MSA(s) in Stockholm format.
2601 gcc -g -Wall -o esl_msafile_stockholm_example -I. -L. -DeslMSAFILE_STOCKHOLM_EXAMPLE esl_msafile_stockholm.c -leasel -lm
2602 ./esl_msafile_stockholm_example <msafile>
2603 */
2604 /*::cexcerpt::msafile_stockholm_example::begin::*/
2605 #include <stdio.h>
2606
2607 #include "easel.h"
2608 #include "esl_alphabet.h"
2609 #include "esl_getopts.h"
2610 #include "esl_msa.h"
2611 #include "esl_msafile.h"
2612 #include "esl_msafile_stockholm.h"
2613
2614 static ESL_OPTIONS options[] = {
2615 /* name type default env range toggles reqs incomp help docgroup*/
2616 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
2617 { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "override autodetection; force Stockholm format", 0 },
2618 { "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
2619 { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
2620 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
2621 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
2622 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
2623 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2624 };
2625 static char usage[] = "[-options] <msafile>";
2626 static char banner[] = "example of guessing, reading, writing Stockholm format";
2627
2628 int
main(int argc,char ** argv)2629 main(int argc, char **argv)
2630 {
2631 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
2632 char *filename = esl_opt_GetArg(go, 1);
2633 int infmt = eslMSAFILE_UNKNOWN;
2634 ESL_ALPHABET *abc = NULL;
2635 ESL_MSAFILE *afp = NULL;
2636 ESL_MSA *msa = NULL;
2637 int status;
2638
2639 if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_STOCKHOLM; /* override format autodetection */
2640
2641 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
2642 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
2643 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
2644
2645 /* Text mode: pass NULL for alphabet.
2646 * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
2647 */
2648 if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
2649 else status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
2650 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
2651
2652 while ( (status = esl_msafile_stockholm_Read(afp, &msa)) == eslOK)
2653 {
2654 printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
2655 printf("# of seqs: %d\n", msa->nseq);
2656 printf("# of cols: %d\n", (int) msa->alen);
2657 printf("\n");
2658
2659 if (! esl_opt_GetBoolean(go, "-q"))
2660 esl_msafile_stockholm_Write(stdout, msa, eslMSAFILE_STOCKHOLM);
2661
2662 esl_msa_Destroy(msa);
2663 }
2664 if (status != eslEOF) esl_msafile_ReadFailure(afp, status);
2665
2666 esl_msafile_Close(afp);
2667 if (abc) esl_alphabet_Destroy(abc);
2668 esl_getopts_Destroy(go);
2669 exit(0);
2670 }
2671 /*::cexcerpt::msafile_stockholm_example::end::*/
2672 #endif /*eslMSAFILE_STOCKHOLM_EXAMPLE*/
2673
2674
2675
2676 #ifdef eslMSAFILE_STOCKHOLM_EXAMPLE2
2677 /* A minimal example. Read Stockholm MSAs, in text mode.
2678 gcc -g -Wall -o esl_msafile_stockholm_example2 -I. -L. -DeslMSAFILE_STOCKHOLM_EXAMPLE2 esl_msafile_stockholm.c -leasel -lm
2679 ./esl_msafile_stockholm_example2 <msafile>
2680 */
2681
2682 /*::cexcerpt::msafile_stockholm_example::begin::*/
2683 #include <stdio.h>
2684
2685 #include "easel.h"
2686 #include "esl_msa.h"
2687 #include "esl_msafile.h"
2688 #include "esl_msafile_stockholm.h"
2689
2690 int
main(int argc,char ** argv)2691 main(int argc, char **argv)
2692 {
2693 char *filename = argv[1];
2694 int infmt = eslMSAFILE_STOCKHOLM;
2695 ESL_MSAFILE *afp = NULL;
2696 ESL_MSA *msa = NULL;
2697 int status;
2698
2699 if ( (status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp)) != eslOK)
2700 esl_msafile_OpenFailure(afp, status);
2701
2702 while ( (status = esl_msafile_stockholm_Read(afp, &msa)) == eslOK)
2703 {
2704 printf("%15s: %6d seqs, %5d columns\n\n", msa->name, msa->nseq, (int) msa->alen);
2705 esl_msafile_stockholm_Write(stdout, msa, eslMSAFILE_STOCKHOLM);
2706 esl_msa_Destroy(msa);
2707 }
2708 if (status != eslEOF) esl_msafile_ReadFailure(afp, status);
2709
2710 esl_msafile_Close(afp);
2711 exit(0);
2712 }
2713 /*::cexcerpt::msafile_stockholm_example2::end::*/
2714 #endif /*eslMSAFILE_STOCKHOLM_EXAMPLE2*/
2715 /*--------------------- end of example --------------------------*/
2716