1 /* I/O of multiple sequence alignments in A2M format (UCSC SAM)
2  *
3  * Contents:
4  *   1. API for reading/writing A2M format
5  *   2. Internal functions used by the A2M parser
6  *   3. Unit tests.
7  *   4. Test driver.
8  *   5. Examples.
9  *
10  * Reference:
11  *   http://compbio.soe.ucsc.edu/a2m-desc.html
12  */
13 #include "esl_config.h"
14 
15 #include <stdio.h>
16 #include <string.h>
17 
18 #include "easel.h"
19 #include "esl_alphabet.h"
20 #include "esl_mem.h"
21 #include "esl_msa.h"
22 #include "esl_msafile.h"
23 
24 #include "esl_msafile_a2m.h"
25 
26 static int a2m_padding_digital(ESL_MSA *msa, char **csflag, int *nins, int ncons);
27 static int a2m_padding_text   (ESL_MSA *msa, char **csflag, int *nins, int ncons);
28 
29 /*****************************************************************
30  *# 1. API for reading/writing A2M format
31  *****************************************************************/
32 
33 
34 /* Function:  esl_msafile_a2m_SetInmap()
35  * Synopsis:  Set input map specific for A2M format
36  *
37  * Purpose:   Set the <afp->inmap> for A2M format.
38  *
39  *            A2M ignores whitespace and periods (and ignoring
40  *            periods makes us agnostic whether the input is
41  *            "dotless" format or not). Make ' ', '\t', and
42  *            '.' ignored.
43  *
44  *            A2M format only allows - for a gap, so make
45  *            all other Easel gap characters illegal on input.
46  *
47  *            A2M format handles an 'O' specially: this indicates
48  *            a FIM (free insertion module) to the SAM software.
49  *            We ignore it.
50 
51  *            A2M allows ACDEFGHIKLMNPQRSTVWY for aa, plus XBZ.
52  *            Unknown letters (including other ambig codes) are mapped
53  *            to X.  A2M allows ACGTU for nucleic, plus YRN.  Unknown
54  *            letters (including other ambig codes) are mapped to N.
55  *            However, Easel enforces its normal input restrictions on
56  *            residues: digital bioalphabets allow only valid residue
57  *            symbols, and text mode allows any isalpha() character
58  *            verbatim.
59  */
60 int
esl_msafile_a2m_SetInmap(ESL_MSAFILE * afp)61 esl_msafile_a2m_SetInmap(ESL_MSAFILE *afp)
62 {
63   int sym;
64 
65   if (afp->abc)
66     {
67       for (sym = 0; sym < 128; sym++)
68 	afp->inmap[sym] = afp->abc->inmap[sym];
69       afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
70       afp->inmap['_']  = eslDSQ_ILLEGAL;
71       afp->inmap['*']  = eslDSQ_ILLEGAL;
72       afp->inmap['~']  = eslDSQ_ILLEGAL;
73     }
74 
75   if (! afp->abc)
76     {
77       for (sym = 1; sym < 128; sym++)
78 	afp->inmap[sym] = (isalpha(sym) ? sym : eslDSQ_ILLEGAL);
79       afp->inmap[0]   = '?';
80       afp->inmap['-'] = '-';
81     }
82 
83   afp->inmap[' ']  = eslDSQ_IGNORED;
84   afp->inmap['\t'] = eslDSQ_IGNORED;
85   afp->inmap['.']  = eslDSQ_IGNORED;
86   afp->inmap['O']  = eslDSQ_IGNORED;
87   afp->inmap['o']  = eslDSQ_IGNORED;
88   return eslOK;
89 }
90 
91 /* Function:  esl_msafile_a2m_GuessAlphabet()
92  * Synopsis:  Guess the alphabet of an open A2M MSA file.
93  *
94  * Purpose:   Guess the alpbabet of the sequences in open
95  *            A2M format MSA file <afp>.
96  *
97  *            On a normal return, <*ret_type> is set to <eslDNA>,
98  *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
99  *            original position.
100  *
101  * Args:      afp      - open A2M format MSA file
102  *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
103  *
104  * Returns:   <eslOK> on success.
105  *            <eslENOALPHABET> if alphabet type can't be determined.
106  *            In either case, <afp> is rewound to the position it
107  *            started at.
108  *
109  * Throws:    <eslEMEM> on allocation error.
110  *            <eslESYS> on failures of fread() or other system calls
111  */
112 int
esl_msafile_a2m_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)113 esl_msafile_a2m_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
114 {
115   int       alphatype     = eslUNKNOWN;
116   esl_pos_t anchor        = -1;
117   int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
118   int       nsteps        = 3;
119   int       step          = 0;
120   int       nres          = 0;
121   int       x;
122   int64_t   ct[26];
123   char     *p;
124   esl_pos_t n, pos;
125   int       status;
126 
127   for (x = 0; x < 26; x++) ct[x] = 0;
128 
129   anchor = esl_buffer_GetOffset(afp->bf);
130   if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
131 
132   while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
133     {
134       while (n && isspace(*p)) { p++; n--; }
135       if    (!n || *p == '>') continue;
136 
137       for (pos = 0; pos < n; pos++)
138 	if (isalpha(p[pos])) {
139 	  x = toupper(p[pos]) - 'A';
140 	  ct[x]++;
141 	  nres++;
142 	}
143 
144       /* try to stop early, checking after 500, 5000, and 50000 residues: */
145       if (step < nsteps && nres > threshold[step]) {
146 	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
147 	step++;
148       }
149     }
150   if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
151   status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
152 
153   /* deliberate flowthrough...*/
154  DONE:
155   esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
156   esl_buffer_RaiseAnchor(afp->bf, anchor);
157   *ret_type = alphatype;
158   return status;
159 
160  ERROR:
161   if (anchor != -1) {
162     esl_buffer_SetOffset(afp->bf, anchor);
163     esl_buffer_RaiseAnchor(afp->bf, anchor);
164   }
165   *ret_type = eslUNKNOWN;
166   return status;
167 }
168 
169 
170 /* Function:  esl_msafile_a2m_Read()
171  * Synopsis:  Read a UCSC A2M format alignment.
172  *
173  * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
174  *            for UCSC A2M (SAM) format. Create a new MSA,
175  *            and return a ptr to it in <*ret_msa>. Caller is responsible
176  *            for freeing this <ESL_MSA>.
177  *
178  *            The <msa> has a reference line (<msa->rf[]>) that
179  *            corresponds to the uppercase/lowercase columns in the
180  *            alignment: consensus (uppercase) columns are marked 'X',
181  *            and insert (lowercase) columns are marked '.' in the RF
182  *            annotation line.
183  *
184  *            This input parser can deal both with "dotless" A2M, and
185  *            full A2M format with dots.
186  *
187  * Args:      afp     - open <ESL_MSAFILE>
188  *            ret_msa - RETURN: newly parsed <ESL_MSA>
189  *
190  * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
191  *            allocated MSA, and <afp> is at EOF.
192  *
193  *            <eslEOF> if no (more) alignment data are found in
194  *            <afp>, and <afp> is returned at EOF.
195  *
196  *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
197  *            <NULL>. <afp> contains information sufficient for
198  *            constructing useful diagnostic output:
199  *            | <afp->errmsg>       | user-directed error message     |
200  *            | <afp->linenumber>   | line # where error was detected |
201  *            | <afp->line>         | offending line (not NUL-term)   |
202  *            | <afp->n>            | length of offending line        |
203  *            | <afp->bf->filename> | name of the file                |
204  *            and <afp> is poised at the start of the following line,
205  *            so (in principle) the caller could try to resume
206  *            parsing.
207  *
208  * Throws:    <eslEMEM> - an allocation failed.
209  *            <eslESYS> - a system call such as fread() failed
210  *            <eslEINCONCEIVABLE> - "impossible" corruption
211  *            On these, <*ret_msa> is returned <NULL>, and the state of
212  *            <afp> is undefined.
213  */
214 int
esl_msafile_a2m_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)215 esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
216 {
217   ESL_MSA  *msa        = NULL;
218   char    **csflag     = NULL;	/* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus   */
219   int      *nins       = NULL;	/* # of inserted residues before each consensus col [0..ncons-1]    */
220   int      *this_nins  = NULL;	/* # of inserted residues before each consensus residue in this seq */
221   int       nseq       = 0;
222   int       ncons      = 0;
223   int       idx;
224   int64_t   thislen;
225   int64_t   spos;
226   int       this_ncons;
227   int       cpos, bpos;
228   char     *p, *tok;
229   esl_pos_t n,  toklen;
230   int       status;
231 
232   ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );
233 
234   afp->errmsg[0] = '\0';
235 
236   if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
237   if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
238   ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
239   for (idx = 0; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
240 
241   /* skip leading blank lines in file */
242   while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
243   if      (status != eslOK) goto ERROR; /* includes normal EOF */
244 
245   /* tolerate sloppy space at start of name/desc line */
246   while (n && isspace(*p)) { p++; n--; }
247   if (*p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");
248 
249   do {	/* for each record starting in '>': */
250     p++; n--; 			/* advance past > */
251 
252     if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen))   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");
253     if (nseq >= msa->sqalloc) {
254       int old_sqalloc = msa->sqalloc;
255       if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
256       ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
257       for (idx = old_sqalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
258     }
259 
260     if (     (status = esl_msa_SetSeqName       (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
261     if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p,   n))      != eslOK) goto ERROR;
262 
263     /* now for each sequence line... */
264     thislen = 0;		/* count of lowercase, uppercase, and '-': w/o dots, on first pass */
265     this_ncons = 0;		/* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
266     if (! this_nins)                      // Starting w/ first sequence, we need a wee initial alloc. Also nseq == 0, ncons == 0 ...
267       ESL_ALLOC(this_nins, sizeof(int) * 1);
268     for (cpos = 0; cpos <= ncons; cpos++) // A little tricksy. On the first sequence, ncons == 0, so this initializes just [0].
269       this_nins[cpos] = 0;
270 
271     while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
272       {
273 	while (n && isspace(*p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
274 	if (n  == 0)   continue;	       /* tolerate and skip blank lines */
275 	if (*p == '>') break;
276 
277 	ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
278 	if (nseq == 0) {
279 	  ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
280 	  for (cpos = this_ncons+1; cpos <= this_ncons+n; cpos++)    // DO NOT zero [this_ncons]; we may still be in an insert from prev line.
281 	    this_nins[cpos] = 0;
282 	}
283 
284 	for (spos = thislen, bpos = 0; bpos < n; bpos++)
285 	  {
286 	    if      (p[bpos] == 'O')   continue;
287 	    else if (isupper(p[bpos])) { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
288 	    else if (islower(p[bpos])) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
289 	    else if (p[bpos] == '-')   { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
290 	    if (ncons && this_ncons > ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg,  "unexpected # of consensus residues, didn't match previous seq(s)");
291 	  }
292 	csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */
293 
294 	if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]),   &thislen, p, n); }
295 	if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
296 	if (status == eslEINVAL)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
297 	else if (status != eslOK)  goto ERROR;
298 	ESL_DASSERT1( (spos == thislen) );
299       }
300     if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
301     /* status == OK: then *p == '>'. status == eslEOF: we're eof.  status == anything else: error */
302     /* Finished reading a sequence record. */
303 
304     /* Edge case: it's possible to have a sequence of length 0 residues in the alignment,
305      * and that causes the loop structure above to exit before allocating csflag[] or ax[]
306      */
307     if (thislen == 0)
308       {
309         ESL_ALLOC(csflag[nseq], sizeof(char) * 1);
310         csflag[nseq][0] = TRUE;                       // csflag[] needs a sentinel even if there's no seq; make one.
311         ESL_ALLOC(msa->ax[nseq], sizeof(ESL_DSQ) * 2);
312         msa->ax[nseq][0] = msa->ax[nseq][1] = eslDSQ_SENTINEL;
313       }
314 
315     if (nseq == 0)
316       {
317 	ncons = this_ncons;
318 	ESL_ALLOC(nins, sizeof(int) * (ncons+1));
319 	for (cpos = 0; cpos <= ncons; cpos++)
320 	  nins[cpos] = this_nins[cpos];
321       }
322     else
323       {
324 	if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s). (Do you have an O residue?)");
325 	for (cpos = 0; cpos <= ncons; cpos++)
326 	  nins[cpos]      = ESL_MAX(nins[cpos], this_nins[cpos]);
327       }
328     nseq++;
329   } while (status == eslOK);
330 
331   /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
332    * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
333    * nins[0..ncons] tells us the max number of inserted residues before each consensus column
334    * This is sufficient information to reconstruct each aligned sequence.
335    */
336   msa->nseq = nseq;
337   if (msa->abc)  { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
338   if (!msa->abc) { if ((status = a2m_padding_text   (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
339 
340   if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
341 
342   *ret_msa  = msa;
343   free(nins);
344   free(this_nins);
345   for (idx = 0; idx < msa->nseq; idx++) free(csflag[idx]);
346   free(csflag);
347   return eslOK;
348 
349  ERROR:
350   if (nins)      free(nins);
351   if (this_nins) free(this_nins);
352   if (csflag) {
353     for (idx = 0; idx < msa->nseq; idx++)
354       if (csflag[idx]) free(csflag[idx]);
355     free(csflag);
356   }
357   if (msa) esl_msa_Destroy(msa);
358   return status;
359 }
360 
361 
362 /* Function:  esl_msafile_a2m_Write()
363  * Synopsis:  Write an A2M (UCSC SAM) dotless format alignment to a stream.
364  *
365  * Purpose:   Write alignment <msa> in dotless UCSC A2M format to a
366  *            stream <fp>.
367  *
368  *            The <msa> should have a valid reference line <msa->rf>,
369  *            with alphanumeric characters marking consensus (match)
370  *            columns, and non-alphanumeric characters marking
371  *            nonconsensus (insert) columns. If it does not,
372  *            then as a fallback, the first sequence in the alignment is
373  *            considered to be the consensus.
374  *
375  *            In "dotless" A2M format, gap characters (.) in insert
376  *            columns are omitted; therefore sequences can be of
377  *            different lengths, but each sequence has the same number
378  *            of consensus columns (residue or -).
379  *
380  *            A2M format cannot represent missing data symbols
381  *            (Easel's ~). Any missing data symbols are converted to
382  *            gaps.
383  *
384  *            A2M format cannot represent pyrrolysine residues in
385  *            amino acid sequences, because it treats 'O' symbols
386  *            specially, as indicating a position at which a
387  *            free-insertion module (FIM) should be created. Any 'O'
388  *            in the <msa> is written instead as an unknown
389  *            residue ('X', in protein sequences).
390  *
391  * Args:      fp  - open output stream
392  *            msa - MSA to write
393  *
394  * Returns:   <eslOK> on success.
395  *
396  * Throws:    <eslEMEM> on allocation failure.
397  *            <eslEWRITE> on any system write error, such as filled disk.
398  */
399 int
esl_msafile_a2m_Write(FILE * fp,const ESL_MSA * msa)400 esl_msafile_a2m_Write(FILE *fp, const ESL_MSA *msa)
401 {
402   char   *buf = NULL;
403   int     cpl = 60;
404   int     bpos;
405   int     pos;
406   int     is_consensus;
407   int     is_residue;
408   int     do_dotless = TRUE;	/* just changing this to FALSE makes it write dots too */
409   int     i;
410   int     sym;
411   int     status;
412 
413   ESL_ALLOC(buf, sizeof(char) * (cpl+1));
414 
415   for (i = 0; i < msa->nseq; i++)
416     {
417       /* Construct the name/description line */
418       if (fprintf(fp, ">%s", msa->sqname[i])                                                      < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed");
419       if (msa->sqacc  != NULL && msa->sqacc[i]  != NULL) { if (fprintf(fp, " %s", msa->sqacc[i])  < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); }
420       if (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) { if (fprintf(fp, " %s", msa->sqdesc[i]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); }
421       if (fputc('\n', fp)                                                                         < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed");
422 
423       if (msa->abc)
424 	{
425 	  pos = 0;
426 	  while (pos < msa->alen)
427 	    {
428 	      for (bpos = 0; pos < msa->alen && bpos < cpl; pos++)
429 		{
430 		  sym          = msa->abc->sym[msa->ax[i][pos+1]]; /* note off-by-one in digitized aseq: 1..alen */
431 		  is_residue   = esl_abc_XIsResidue(msa->abc, msa->ax[i][pos+1]);
432 		  if (msa->rf) is_consensus = (isalnum(msa->rf[pos]) ? TRUE : FALSE);
433 		  else         is_consensus = (esl_abc_XIsResidue(msa->abc, msa->ax[0][pos+1]) ? TRUE : FALSE);
434 
435 		  if (sym == 'O') sym = esl_abc_CGetUnknown(msa->abc); /* watch out: O means "insert a FIM" in a2m format, not pyrrolysine */
436 
437 		  if      (is_consensus) { buf[bpos++] = (is_residue ? toupper(sym) : '-'); }
438 		  else if (is_residue)   { buf[bpos++] = tolower(sym); }
439 		  else if (! do_dotless) { buf[bpos++] = '.'; }
440 		}
441 	      buf[bpos] = '\0';
442 	      if (bpos) { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed");}
443 	    }
444 	}
445 
446       if (! msa->abc)
447 	{
448 	  pos = 0;
449 	  while (pos < msa->alen)
450 	    {
451 	      for (bpos = 0; pos < msa->alen && bpos < cpl; pos++)
452 		{
453 		  sym          = msa->aseq[i][pos];
454 		  is_residue   = isalpha(msa->aseq[i][pos]);
455 		  if (msa->rf) is_consensus = (isalnum(msa->rf[pos]) ? TRUE : FALSE);
456 		  else         is_consensus = (isalnum(msa->aseq[0][pos]) ? TRUE : FALSE);
457 
458 		  if (sym == 'O') sym = 'X';
459 
460 		  if      (is_consensus) { buf[bpos++] = ( is_residue ? toupper(sym) : '-'); }
461 		  else if (is_residue)   { buf[bpos++] = tolower(sym); }
462 		  else if (! do_dotless) { buf[bpos++] = '.'; }
463 
464 		}
465 	      buf[bpos] = '\0';
466 	      if (bpos) { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "a2m msa file write failed"); }
467 	    }
468 	}
469     } /* end, loop over sequences in the MSA */
470 
471   free(buf);
472   return eslOK;
473 
474  ERROR:
475   if (buf) free(buf);
476   return status;
477 }
478 /*------------- end, API for i/o of a2m format ------------------*/
479 
480 
481 /*****************************************************************
482  * 2. Internal functions used by the A2M parser
483  *****************************************************************/
484 
485 /* A2M parser has an input phase, followed by an alignment padding phase.
486  * The a2m_padding_{digital,text} functions do the padding phase.
487  *
488  * Upon call:
489  *   msa->nseq is set;
490  *   msa->ax[0..nseq-1][1..slen] are unaligned seqs (consensus cols +
491  *   inserted residues); or msa->aseq[0..nseq-1][0..slen-1], for text mode
492  *   csflag[0..nseq-1][0..slen-1] is TRUE/FALSE for whether each pos
493  *   in msa->ax[][1..slen]/msa->aseq[][0..slen-1] is consensus or insert
494  *   nins[0..ncons] is the number of insert columns preceding each consensus column
495  *
496  * watch out, ax[] is a digital sequence, 1..alen not 0..alen-1: hence
497  * the [apos+1], [spos+1] indexing
498  *
499  * these functions may not fail with any normal (eslEFORMAT) error, because
500  * we wouldn't be able to tell the line/linenumber of the error
501  *
502  * Upon successful return:
503  *  msa->alen is set
504  *  all msa->ax[]/msa->aseq are now aligned digital sequences
505  *  msa->rf is set
506  */
507 static int
a2m_padding_digital(ESL_MSA * msa,char ** csflag,int * nins,int ncons)508 a2m_padding_digital(ESL_MSA *msa, char **csflag, int *nins, int ncons)
509 {
510   ESL_DSQ *ax     = NULL;		/* new aligned sequence - will be swapped into msa->ax[] */
511   ESL_DSQ  gapsym = esl_abc_XGetGap(msa->abc);
512   int      apos, cpos, spos;	/* position counters for alignment 0..alen, consensus cols 0..cpos-1, sequence position 0..slen-1 */
513   int      alen;
514   int      icount;
515   int      idx;
516   int      status;
517 
518   alen = ncons;
519   for (cpos = 0; cpos <= ncons; cpos++)
520     alen += nins[cpos];
521 
522   ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
523   for (apos = 0, cpos = 0; cpos <= ncons; cpos++)
524     {
525       for (icount = 0; icount < nins[cpos]; icount++) msa->rf[apos++] = '.';
526       if  (cpos < ncons) msa->rf[apos++] = 'x';
527     }
528   msa->rf[apos] = '\0';
529 
530   for (idx = 0; idx < msa->nseq; idx++)
531     {
532       ESL_ALLOC(ax, sizeof(ESL_DSQ) * (alen + 2));
533       ax[0] = eslDSQ_SENTINEL;
534       apos = spos  = 0;
535       for (cpos = 0; cpos <= ncons; cpos++)
536 	{
537 	  icount = 0;
538 	  while (csflag[idx][spos] == FALSE)  { ax[apos+1] = msa->ax[idx][spos+1]; apos++; spos++; icount++; }
539 	  while (icount < nins[cpos]) 	      { ax[apos+1] = gapsym;               apos++;         icount++; }
540 	  if (cpos < ncons)                   { ax[apos+1] = msa->ax[idx][spos+1]; apos++; spos++;           }
541 	}
542       ESL_DASSERT1( (msa->ax[idx][spos+1] == eslDSQ_SENTINEL) );
543       ESL_DASSERT1( (apos == alen) );
544       ax[alen+1] = eslDSQ_SENTINEL;
545       free(msa->ax[idx]);
546       msa->ax[idx] = ax;
547       ax = NULL;
548     }
549   msa->alen = alen;
550 
551 
552 
553   return eslOK;
554 
555  ERROR:
556   if (ax) free(ax);
557   return status;
558 }
559 
560 
561 static int
a2m_padding_text(ESL_MSA * msa,char ** csflag,int * nins,int ncons)562 a2m_padding_text(ESL_MSA *msa, char **csflag, int *nins, int ncons)
563 {
564   char   *aseq = NULL;		/* new aligned sequence - will be swapped into msa->aseq[] */
565   int     apos, cpos, spos;	/* position counters for alignment 0..alen, consensus cols 0..cpos-1, sequence position 0..slen-1 */
566   int     alen;
567   int     icount;
568   int     idx;
569   int     status;
570 
571   alen = ncons;
572   for (cpos = 0; cpos <= ncons; cpos++)
573     alen += nins[cpos];
574 
575   ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
576   for (apos = 0, cpos = 0; cpos <= ncons; cpos++)
577     {
578       for (icount = 0; icount < nins[cpos]; icount++) msa->rf[apos++] = '.';
579       if  (cpos < ncons) msa->rf[apos++] = 'x';
580     }
581   msa->rf[apos] = '\0';
582 
583   for (idx = 0; idx < msa->nseq; idx++)
584     {
585       ESL_ALLOC(aseq, sizeof(char) * (alen + 1));
586       apos = spos  = 0;
587       for (cpos = 0; cpos <= ncons; cpos++)
588 	{
589 	  icount = 0;
590 	  while (csflag[idx][spos] == FALSE)  { aseq[apos] = msa->aseq[idx][spos]; apos++; spos++; icount++; }
591 	  while (icount < nins[cpos]) 	      { aseq[apos] = '.';                  apos++;         icount++; }
592 	  if (cpos < ncons) 	              { aseq[apos] = msa->aseq[idx][spos]; apos++; spos++;           }
593 	}
594       ESL_DASSERT1( (msa->aseq[idx][spos] == '\0') );
595       ESL_DASSERT1( (apos == alen) );
596       aseq[alen] = '\0';
597       free(msa->aseq[idx]);
598       msa->aseq[idx] = aseq;
599       aseq = NULL;
600     }
601   msa->alen = alen;
602   return eslOK;
603 
604  ERROR:
605   if (aseq) free(aseq);
606   return status;
607 }
608 /*---------- end, internal functions for the parser -------------*/
609 
610 
611 /*****************************************************************
612  * 3. Unit tests.
613  *****************************************************************/
614 #ifdef eslMSAFILE_A2M_TESTDRIVE
615 
616 #include "esl_random.h"
617 
618 /* A2M is unable to write O (pyrrolysine) residues, because it uses O
619  * to mean a free insertion module. For unit tests that generate
620  * dirty/sampled MSAs, we have to avoid O's in those alignments.
621  */
622 static void
a2m_no_O(ESL_MSA * msa)623 a2m_no_O(ESL_MSA *msa)
624 {
625   int idx;
626   int apos;
627   int symO;
628 
629   ESL_DASSERT1(( msa->flags & eslMSA_DIGITAL ));
630   ESL_DASSERT1(( msa->abc != NULL ));
631 
632   symO = esl_abc_DigitizeSymbol(msa->abc, 'O');
633 
634   for (idx = 0; idx < msa->nseq; idx++)
635     for (apos = 1; apos <= msa->alen; apos++)
636       if (msa->ax[idx][apos] == symO) msa->ax[idx][apos] = esl_abc_XGetUnknown(msa->abc);
637 }
638 
639 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)640 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
641 {
642   fputs(">MYG_PHYCA\n", ofp);
643   fputs("VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTEAEMKASED\n", ofp);
644   fputs("LKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHP\n", ofp);
645   fputs("GDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
646   fputs(">GLB5_PETMA\n", ofp);
647   fputs("pivdtgsvApLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
648   fputs("ADQLKKSADVRWHAERIINAVNDAVASMDDtekMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
649   fputs("AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
650   fputs(">HBB_HUMAN\n", ofp);
651   fputs("VhLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNP\n", ofp);
652   fputs("KVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHF\n", ofp);
653   fputs("GKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
654   fputs(">HBA_HUMAN\n", ofp);
655   fputs("VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF------DLSHGSAQ\n", ofp);
656   fputs("VKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLP\n", ofp);
657   fputs("AEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
658 
659   *ret_alphatype = eslAMINO;
660   *ret_nseq      = 4;
661   *ret_alen      = 165;
662 }
663 
664 static void
utest_write_good2(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)665 utest_write_good2(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
666 {
667   fputs(">tRNA2\n", ofp);
668   fputs("UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGGAGACCGGGGUUCGACUC\n", ofp);
669   fputs("CCCGUAUCGGAG\n", ofp);
670   fputs(">tRNA3\n", ofp);
671   fputs("UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCcAGAUCGGGGUUCAAUU\n", ofp);
672   fputs("CCCCGUCGCGGAG\n", ofp);
673   fputs(">tRNA5\n", ofp);
674   fputs("GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGaGGUCAUCGGUUCGAUU\n", ofp);
675   fputs("CCGGUUGCGUCCA\n", ofp);
676   fputs(">tRNA1\n", ofp);
677   fputs("GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGaGGUCCUGUGUUCGAUC\n", ofp);
678   fputs("CACAGAAUUCGCA\n", ofp);
679   fputs(">tRNA4\n", ofp);
680   fputs("GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUuGGUCCUUAGUUCGAUC\n", ofp);
681   fputs("CUGAGUGCGAGCU\n", ofp);
682 
683   *ret_alphatype = eslRNA;
684   *ret_nseq      = 5;
685   *ret_alen      = 73;
686 }
687 
688 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)689 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
690 {
691   ESL_ALPHABET        *abc          = NULL;
692   ESL_MSAFILE         *afp          = NULL;
693   ESL_MSA             *msa1         = NULL;
694   ESL_MSA             *msa2         = NULL;
695   char                 tmpfile1[32] = "esltmpXXXXXX";
696   char                 tmpfile2[32] = "esltmpXXXXXX";
697   FILE                *ofp          = NULL;
698   int                  status;
699 
700   /* A2M must be specified (no format guessing, unless we use .a2m suffix) but guessing the alphabet should work: this is a digital open */
701   if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_A2M, NULL, &afp)) != eslOK) esl_fatal("a2m good file test %d failed: digital open",           testnumber);
702   if (afp->format != eslMSAFILE_A2M)                                                           esl_fatal("a2m good file test %d failed: bad format",             testnumber);
703   if (abc->type   != expected_alphatype)                                                       esl_fatal("a2m good file test %d failed: alphabet autodetection", testnumber);
704 
705   /* This is a digital read, using <abc>. */
706   if ( (status = esl_msafile_a2m_Read(afp, &msa1))   != eslOK)     esl_fatal("a2m good file test %d failed: msa read, digital", testnumber);
707   if (msa1->nseq != expected_nseq || msa1->alen != expected_alen)  esl_fatal("a2m good file test %d failed: nseq/alen",         testnumber);
708   if (esl_msa_Validate(msa1, NULL) != eslOK)                       esl_fatal("a2m good file test %d failed: msa invalid",       testnumber);
709   esl_msafile_Close(afp);
710 
711   /* write it back out to a new tmpfile (digital write) */
712   if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("a2m good file test %d failed: tmpfile creation",   testnumber);
713   if ( (status = esl_msafile_a2m_Write(ofp, msa1))  != eslOK) esl_fatal("a2m good file test %d failed: msa write, digital", testnumber);
714   fclose(ofp);
715 
716   /* now open and read it as text mode, in known format. */
717   if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_A2M, NULL, &afp)) != eslOK) esl_fatal("a2m good file test %d failed: text mode open", testnumber);
718   if ( (status = esl_msafile_a2m_Read(afp, &msa2))                                   != eslOK) esl_fatal("a2m good file test %d failed: msa read, text", testnumber);
719   if (msa2->nseq != expected_nseq || msa2->alen != expected_alen)                              esl_fatal("a2m good file test %d failed: nseq/alen",      testnumber);
720   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                   esl_fatal("a2m good file test %d failed: msa invalid",       testnumber);
721   esl_msafile_Close(afp);
722 
723   /* write it back out to a new tmpfile (text write) */
724   if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("a2m good file test %d failed: tmpfile creation", testnumber);
725   if ( (status = esl_msafile_a2m_Write(ofp, msa2))  != eslOK) esl_fatal("a2m good file test %d failed: msa write, text",  testnumber);
726   fclose(ofp);
727   esl_msa_Destroy(msa2);
728 
729   /* open and read it in digital mode */
730   if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_A2M, NULL, &afp)) != eslOK) esl_fatal("a2m good file test %d failed: 2nd digital mode open", testnumber);
731   if ( (status = esl_msafile_a2m_Read(afp, &msa2))                                   != eslOK) esl_fatal("a2m good file test %d failed: 2nd digital msa read",  testnumber);
732   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                   esl_fatal("a2m good file test %d failed: msa invalid",           testnumber);
733   esl_msafile_Close(afp);
734 
735   /* this msa <msa2> should be identical to <msa1> */
736   if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("a2m good file test %d failed: msa compare", testnumber);
737 
738   remove(tmpfile1);
739   remove(tmpfile2);
740   esl_msa_Destroy(msa1);
741   esl_msa_Destroy(msa2);
742   esl_alphabet_Destroy(abc);
743 }
744 
745 
746 static void
write_test_msas(FILE * ofp1,FILE * ofp2)747 write_test_msas(FILE *ofp1, FILE *ofp2)
748 {
749   fprintf(ofp1, ">seq1 description line for seq1\n");
750   fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY\n");
751   fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY\n");
752   fprintf(ofp1, ">seq2 description line for seq2\n");
753   fprintf(ofp1, "ACDEFGHIKLMNPQRSTV--\n");
754   fprintf(ofp1, "ACDEFGHIKLMNPQRSTVWY\n");
755   fprintf(ofp1, "yy\n");
756   fprintf(ofp1, ">seq3\n");
757   fprintf(ofp1, "aaACDEFGHIKLMNPQRSTV\n");
758   fprintf(ofp1, "--ACDEFGHIKLMNPQRSTVWY\n");
759   fprintf(ofp1, ">seq4  \n");
760   fprintf(ofp1, "ACDEFGHIKLMNPQR\n");
761   fprintf(ofp1, "STVWYACDEFGHIKL\n");
762   fprintf(ofp1, "MNPQRSTVWY\n");
763 
764   fprintf(ofp2, "# STOCKHOLM 1.0\n");
765   fprintf(ofp2, "\n");
766   fprintf(ofp2, "#=GS seq1 DE description line for seq1\n");
767   fprintf(ofp2, "#=GS seq2 DE description line for seq2\n");
768   fprintf(ofp2, "\n");
769   fprintf(ofp2, "#=GC RF ..xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..\n");
770   fprintf(ofp2, "seq1    ..ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY..\n");
771   fprintf(ofp2, "seq2    ..ACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWYyy\n");
772   fprintf(ofp2, "seq3    aaACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWY..\n");
773   fprintf(ofp2, "seq4    ..ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY..\n");
774   fprintf(ofp2, "//\n");
775 }
776 
777 static void
read_test_msas_digital(char * a2mfile,char * stkfile)778 read_test_msas_digital(char *a2mfile, char *stkfile)
779 {
780   char msg[]         = "A2M msa digital read unit test failed";
781   ESL_ALPHABET *abc  = NULL;
782   ESL_MSAFILE  *afp1 = NULL;
783   ESL_MSAFILE  *afp2 = NULL;
784   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
785   FILE         *a2mfp, *stkfp;
786   char          a2mfile2[32] = "esltmpa2m2XXXXXX";
787   char          stkfile2[32] = "esltmpstk2XXXXXX";
788 
789   if ( esl_msafile_Open(&abc, a2mfile, NULL, eslMSAFILE_A2M,       NULL, &afp1) != eslOK)  esl_fatal(msg);
790   if ( !abc || abc->type != eslAMINO)                                                      esl_fatal(msg);
791   if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK)  esl_fatal(msg);
792   if ( esl_msafile_a2m_Read      (afp1, &msa1)                                  != eslOK)  esl_fatal(msg);
793   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                  != eslOK)  esl_fatal(msg);
794   if ( esl_msa_Compare(msa1, msa2)                                              != eslOK)  esl_fatal(msg);
795 
796   if ( esl_msafile_a2m_Read      (afp1, &msa3) != eslEOF) esl_fatal(msg);
797   if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
798 
799   esl_msafile_Close(afp2);
800   esl_msafile_Close(afp1);
801 
802   /* Now write stk to a2m file, and vice versa; then retest */
803   if ( esl_tmpfile_named(a2mfile2, &a2mfp)                       != eslOK) esl_fatal(msg);
804   if ( esl_tmpfile_named(stkfile2, &stkfp)                       != eslOK) esl_fatal(msg);
805   if ( esl_msafile_a2m_Write      (a2mfp, msa2)                  != eslOK) esl_fatal(msg);
806   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_PFAM) != eslOK) esl_fatal(msg);
807   fclose(a2mfp);
808   fclose(stkfp);
809   if ( esl_msafile_Open(&abc, a2mfile2, NULL, eslMSAFILE_A2M,       NULL, &afp1) != eslOK) esl_fatal(msg);
810   if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
811   if ( esl_msafile_a2m_Read      (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
812   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
813   if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);
814 
815   remove(a2mfile2);
816   remove(stkfile2);
817   esl_msafile_Close(afp2);
818   esl_msafile_Close(afp1);
819 
820   esl_msa_Destroy(msa1);
821   esl_msa_Destroy(msa2);
822   esl_msa_Destroy(msa3);
823   esl_msa_Destroy(msa4);
824   esl_alphabet_Destroy(abc);
825 }
826 
827 static void
read_test_msas_text(char * a2mfile,char * stkfile)828 read_test_msas_text(char *a2mfile, char *stkfile)
829 {
830   char msg[]         = "A2M msa text-mode read unit test failed";
831   ESL_MSAFILE  *afp1 = NULL;
832   ESL_MSAFILE  *afp2 = NULL;
833   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
834   FILE         *a2mfp, *stkfp;
835   char          a2mfile2[32] = "esltmpa2m2XXXXXX";
836   char          stkfile2[32] = "esltmpstk2XXXXXX";
837 
838   /*                     vvvv-- everything's the same as the digital utest except these NULLs  */
839   if ( esl_msafile_Open(NULL, a2mfile, NULL, eslMSAFILE_A2M,       NULL, &afp1) != eslOK)  esl_fatal(msg);
840   if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK)  esl_fatal(msg);
841   if ( esl_msafile_a2m_Read      (afp1, &msa1)                                  != eslOK)  esl_fatal(msg);
842   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                  != eslOK)  esl_fatal(msg);
843   if ( esl_msa_Compare(msa1, msa2)                                              != eslOK)  esl_fatal(msg);
844   if ( esl_msafile_a2m_Read      (afp1, &msa3)                                  != eslEOF) esl_fatal(msg);
845   if ( esl_msafile_stockholm_Read(afp2, &msa3)                                  != eslEOF) esl_fatal(msg);
846   esl_msafile_Close(afp2);
847   esl_msafile_Close(afp1);
848 
849   if ( esl_tmpfile_named(a2mfile2, &a2mfp)                        != eslOK) esl_fatal(msg);
850   if ( esl_tmpfile_named(stkfile2, &stkfp)                        != eslOK) esl_fatal(msg);
851   if ( esl_msafile_a2m_Write      (a2mfp, msa2)                   != eslOK) esl_fatal(msg);
852   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_PFAM)  != eslOK) esl_fatal(msg);
853   fclose(a2mfp);
854   fclose(stkfp);
855   if ( esl_msafile_Open(NULL, a2mfile2, NULL, eslMSAFILE_A2M,       NULL, &afp1) != eslOK) esl_fatal(msg);
856   if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
857   if ( esl_msafile_a2m_Read      (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
858   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
859   if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);
860 
861   remove(a2mfile2);
862   remove(stkfile2);
863   esl_msafile_Close(afp2);
864   esl_msafile_Close(afp1);
865 
866   esl_msa_Destroy(msa1);
867   esl_msa_Destroy(msa2);
868   esl_msa_Destroy(msa3);
869   esl_msa_Destroy(msa4);
870 }
871 
872 static void
utest_gibberish(ESL_RANDOMNESS * rng)873 utest_gibberish(ESL_RANDOMNESS *rng)
874 {
875   char          msg[]       = "esl_msafile_a2m gibberish test failed";
876   char          a2mfile[32] = "esltmpXXXXXX";
877   FILE         *fp          = NULL;
878   ESL_ALPHABET *abc         = esl_alphabet_Create(eslAMINO);
879   ESL_MSA      *msa         = NULL;
880   ESL_MSAFILE  *msafp       = NULL;
881   ESL_MSA      *msa2        = NULL;
882 
883   if ( esl_msa_Sample(rng, abc, 100, 100, &msa) != eslOK) esl_fatal(msg);
884   if ( esl_msa_Validate(msa, NULL)              != eslOK) esl_fatal(msg);
885   if ( esl_msa_FlushLeftInserts(msa)            != eslOK) esl_fatal(msg);  // Reading A2M back in will flush inserts left.
886   if ( esl_msa_MinimGaps(msa, NULL, NULL, TRUE) != eslOK) esl_fatal(msg);  // Reading A2M back in will minimize gaps.
887   a2m_no_O(msa);                                                           // A2M doesn't allow O residues.
888   ESL_DASSERT1(( !(msa->flags & eslMSA_HASWGTS) ));                        // A2M can't store weights.
889   ESL_DASSERT1(( msa->rf != NULL ));                                       // A2M always implies consensus annotation.
890 
891   if ( esl_tmpfile_named(a2mfile, &fp)          != eslOK) esl_fatal(msg);
892   if ( esl_msafile_a2m_Write(fp, msa)           != eslOK) esl_fatal(msg);
893   fclose(fp);
894 
895   if ( esl_msafile_Open(&abc, a2mfile, NULL, eslMSAFILE_A2M, NULL, &msafp) != eslOK) esl_fatal(msg);
896   if ( esl_msafile_a2m_Read(msafp, &msa2)                                  != eslOK) esl_fatal(msg);
897   if ( esl_msa_Validate(msa2, NULL)                                        != eslOK) esl_fatal(msg);
898   if ( esl_msa_Compare(msa, msa2)                                          != eslOK) esl_fatal(msg);
899   esl_msafile_Close(msafp);
900 
901   remove(a2mfile);
902   esl_msa_Destroy(msa);
903   esl_msa_Destroy(msa2);
904   esl_alphabet_Destroy(abc);
905 }
906 #endif /*eslMSAFILE_A2M_TESTDRIVE*/
907 /*---------------------- end, unit tests ------------------------*/
908 
909 
910 
911 /*****************************************************************
912  * 4. Test driver.
913  *****************************************************************/
914 #ifdef eslMSAFILE_A2M_TESTDRIVE
915 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_a2m_utest -DeslMSAFILE_A2M_TESTDRIVE esl_msafile_a2m.c -leasel -lm
916  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_a2m_utest -DeslMSAFILE_A2M_TESTDRIVE esl_msafile_a2m.c -leasel -lm
917  * run:     ./esl_msafile_a2m_utest
918  */
919 #include "esl_config.h"
920 
921 #include <stdio.h>
922 
923 #include "easel.h"
924 #include "esl_getopts.h"
925 #include "esl_random.h"
926 #include "esl_msafile.h"
927 #include "esl_msafile_a2m.h"
928 
929 static ESL_OPTIONS options[] = {
930    /* name  type         default  env  range togs  reqs  incomp  help                        docgrp */
931   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
932   {"-s",  eslARG_INT,       "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>",   0},
933   { 0,0,0,0,0,0,0,0,0,0},
934 };
935 static char usage[]  = "[-options]";
936 static char banner[] = "test driver for A2M MSA format module";
937 
938 int
main(int argc,char ** argv)939 main(int argc, char **argv)
940 {
941   char            msg[]        = "a2m MSA i/o module test driver failed";
942   ESL_GETOPTS    *go           = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
943   ESL_RANDOMNESS *rng          = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
944   char            a2mfile[32]  = "esltmpa2mXXXXXX";
945   char            stkfile[32]  = "esltmpstkXXXXXX";
946   FILE           *a2mfp, *stkfp;
947   int             testnumber;
948   int             ngoodtests = 2;
949   char            tmpfile[32];
950   FILE           *ofp;
951   int             expected_alphatype;
952   int             expected_nseq;
953   int             expected_alen;
954 
955   fprintf(stderr, "## %s\n", argv[0]);
956   fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
957 
958   if ( esl_tmpfile_named(a2mfile, &a2mfp) != eslOK) esl_fatal(msg);
959   if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
960   write_test_msas(a2mfp, stkfp);
961   fclose(a2mfp);
962   fclose(stkfp);
963 
964   read_test_msas_digital(a2mfile, stkfile);
965   read_test_msas_text   (a2mfile, stkfile);
966 
967   /* Various "good" files that should be parsed correctly */
968   for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
969     {
970       strcpy(tmpfile, "esltmpXXXXXX");
971       if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
972       switch (testnumber) {
973       case  1:  utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
974       case  2:  utest_write_good2 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
975       }
976       fclose(ofp);
977       utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
978       remove(tmpfile);
979     }
980 
981   utest_gibberish(rng);
982 
983   fprintf(stderr, "#  status = ok\n");
984 
985   remove(a2mfile);
986   remove(stkfile);
987   esl_randomness_Destroy(rng);
988   esl_getopts_Destroy(go);
989   return 0;
990 }
991 #endif /*eslMSAFILE_A2M_TESTDRIVE*/
992 /*--------------------- end, test driver ------------------------*/
993 
994 
995 
996 /*****************************************************************
997  * 5. Examples.
998  *****************************************************************/
999 
1000 #ifdef eslMSAFILE_A2M_EXAMPLE
1001 /* A full-featured example of reading/writing an MSA in A2M format.
1002    gcc -g -Wall -o esl_msafile_a2m_example -I. -L. -DeslMSAFILE_A2M_EXAMPLE esl_msafile_a2m.c -leasel -lm
1003    ./esl_msafile_a2m_example <msafile>
1004  */
1005 /*::cexcerpt::msafile_a2m_example::begin::*/
1006 #include <stdio.h>
1007 
1008 #include "easel.h"
1009 #include "esl_alphabet.h"
1010 #include "esl_getopts.h"
1011 #include "esl_msa.h"
1012 #include "esl_msafile.h"
1013 #include "esl_msafile_a2m.h"
1014 
1015 static ESL_OPTIONS options[] = {
1016   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
1017   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",            0 },
1018   { "-1",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "override autodetection; force A2M format",        0 },
1019   { "-q",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "quieter: don't write msa back, just summary",     0 },
1020   { "-t",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",              0 },
1021   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is DNA",                    0 },
1022   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is RNA",                    0 },
1023   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is protein",                0 },
1024   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1025 };
1026 static char usage[]  = "[-options] <msafile>";
1027 static char banner[] = "example of guessing, reading, writing A2M format";
1028 
1029 int
main(int argc,char ** argv)1030 main(int argc, char **argv)
1031 {
1032   ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1033   char               *filename    = esl_opt_GetArg(go, 1);
1034   int                 infmt       = eslMSAFILE_UNKNOWN;
1035   ESL_ALPHABET       *abc         = NULL;
1036   ESL_MSAFILE        *afp         = NULL;
1037   ESL_MSA            *msa         = NULL;
1038   int                 status;
1039 
1040   if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_A2M;  /* override format autodetection */
1041 
1042   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
1043   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
1044   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1045 
1046   /* Text mode: pass NULL for alphabet.
1047    * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
1048    */
1049   if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
1050   else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
1051   if (status != eslOK) esl_msafile_OpenFailure(afp, status);
1052 
1053   if ((status = esl_msafile_a2m_Read(afp, &msa)) != eslOK)
1054     esl_msafile_ReadFailure(afp, status);
1055 
1056   printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
1057   printf("# of seqs:      %d\n", msa->nseq);
1058   printf("# of cols:      %d\n", (int) msa->alen);
1059   printf("\n");
1060 
1061   if (! esl_opt_GetBoolean(go, "-q"))
1062     esl_msafile_a2m_Write(stdout, msa);
1063 
1064   esl_msa_Destroy(msa);
1065   esl_msafile_Close(afp);
1066   if (abc) esl_alphabet_Destroy(abc);
1067   esl_getopts_Destroy(go);
1068   exit(0);
1069 }
1070 /*::cexcerpt::msafile_a2m_example::end::*/
1071 #endif /*eslMSAFILE_A2M_EXAMPLE*/
1072 
1073 
1074 #ifdef eslMSAFILE_A2M_EXAMPLE2
1075 /* A minimal example. Read A2M MSA, in text mode
1076    gcc -g -Wall -o esl_msafile_a2m_example2 -I. -L. -DeslMSAFILE_A2M_EXAMPLE2 esl_msafile_a2m.c -leasel -lm
1077    ./esl_msafile_a2m_example <msafile>
1078  */
1079 /*::cexcerpt::msafile_a2m_example2::begin::*/
1080 #include <stdio.h>
1081 
1082 #include "easel.h"
1083 #include "esl_msa.h"
1084 #include "esl_msafile.h"
1085 #include "esl_msafile_a2m.h"
1086 
1087 int
main(int argc,char ** argv)1088 main(int argc, char **argv)
1089 {
1090   char         *filename = argv[1];
1091   int           fmt      = eslMSAFILE_A2M;
1092   ESL_MSAFILE  *afp      = NULL;
1093   ESL_MSA      *msa      = NULL;
1094   int           status;
1095 
1096   if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
1097   if ( (status = esl_msafile_a2m_Read(afp, &msa))                         != eslOK) esl_msafile_ReadFailure(afp, status);
1098 
1099   printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
1100 
1101   esl_msafile_a2m_Write(stdout, msa);
1102   esl_msa_Destroy(msa);
1103   esl_msafile_Close(afp);
1104   exit(0);
1105 }
1106 /*::cexcerpt::msafile_a2m_example2::end::*/
1107 #endif /*eslMSAFILE_A2M_EXAMPLE2*/
1108 /*--------------------- end of examples -------------------------*/
1109 
1110