1 /* I/O of multiple sequence alignments in PSI-BLAST format
2  *
3  * Contents:
4  *   1. API for reading/writing PSI-BLAST format
5  *   2. Unit tests.
6  *   3. Test driver.
7  *   4. Examples.
8  */
9 #include "esl_config.h"
10 
11 #include <stdio.h>
12 #include <string.h>
13 #include <ctype.h>
14 
15 #include "easel.h"
16 #include "esl_alphabet.h"
17 #include "esl_mem.h"
18 #include "esl_msa.h"
19 #include "esl_msafile.h"
20 
21 #include "esl_msafile_psiblast.h"
22 
23 /*****************************************************************
24  *# 1. API for reading/writing PSI-BLAST format
25  *****************************************************************/
26 
27 /* Function:  esl_msafile_psiblast_SetInmap()
28  * Synopsis:  Set input map specific for PSI-BLAST input.
29  *
30  * Purpose:   Set the <afp->inmap> for PSI-BLAST format.
31  *
32  *            PSI-BLAST only allows - for a gap. It also disallows O residues.
33  *
34  *            Text mode accepts any <isalpha()> character plus '-' but not 'O' or 'o'.
35  *            Digital mode enforces the usual Easel alphabets, but disallows "._*~".
36  */
37 int
esl_msafile_psiblast_SetInmap(ESL_MSAFILE * afp)38 esl_msafile_psiblast_SetInmap(ESL_MSAFILE *afp)
39 {
40    int sym;
41 
42   if (afp->abc)
43     {
44       for (sym = 0; sym < 128; sym++)
45 	afp->inmap[sym] = afp->abc->inmap[sym];
46       afp->inmap[0]   = esl_abc_XGetUnknown(afp->abc);
47       afp->inmap['.'] = eslDSQ_ILLEGAL;
48       afp->inmap['_'] = eslDSQ_ILLEGAL;
49       afp->inmap['*'] = eslDSQ_ILLEGAL;
50       afp->inmap['~'] = eslDSQ_ILLEGAL;
51     }
52 
53   if (! afp->abc)
54     {
55       for (sym = 1; sym < 128; sym++)
56 	afp->inmap[sym] = (isalpha(sym) ? sym : eslDSQ_ILLEGAL);
57       afp->inmap[0]   = '?';
58       afp->inmap['-'] = '-';
59     }
60 
61   afp->inmap['O'] = eslDSQ_ILLEGAL;
62   afp->inmap['o'] = eslDSQ_ILLEGAL;
63   return eslOK;
64 }
65 
66 /* Function:  esl_msafile_psiblast_GuessAlphabet()
67  * Synopsis:  Guess the alphabet of an open PSI-BLAST MSA file.
68  *
69  * Purpose:   Guess the alpbabet of the sequences in open
70  *            PSI-BLAST format MSA file <afp>.
71  *
72  *            On a normal return, <*ret_type> is set to <eslDNA>,
73  *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
74  *            original position.
75  *
76  * Args:      afp      - open PSI-BLAST format MSA file
77  *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
78  *
79  * Returns:   <eslOK> on success.
80  *            <eslENOALPHABET> if alphabet type can't be determined.
81  *            In either case, <afp> is rewound to the position it
82  *            started at.
83  */
84 int
esl_msafile_psiblast_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)85 esl_msafile_psiblast_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
86 {
87   int       alphatype     = eslUNKNOWN;
88   esl_pos_t anchor        = -1;
89   int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
90   int       nsteps        = 3;
91   int       step          = 0;
92   int       nres          = 0;
93   int       x;
94   int64_t   ct[26];
95   char     *p, *tok;
96   esl_pos_t n,  toklen, pos;
97   int       status;
98 
99   for (x = 0; x < 26; x++) ct[x] = 0;
100 
101   anchor = esl_buffer_GetOffset(afp->bf);
102   if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
103 
104   while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
105     {
106       if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
107       /* p now points to the rest of the sequence line, after a name */
108 
109       /* count characters into ct[] array */
110       for (pos = 0; pos < n; pos++)
111 	if (isalpha(p[pos])) {
112 	  x = toupper(p[pos]) - 'A';
113 	  ct[x]++;
114 	  nres++;
115 	}
116 
117       /* try to stop early, checking after 500, 5000, and 50000 residues: */
118       if (step < nsteps && nres > threshold[step]) {
119 	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
120 	step++;
121       }
122     }
123   if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
124   status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
125 
126  DONE:
127   esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
128   esl_buffer_RaiseAnchor(afp->bf, anchor);
129   *ret_type = alphatype;
130   return status;
131 
132  ERROR:
133   if (anchor != -1) {
134     esl_buffer_SetOffset(afp->bf, anchor);
135     esl_buffer_RaiseAnchor(afp->bf, anchor);
136   }
137   *ret_type = eslUNKNOWN;
138   return status;
139 }
140 
141 /* Function:  esl_msafile_psiblast_Read()
142  * Synopsis:  Read an alignment in PSI-BLAST's input format.
143  *
144  * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing for
145  *            PSI-BLAST input format, starting from the current point.
146  *            Create a new multiple alignment, and return a ptr to
147  *            that alignment via <*ret_msa>. Caller is responsible for
148  *            free'ing this <ESL_MSA>.
149  *
150  *            The <msa> has a reference line (<msa->rf[]>) that
151  *            corresponds to the uppercase/lowercase columns in the
152  *            alignment: consensus (uppercase) columns are marked 'x',
153  *            and insert (lowercase) columns are marked '.' in this RF
154  *            line.
155  *
156  * Args:      afp     - open <ESL_MSAFILE>
157  *            ret_msa - RETURN: newly parsed <ESL_MSA>
158  *
159  * Returns:   <eslOK> on success. <*ret_msa> contains the newly
160  *            allocated MSA. <afp> is at EOF.
161  *
162  *            <eslEOF> if no (more) alignment data are found in
163  *            <afp>, and <afp> is returned at EOF.
164  *
165  *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
166  *            <NULL>. <afp> contains information sufficient for
167  *            constructing useful diagnostic output:
168  *            | <afp->errmsg>       | user-directed error message     |
169  *            | <afp->linenumber>   | line # where error was detected |
170  *            | <afp->line>         | offending line (not NUL-term)   |
171  *            | <afp->n>            | length of offending line        |
172  *            | <afp->bf->filename> | name of the file                |
173  *            and <afp> is poised at the start of the following line,
174  *            so (in principle) the caller could try to resume
175  *            parsing.
176  *
177  * Throws:    <eslEMEM> on allocation error.
178  *            <eslESYS> if a system call fails, such as fread().
179  *            <eslEINCONCEIVABLE> - "impossible" corruption
180  *            On these, <*ret_msa> is returned <NULL>, and the state of
181  *            <afp> is undefined.
182  */
183 int
esl_msafile_psiblast_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)184 esl_msafile_psiblast_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
185 {
186   ESL_MSA  *msa      = NULL;
187   int       idx      = 0;	/* counter over sequences in a block */
188   int       nblocks  = 0;	/* counter over blocks */
189   int64_t   alen     = 0;
190   int       nseq     = 0;
191   int64_t   cur_alen;
192   esl_pos_t pos;		/* position on a line */
193   esl_pos_t name_start,      name_len;
194   esl_pos_t seq_start,       seq_len;
195   esl_pos_t block_seq_start, block_seq_len;
196   int       status;
197 
198   ESL_DASSERT1( (afp->format == eslMSAFILE_PSIBLAST) );
199 
200   afp->errmsg[0] = '\0';
201 
202   /* allocate a growable MSA. We set msa->{nseq,alen} only when we're done. */
203   if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
204   if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
205 
206   /* skip leading blank lines in file */
207   while ( (status = esl_msafile_GetLine(afp, NULL, NULL)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
208   if (status != eslOK)  goto ERROR; /* includes normal EOF */
209 
210   /* Read the file a line at a time; if a parsing error occurs, detect immediately, with afp->linenumber set correctly */
211    do { /* while in the file... */
212     idx = 0;
213     do { /* while in a block... */
214       for (pos = 0;     pos < afp->n; pos++) if (! isspace(afp->line[pos])) break;
215       name_start = pos;
216       for (pos = pos+1; pos < afp->n; pos++) if (  isspace(afp->line[pos])) break;
217       name_len   = pos - name_start;
218       for (pos = pos+1; pos < afp->n; pos++) if (! isspace(afp->line[pos])) break;
219       seq_start  = pos;
220       if (pos >= afp->n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
221       for (pos = afp->n-1; pos > 0; pos--)   if (! isspace(afp->line[pos])) break;
222       seq_len    = pos - seq_start + 1;
223 
224       if (idx == 0) {
225 	block_seq_start = seq_start;
226 	block_seq_len   = seq_len;
227       } else {
228 	if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
229 	if (seq_len   != block_seq_len)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
230       }
231 
232       /* Process the consensus #=RF line. */
233       if (idx == 0) {
234 	ESL_REALLOC(msa->rf, sizeof(char) * (alen + seq_len + 1));
235 	for (pos = 0; pos < seq_len; pos++) msa->rf[alen+pos] = '-'; /* anything neutral other than . or x will do. */
236 	msa->rf[alen+pos] = '\0';
237       }
238       for (pos = 0; pos < seq_len; pos++)
239 	{
240 	  if (afp->line[seq_start+pos] == '-') continue;
241 	  if (isupper(afp->line[seq_start+pos])) {
242 	    if (msa->rf[alen+pos] == '.') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected upper case residue (#%d on line)", (int) pos+1);
243 	    msa->rf[alen+pos] = 'x';
244 	  }
245 	  if (islower(afp->line[seq_start+pos])) {
246 	    if (msa->rf[alen+pos] == 'x') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected lower case residue (#%d on line)", (int) pos+1);
247 	    msa->rf[alen+pos] = '.';
248 	  }
249 	}
250 
251       /* Store the sequence name. */
252       if (nblocks == 0)	{
253 	/* make sure we have room for another sequence */
254 	if (idx >= msa->sqalloc &&  (status = esl_msa_Expand(msa))                   != eslOK) goto ERROR;
255 	if ( (status = esl_msa_SetSeqName(msa, idx, afp->line+name_start, name_len)) != eslOK) goto ERROR;
256       } else {
257 	if (! esl_memstrcmp(afp->line+name_start, name_len, msa->sqname[idx]))
258 	  ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, afp->line+name_start);
259       }
260 
261       /* Append the sequence. */
262       cur_alen = alen;
263       if (msa->abc)    { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]),   &(cur_alen), afp->line+seq_start, seq_len); }
264       if (! msa->abc)  { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), afp->line+seq_start, seq_len); }
265       if      (status == eslEINVAL)    ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
266       else if (status != eslOK)        goto ERROR;
267       if (cur_alen - alen != seq_len)  ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");
268 
269       /* get next line. if it's blank, or if we're EOF, we're done with the block */
270       idx++;
271       status = esl_msafile_GetLine(afp, NULL, NULL);
272     } while (status == eslOK && esl_memspn(afp->line, afp->n, " \t") < afp->n); /* blank line ends a block. */
273     if (status != eslOK && status != eslEOF) goto ERROR;
274     /* End of one block */
275 
276     if     (nblocks == 0) nseq = idx;
277     else if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
278     alen += block_seq_len;
279     nblocks++;
280 
281     /* skip blank lines to start of next block, if any */
282     while ( (status = esl_msafile_GetLine(afp, NULL, NULL)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
283    } while (status == eslOK);
284    if (status != eslEOF) goto ERROR;
285 
286    msa->nseq = nseq;
287    msa->alen = alen;
288    if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
289    *ret_msa  = msa;
290    return eslOK;
291 
292  ERROR:
293   if (msa) esl_msa_Destroy(msa);
294   *ret_msa = NULL;
295   return status;
296 }
297 
298 
299 /* Function:  esl_msafile_psiblast_Write()
300  * Synopsis:  Write an MSA to a stream in PSI-BLAST format
301  *
302  * Purpose:   Write alignment <msa> in NCBI PSI-BLAST format to
303  *            stream <fp>.
304  *
305  *            The <msa> should have a valid reference line <msa->rf>,
306  *            with alphanumeric characters marking consensus (match)
307  *            columns, and non-alphanumeric characters marking
308  *            nonconsensus (insert) columns. If it does not have RF
309  *            annotation, then the first sequence in the <msa>
310  *            defines the "consensus".
311  *
312  *            PSI-BLAST format allows only one symbol ('-') for gaps,
313  *            and cannot represent missing data symbols (Easel's
314  *            '~'). Any missing data symbols are converted to gaps.
315  *
316  * Args:      fp  - open output stream
317  *            msa - MSA to write
318  *
319  * Returns:   <eslOK> on success.
320  *
321  * Throws:    <eslEMEM> on allocation failure.
322  *            <eslEWRITE> on any system write failure, such as filled disk.
323  */
324 int
esl_msafile_psiblast_Write(FILE * fp,const ESL_MSA * msa)325 esl_msafile_psiblast_Write(FILE *fp, const ESL_MSA *msa)
326 {
327   char    *buf = NULL;
328   int      cpl = 60;
329   int      acpl;
330   int      i;
331   int      sym;
332   int64_t  pos, bpos;
333   int      maxnamewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq);
334   int      is_consensus;
335   int      is_residue;
336   int      status;
337 
338   ESL_ALLOC(buf, sizeof(char) * (cpl+1));
339 
340   for (pos = 0; pos < msa->alen; pos += cpl)
341     {
342       for (i = 0; i < msa->nseq; i++)
343 	{
344 	  acpl =  (msa->alen - pos > cpl)? cpl : msa->alen - pos;
345 
346 	  if (msa->abc)
347 	    {
348 	      for (bpos = 0; bpos < acpl; bpos++)
349 		{
350 		  sym          = msa->abc->sym[msa->ax[i][pos + bpos + 1]];
351 		  is_residue   = esl_abc_XIsResidue(msa->abc, msa->ax[i][pos+bpos+1]);
352 		  if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE);
353 		  else         is_consensus = (esl_abc_XIsResidue(msa->abc, msa->ax[0][pos+bpos+1]) ? TRUE : FALSE);
354 
355 		  if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); }
356 		  else              { buf[bpos] = (is_residue ? tolower(sym) : '-'); }
357 		}
358 	    }
359 
360 	  if (! msa->abc)
361 	    {
362 	      for (bpos = 0; bpos < acpl; bpos++)
363 		{
364 		  sym          = msa->aseq[i][pos + bpos];
365 		  is_residue   = isalnum(sym);
366 		  if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE);
367 		  else         is_consensus = (isalnum(msa->aseq[0][pos+bpos]) ? TRUE : FALSE);
368 
369 		  if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); }
370 		  else              { buf[bpos] = (is_residue ? tolower(sym) : '-'); }
371 		}
372 	    }
373 	  buf[acpl] = '\0';
374 	  if (fprintf(fp, "%-*s  %s\n", maxnamewidth, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed");
375 	}  /* end loop over sequences */
376 
377       if (pos + cpl < msa->alen)
378 	{ if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed"); }
379     }
380   free(buf);
381   return eslOK;
382 
383  ERROR:
384   if (buf) free(buf);
385   return status;
386 }
387 /*----------- end, API for i/o of psi-blast format --------------*/
388 
389 
390 
391 /*****************************************************************
392  * 2. Unit tests.
393  *****************************************************************/
394 #ifdef eslMSAFILE_PSIBLAST_TESTDRIVE
395 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)396 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
397 {
398   fputs("MYG_PHYCA   --------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\n", ofp);
399   fputs("GLB5_PETMA  pivdtgsvApLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
400   fputs("HBB_HUMAN   --------VhLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\n", ofp);
401   fputs("HBA_HUMAN   --------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\n", ofp);
402   fputs("\n", ofp);
403   fputs("MYG_PHYCA   EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEAELKPLAQSHATKHKIPIKYLEFIS\n", ofp);
404   fputs("GLB5_PETMA  ADQLKKSADVRWHAERIINAVNDAVASMDDtekMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
405   fputs("HBB_HUMAN   PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\n", ofp);
406   fputs("HBA_HUMAN   -DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\n", ofp);
407   fputs("\n", ofp);
408   fputs("MYG_PHYCA   EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
409   fputs("GLB5_PETMA  AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
410   fputs("HBB_HUMAN   NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
411   fputs("HBA_HUMAN   HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
412 
413   *ret_alphatype = eslAMINO;
414   *ret_nseq      = 4;
415   *ret_alen      = 165;
416 }
417 
418 static void
utest_write_good2(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)419 utest_write_good2(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
420 {
421   fputs("tRNA2  UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGG-AGACCGGGGUUCGACU\n", ofp);
422   fputs("tRNA3  UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCcAGAUCGGGGUUCAAUU\n", ofp);
423   fputs("tRNA5  GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGaGGUCAUCGGUUCGAUU\n", ofp);
424   fputs("tRNA1  GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGaGGUCCUGUGUUCGAUC\n", ofp);
425   fputs("tRNA4  GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUuGGUCCUUAGUUCGAUC\n", ofp);
426   fputs("\n", ofp);
427   fputs("tRNA2  CCCCGUAUCGGAG\n", ofp);
428   fputs("tRNA3  CCCCGUCGCGGAG\n", ofp);
429   fputs("tRNA5  CCGGUUGCGUCCA\n", ofp);
430   fputs("tRNA1  CACAGAAUUCGCA\n", ofp);
431   fputs("tRNA4  CUGAGUGCGAGCU\n", ofp);
432   *ret_alphatype = eslRNA;
433   *ret_nseq      = 5;
434   *ret_alen      = 73;
435 }
436 
437 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)438 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
439 {
440   ESL_ALPHABET        *abc          = NULL;
441   ESL_MSAFILE         *afp          = NULL;
442   ESL_MSA             *msa1         = NULL;
443   ESL_MSA             *msa2         = NULL;
444   char                 tmpfile1[32] = "esltmpXXXXXX";
445   char                 tmpfile2[32] = "esltmpXXXXXX";
446   FILE                *ofp          = NULL;
447   int                  status;
448 
449   /* guessing both the format and the alphabet should work: this is a digital open */
450   /* PSIBLAST format is autodetected as SELEX, which is fine - selex parser is more general */
451   if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: digital open",           testnumber);
452   if (afp->format != eslMSAFILE_SELEX)                                                             esl_fatal("psiblast good file test %d failed: format autodetection",   testnumber);
453   if (abc->type   != expected_alphatype)                                                           esl_fatal("psiblast good file test %d failed: alphabet autodetection", testnumber);
454   afp->format = eslMSAFILE_PSIBLAST;
455 
456   /* This is a digital read, using <abc>. */
457   if ( (status = esl_msafile_psiblast_Read(afp, &msa1))   != eslOK) esl_fatal("psiblast good file test %d failed: msa read, digital", testnumber);
458   if (msa1->nseq != expected_nseq || msa1->alen != expected_alen)   esl_fatal("psiblast good file test %d failed: nseq/alen",         testnumber);
459   if (esl_msa_Validate(msa1, NULL) != eslOK)                        esl_fatal("psiblast good file test %d failed: msa1 invalid",      testnumber);
460   esl_msafile_Close(afp);
461 
462   /* write it back out to a new tmpfile (digital write) */
463   if ( (status = esl_tmpfile_named(tmpfile1, &ofp))      != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation",   testnumber);
464   if ( (status = esl_msafile_psiblast_Write(ofp, msa1))  != eslOK) esl_fatal("psiblast good file test %d failed: msa write, digital", testnumber);
465   fclose(ofp);
466 
467   /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
468   if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: text mode open", testnumber);
469   if ( (status = esl_msafile_psiblast_Read(afp, &msa2))                                   != eslOK) esl_fatal("psiblast good file test %d failed: msa read, text", testnumber);
470   if (msa2->nseq != expected_nseq || msa2->alen != expected_alen)                                   esl_fatal("psiblast good file test %d failed: nseq/alen",      testnumber);
471   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                        esl_fatal("psiblast good file test %d failed: msa2 invalid",   testnumber);
472   esl_msafile_Close(afp);
473 
474   /* write it back out to a new tmpfile (text write) */
475   if ( (status = esl_tmpfile_named(tmpfile2, &ofp))      != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation", testnumber);
476   if ( (status = esl_msafile_psiblast_Write(ofp, msa2))  != eslOK) esl_fatal("psiblast good file test %d failed: msa write, text",  testnumber);
477   fclose(ofp);
478   esl_msa_Destroy(msa2);
479 
480   /* open and read it in digital mode */
481   if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital mode open", testnumber);
482   if ( (status = esl_msafile_psiblast_Read(afp, &msa2))                                   != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital msa read",  testnumber);
483   if (esl_msa_Validate(msa2, NULL) != eslOK)                                                        esl_fatal("psiblast good file test %d failed: msa2 invalid",          testnumber);
484   esl_msafile_Close(afp);
485 
486   /* this msa <msa2> should be identical to <msa1> */
487   if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("psiblast good file test %d failed: msa compare", testnumber);
488 
489   remove(tmpfile1);
490   remove(tmpfile2);
491   esl_msa_Destroy(msa1);
492   esl_msa_Destroy(msa2);
493   esl_alphabet_Destroy(abc);
494 }
495 
496 static void
write_test_msas(FILE * ofp1,FILE * ofp2)497 write_test_msas(FILE *ofp1, FILE *ofp2)
498 {
499   fprintf(ofp1, "\n");
500   fprintf(ofp1, "seq1  --ACDEFGHIKLMNPQRSTVWY\n");
501   fprintf(ofp1, "seq2  --ACDEFGHIKLMNPQRSTV-- \n");
502   fprintf(ofp1, "seq3  aaACDEFGHIKLMNPQRSTV--  \n");
503   fprintf(ofp1, "seq4  --ACDEFGHIKLMNPQRSTVWY  \n");
504   fprintf(ofp1, "\n");
505   fprintf(ofp1, "seq1  ACDEFGHIKLMNPQRSTVWY--\n");
506   fprintf(ofp1, "seq2  ACDEFGHIKLMNPQRSTVWYyy\n");
507   fprintf(ofp1, "seq3  ACDEFGHIKLMNPQRSTVWY--\n");
508   fprintf(ofp1, "seq4  ACDEFGHIKLMNPQRSTVWY--\n");
509   fprintf(ofp1, "\n");
510 
511   fprintf(ofp2, "# STOCKHOLM 1.0\n");
512   fprintf(ofp2, "\n");
513   fprintf(ofp2, "#=GC RF ..xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..\n");
514   fprintf(ofp2, "seq1    --ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY--\n");
515   fprintf(ofp2, "seq2    --ACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWYyy\n");
516   fprintf(ofp2, "seq3    aaACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWY--\n");
517   fprintf(ofp2, "seq4    --ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY--\n");
518   fprintf(ofp2, "//\n");
519 }
520 
521 static void
read_test_msas_digital(char * pbfile,char * stkfile)522 read_test_msas_digital(char *pbfile, char *stkfile)
523 {
524   char msg[]         = "PSIBLAST msa digital read unit test failed";
525   ESL_ALPHABET *abc  = NULL;
526   ESL_MSAFILE  *afp1 = NULL;
527   ESL_MSAFILE  *afp2 = NULL;
528   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
529   FILE         *pbfp, *stkfp;
530   char          pbfile2[32]  = "esltmppb2XXXXXX";
531   char          stkfile2[32] = "esltmpstk2XXXXXX";
532 
533   if ( esl_msafile_Open(&abc, pbfile,  NULL, eslMSAFILE_PSIBLAST,  NULL, &afp1) != eslOK)  esl_fatal(msg);
534   if ( !abc || abc->type != eslAMINO)                                                       esl_fatal(msg);
535   if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK)  esl_fatal(msg);
536   if ( esl_msafile_psiblast_Read (afp1, &msa1)                                  != eslOK)  esl_fatal(msg);
537   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                  != eslOK)  esl_fatal(msg);
538   if ( esl_msa_Compare(msa1, msa2)                                              != eslOK)  esl_fatal(msg);
539 
540   if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
541   if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
542 
543   esl_msafile_Close(afp2);
544   esl_msafile_Close(afp1);
545 
546   /* Now write stk to psiblast file, and vice versa; then retest */
547   if ( esl_tmpfile_named(pbfile2,  &pbfp)                                   != eslOK) esl_fatal(msg);
548   if ( esl_tmpfile_named(stkfile2, &stkfp)                                  != eslOK) esl_fatal(msg);
549   if ( esl_msafile_psiblast_Write  (pbfp, msa2)                             != eslOK) esl_fatal(msg);
550   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)       != eslOK) esl_fatal(msg);
551   fclose(pbfp);
552   fclose(stkfp);
553   if ( esl_msafile_Open(&abc, pbfile2,  NULL, eslMSAFILE_PSIBLAST,  NULL, &afp1) != eslOK) esl_fatal(msg);
554   if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
555   if ( esl_msafile_psiblast_Read (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
556   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
557   if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);
558 
559   remove(pbfile2);
560   remove(stkfile2);
561   esl_msafile_Close(afp2);
562   esl_msafile_Close(afp1);
563 
564   esl_msa_Destroy(msa1);
565   esl_msa_Destroy(msa2);
566   esl_msa_Destroy(msa3);
567   esl_msa_Destroy(msa4);
568   esl_alphabet_Destroy(abc);
569 }
570 
571 static void
read_test_msas_text(char * pbfile,char * stkfile)572 read_test_msas_text(char *pbfile, char *stkfile)
573 {
574   char msg[]         = "PSIBLAST msa text-mode read unit test failed";
575   ESL_MSAFILE  *afp1 = NULL;
576   ESL_MSAFILE  *afp2 = NULL;
577   ESL_MSA      *msa1, *msa2, *msa3, *msa4;
578   FILE         *pbfp, *stkfp;
579   char          pbfile2[32]  = "esltmppb2XXXXXX";
580   char          stkfile2[32] = "esltmpstk2XXXXXX";
581 
582   /*                    vvvv-- everything's the same as the digital utest except these NULLs  */
583   if ( esl_msafile_Open(NULL, pbfile,  NULL, eslMSAFILE_PSIBLAST,  NULL, &afp1)   != eslOK)  esl_fatal(msg);
584   if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2)   != eslOK)  esl_fatal(msg);
585   if ( esl_msafile_psiblast_Read (afp1, &msa1)                                    != eslOK)  esl_fatal(msg);
586   if ( esl_msafile_stockholm_Read(afp2, &msa2)                                    != eslOK)  esl_fatal(msg);
587   if ( esl_msa_Compare(msa1, msa2)                                                != eslOK)  esl_fatal(msg);
588   if ( esl_msafile_psiblast_Read (afp1, &msa3)                                    != eslEOF) esl_fatal(msg);
589   if ( esl_msafile_stockholm_Read(afp2, &msa3)                                    != eslEOF) esl_fatal(msg);
590   esl_msafile_Close(afp2);
591   esl_msafile_Close(afp1);
592 
593   if ( esl_tmpfile_named(pbfile2, &pbfp)                                     != eslOK) esl_fatal(msg);
594   if ( esl_tmpfile_named(stkfile2, &stkfp)                                   != eslOK) esl_fatal(msg);
595   if ( esl_msafile_psiblast_Write (pbfp,  msa2)                              != eslOK) esl_fatal(msg);
596   if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)        != eslOK) esl_fatal(msg);
597   fclose(pbfp);
598   fclose(stkfp);
599   if ( esl_msafile_Open(NULL, pbfile2,  NULL, eslMSAFILE_PSIBLAST,  NULL, &afp1)  != eslOK) esl_fatal(msg);
600   if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2)  != eslOK) esl_fatal(msg);
601   if ( esl_msafile_psiblast_Read (afp1, &msa3)                                    != eslOK) esl_fatal(msg);
602   if ( esl_msafile_stockholm_Read(afp2, &msa4)                                    != eslOK) esl_fatal(msg);
603   if ( esl_msa_Compare(msa3, msa4)                                                != eslOK) esl_fatal(msg);
604 
605   remove(pbfile2);
606   remove(stkfile2);
607   esl_msafile_Close(afp2);
608   esl_msafile_Close(afp1);
609 
610   esl_msa_Destroy(msa1);
611   esl_msa_Destroy(msa2);
612   esl_msa_Destroy(msa3);
613   esl_msa_Destroy(msa4);
614 }
615 #endif /*eslMSAFILE_PSIBLAST_TESTDRIVE*/
616 /*---------------------- end, unit tests ------------------------*/
617 
618 
619 /*****************************************************************
620  * 3. Test driver.
621  *****************************************************************/
622 #ifdef eslMSAFILE_PSIBLAST_TESTDRIVE
623 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_psiblast_utest -DeslMSAFILE_PSIBLAST_TESTDRIVE esl_msafile_psiblast.c -leasel -lm
624  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_psiblast_utest -DeslMSAFILE_PSIBLAST_TESTDRIVE esl_msafile_psiblast.c -leasel -lm
625  * run:     ./esl_msafile_psiblast_utest
626  */
627 #include "esl_config.h"
628 
629 #include <stdio.h>
630 
631 #include "easel.h"
632 #include "esl_getopts.h"
633 #include "esl_random.h"
634 #include "esl_msafile.h"
635 #include "esl_msafile_psiblast.h"
636 
637 static ESL_OPTIONS options[] = {
638    /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
639   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
640   { 0,0,0,0,0,0,0,0,0,0},
641 };
642 static char usage[]  = "[-options]";
643 static char banner[] = "test driver for PSIBLAST MSA format module";
644 
645 int
main(int argc,char ** argv)646 main(int argc, char **argv)
647 {
648   char            msg[]        = "PSI-BLAST MSA i/o module test driver failed";
649   ESL_GETOPTS    *go           = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
650   char            pbfile[32]   = "esltmppbXXXXXX";
651   char            stkfile[32]  = "esltmpstkXXXXXX";
652   FILE           *pbfp, *stkfp;
653   int             testnumber;
654   int             ngoodtests = 2;
655   char            tmpfile[32];
656   FILE           *ofp;
657   int             expected_alphatype;
658   int             expected_nseq;
659   int             expected_alen;
660 
661   if ( esl_tmpfile_named(pbfile,  &pbfp)  != eslOK) esl_fatal(msg);
662   if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
663   write_test_msas(pbfp, stkfp);
664   fclose(pbfp);
665   fclose(stkfp);
666 
667   read_test_msas_digital(pbfile, stkfile);
668   read_test_msas_text   (pbfile, stkfile);
669 
670   /* Various "good" files that should be parsed correctly */
671   for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
672     {
673       strcpy(tmpfile, "esltmpXXXXXX");
674       if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
675       switch (testnumber) {
676       case  1:  utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
677       case  2:  utest_write_good2 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
678       }
679       fclose(ofp);
680       utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
681       remove(tmpfile);
682     }
683 
684   remove(pbfile);
685   remove(stkfile);
686   esl_getopts_Destroy(go);
687   return 0;
688 }
689 #endif /*eslMSAFILE_PSIBLAST_TESTDRIVE*/
690 /*--------------------- end, test driver ------------------------*/
691 
692 
693 
694 
695 /*****************************************************************
696  * 4. Examples.
697  *****************************************************************/
698 
699 #ifdef eslMSAFILE_PSIBLAST_EXAMPLE
700 /* A full-featured example of reading/writing an MSA in PSIBLAST format.
701    gcc -g -Wall -o esl_msafile_psiblast_example -I. -L. -DeslMSAFILE_PSIBLAST_EXAMPLE esl_msafile_psiblast.c -leasel -lm
702    ./esl_msafile_psiblast_example <msafile>
703  */
704 /*::cexcerpt::msafile_psiblast_example::begin::*/
705 #include <stdio.h>
706 
707 #include "easel.h"
708 #include "esl_alphabet.h"
709 #include "esl_getopts.h"
710 #include "esl_msa.h"
711 #include "esl_msafile.h"
712 #include "esl_msafile_psiblast.h"
713 
714 static ESL_OPTIONS options[] = {
715   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
716   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",            0 },
717   { "-1",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "override autodetection; force PSIBLAST format",   0 },
718   { "-q",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "quieter: don't write msa back, just summary",     0 },
719   { "-t",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",              0 },
720   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is DNA",                    0 },
721   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is RNA",                    0 },
722   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is protein",                0 },
723   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
724 };
725 static char usage[]  = "[-options] <msafile>";
726 static char banner[] = "example of guessing, reading, writing PSIBLAST format";
727 
728 int
main(int argc,char ** argv)729 main(int argc, char **argv)
730 {
731   ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
732   char               *filename    = esl_opt_GetArg(go, 1);
733   int                 infmt       = eslMSAFILE_UNKNOWN;
734   ESL_ALPHABET       *abc         = NULL;
735   ESL_MSAFILE        *afp         = NULL;
736   ESL_MSA            *msa         = NULL;
737   int                 status;
738 
739   if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_PSIBLAST;  /* override format autodetection */
740 
741   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
742   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
743   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
744 
745   /* Text mode: pass NULL for alphabet.
746    * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
747    */
748   if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
749   else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
750   if (status != eslOK) esl_msafile_OpenFailure(afp, status);
751 
752   if ((status = esl_msafile_psiblast_Read(afp, &msa)) != eslOK)
753     esl_msafile_ReadFailure(afp, status);
754 
755   printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
756   printf("# of seqs:      %d\n", msa->nseq);
757   printf("# of cols:      %d\n", (int) msa->alen);
758   printf("\n");
759 
760   if (! esl_opt_GetBoolean(go, "-q"))
761     esl_msafile_psiblast_Write(stdout, msa);
762 
763   esl_msa_Destroy(msa);
764   esl_msafile_Close(afp);
765   if (abc) esl_alphabet_Destroy(abc);
766   esl_getopts_Destroy(go);
767   exit(0);
768 }
769 /*::cexcerpt::msafile_psiblast_example::end::*/
770 #endif /*eslMSAFILE_PSIBLAST_EXAMPLE*/
771 
772 
773 #ifdef eslMSAFILE_PSIBLAST_EXAMPLE2
774 /* A minimal example. Read PSIBLAST format MSA, in text mode.
775    gcc -g -Wall -o esl_msafile_psiblast_example2 -I. -L. -DeslMSAFILE_PSIBLAST_EXAMPLE2 esl_msafile_psiblast.c -leasel -lm
776    ./esl_msafile_psiblast_example2 <msafile>
777  */
778 
779 /*::cexcerpt::msafile_psiblast_example2::begin::*/
780 #include <stdio.h>
781 
782 #include "easel.h"
783 #include "esl_msa.h"
784 #include "esl_msafile.h"
785 #include "esl_msafile_psiblast.h"
786 
787 int
main(int argc,char ** argv)788 main(int argc, char **argv)
789 {
790   char         *filename = argv[1];
791   int           fmt      = eslMSAFILE_PSIBLAST;
792   ESL_MSAFILE  *afp      = NULL;
793   ESL_MSA      *msa      = NULL;
794   int           status;
795 
796   if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
797   if ( (status = esl_msafile_psiblast_Read(afp, &msa))                    != eslOK) esl_msafile_ReadFailure(afp, status);
798 
799   printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
800 
801   esl_msafile_psiblast_Write(stdout, msa);
802 
803   esl_msa_Destroy(msa);
804   esl_msafile_Close(afp);
805   exit(0);
806 }
807 /*::cexcerpt::msafile_psiblast_example2::end::*/
808 #endif /*eslMSAFILE_PSIBLAST_EXAMPLE2*/
809 /*--------------------- end of examples -------------------------*/
810