1 /* I/O of multiple sequence alignments in PSI-BLAST format
2 *
3 * Contents:
4 * 1. API for reading/writing PSI-BLAST format
5 * 2. Unit tests.
6 * 3. Test driver.
7 * 4. Examples.
8 */
9 #include "esl_config.h"
10
11 #include <stdio.h>
12 #include <string.h>
13 #include <ctype.h>
14
15 #include "easel.h"
16 #include "esl_alphabet.h"
17 #include "esl_mem.h"
18 #include "esl_msa.h"
19 #include "esl_msafile.h"
20
21 #include "esl_msafile_psiblast.h"
22
23 /*****************************************************************
24 *# 1. API for reading/writing PSI-BLAST format
25 *****************************************************************/
26
27 /* Function: esl_msafile_psiblast_SetInmap()
28 * Synopsis: Set input map specific for PSI-BLAST input.
29 *
30 * Purpose: Set the <afp->inmap> for PSI-BLAST format.
31 *
32 * PSI-BLAST only allows - for a gap. It also disallows O residues.
33 *
34 * Text mode accepts any <isalpha()> character plus '-' but not 'O' or 'o'.
35 * Digital mode enforces the usual Easel alphabets, but disallows "._*~".
36 */
37 int
esl_msafile_psiblast_SetInmap(ESL_MSAFILE * afp)38 esl_msafile_psiblast_SetInmap(ESL_MSAFILE *afp)
39 {
40 int sym;
41
42 if (afp->abc)
43 {
44 for (sym = 0; sym < 128; sym++)
45 afp->inmap[sym] = afp->abc->inmap[sym];
46 afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
47 afp->inmap['.'] = eslDSQ_ILLEGAL;
48 afp->inmap['_'] = eslDSQ_ILLEGAL;
49 afp->inmap['*'] = eslDSQ_ILLEGAL;
50 afp->inmap['~'] = eslDSQ_ILLEGAL;
51 }
52
53 if (! afp->abc)
54 {
55 for (sym = 1; sym < 128; sym++)
56 afp->inmap[sym] = (isalpha(sym) ? sym : eslDSQ_ILLEGAL);
57 afp->inmap[0] = '?';
58 afp->inmap['-'] = '-';
59 }
60
61 afp->inmap['O'] = eslDSQ_ILLEGAL;
62 afp->inmap['o'] = eslDSQ_ILLEGAL;
63 return eslOK;
64 }
65
66 /* Function: esl_msafile_psiblast_GuessAlphabet()
67 * Synopsis: Guess the alphabet of an open PSI-BLAST MSA file.
68 *
69 * Purpose: Guess the alpbabet of the sequences in open
70 * PSI-BLAST format MSA file <afp>.
71 *
72 * On a normal return, <*ret_type> is set to <eslDNA>,
73 * <eslRNA>, or <eslAMINO>, and <afp> is reset to its
74 * original position.
75 *
76 * Args: afp - open PSI-BLAST format MSA file
77 * ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
78 *
79 * Returns: <eslOK> on success.
80 * <eslENOALPHABET> if alphabet type can't be determined.
81 * In either case, <afp> is rewound to the position it
82 * started at.
83 */
84 int
esl_msafile_psiblast_GuessAlphabet(ESL_MSAFILE * afp,int * ret_type)85 esl_msafile_psiblast_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
86 {
87 int alphatype = eslUNKNOWN;
88 esl_pos_t anchor = -1;
89 int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
90 int nsteps = 3;
91 int step = 0;
92 int nres = 0;
93 int x;
94 int64_t ct[26];
95 char *p, *tok;
96 esl_pos_t n, toklen, pos;
97 int status;
98
99 for (x = 0; x < 26; x++) ct[x] = 0;
100
101 anchor = esl_buffer_GetOffset(afp->bf);
102 if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
103
104 while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
105 {
106 if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
107 /* p now points to the rest of the sequence line, after a name */
108
109 /* count characters into ct[] array */
110 for (pos = 0; pos < n; pos++)
111 if (isalpha(p[pos])) {
112 x = toupper(p[pos]) - 'A';
113 ct[x]++;
114 nres++;
115 }
116
117 /* try to stop early, checking after 500, 5000, and 50000 residues: */
118 if (step < nsteps && nres > threshold[step]) {
119 if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
120 step++;
121 }
122 }
123 if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
124 status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
125
126 DONE:
127 esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
128 esl_buffer_RaiseAnchor(afp->bf, anchor);
129 *ret_type = alphatype;
130 return status;
131
132 ERROR:
133 if (anchor != -1) {
134 esl_buffer_SetOffset(afp->bf, anchor);
135 esl_buffer_RaiseAnchor(afp->bf, anchor);
136 }
137 *ret_type = eslUNKNOWN;
138 return status;
139 }
140
141 /* Function: esl_msafile_psiblast_Read()
142 * Synopsis: Read an alignment in PSI-BLAST's input format.
143 *
144 * Purpose: Read an MSA from an open <ESL_MSAFILE> <afp>, parsing for
145 * PSI-BLAST input format, starting from the current point.
146 * Create a new multiple alignment, and return a ptr to
147 * that alignment via <*ret_msa>. Caller is responsible for
148 * free'ing this <ESL_MSA>.
149 *
150 * The <msa> has a reference line (<msa->rf[]>) that
151 * corresponds to the uppercase/lowercase columns in the
152 * alignment: consensus (uppercase) columns are marked 'x',
153 * and insert (lowercase) columns are marked '.' in this RF
154 * line.
155 *
156 * Args: afp - open <ESL_MSAFILE>
157 * ret_msa - RETURN: newly parsed <ESL_MSA>
158 *
159 * Returns: <eslOK> on success. <*ret_msa> contains the newly
160 * allocated MSA. <afp> is at EOF.
161 *
162 * <eslEOF> if no (more) alignment data are found in
163 * <afp>, and <afp> is returned at EOF.
164 *
165 * <eslEFORMAT> on a parse error. <*ret_msa> is set to
166 * <NULL>. <afp> contains information sufficient for
167 * constructing useful diagnostic output:
168 * | <afp->errmsg> | user-directed error message |
169 * | <afp->linenumber> | line # where error was detected |
170 * | <afp->line> | offending line (not NUL-term) |
171 * | <afp->n> | length of offending line |
172 * | <afp->bf->filename> | name of the file |
173 * and <afp> is poised at the start of the following line,
174 * so (in principle) the caller could try to resume
175 * parsing.
176 *
177 * Throws: <eslEMEM> on allocation error.
178 * <eslESYS> if a system call fails, such as fread().
179 * <eslEINCONCEIVABLE> - "impossible" corruption
180 * On these, <*ret_msa> is returned <NULL>, and the state of
181 * <afp> is undefined.
182 */
183 int
esl_msafile_psiblast_Read(ESL_MSAFILE * afp,ESL_MSA ** ret_msa)184 esl_msafile_psiblast_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
185 {
186 ESL_MSA *msa = NULL;
187 int idx = 0; /* counter over sequences in a block */
188 int nblocks = 0; /* counter over blocks */
189 int64_t alen = 0;
190 int nseq = 0;
191 int64_t cur_alen;
192 esl_pos_t pos; /* position on a line */
193 esl_pos_t name_start, name_len;
194 esl_pos_t seq_start, seq_len;
195 esl_pos_t block_seq_start, block_seq_len;
196 int status;
197
198 ESL_DASSERT1( (afp->format == eslMSAFILE_PSIBLAST) );
199
200 afp->errmsg[0] = '\0';
201
202 /* allocate a growable MSA. We set msa->{nseq,alen} only when we're done. */
203 if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
204 if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
205
206 /* skip leading blank lines in file */
207 while ( (status = esl_msafile_GetLine(afp, NULL, NULL)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
208 if (status != eslOK) goto ERROR; /* includes normal EOF */
209
210 /* Read the file a line at a time; if a parsing error occurs, detect immediately, with afp->linenumber set correctly */
211 do { /* while in the file... */
212 idx = 0;
213 do { /* while in a block... */
214 for (pos = 0; pos < afp->n; pos++) if (! isspace(afp->line[pos])) break;
215 name_start = pos;
216 for (pos = pos+1; pos < afp->n; pos++) if ( isspace(afp->line[pos])) break;
217 name_len = pos - name_start;
218 for (pos = pos+1; pos < afp->n; pos++) if (! isspace(afp->line[pos])) break;
219 seq_start = pos;
220 if (pos >= afp->n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
221 for (pos = afp->n-1; pos > 0; pos--) if (! isspace(afp->line[pos])) break;
222 seq_len = pos - seq_start + 1;
223
224 if (idx == 0) {
225 block_seq_start = seq_start;
226 block_seq_len = seq_len;
227 } else {
228 if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
229 if (seq_len != block_seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
230 }
231
232 /* Process the consensus #=RF line. */
233 if (idx == 0) {
234 ESL_REALLOC(msa->rf, sizeof(char) * (alen + seq_len + 1));
235 for (pos = 0; pos < seq_len; pos++) msa->rf[alen+pos] = '-'; /* anything neutral other than . or x will do. */
236 msa->rf[alen+pos] = '\0';
237 }
238 for (pos = 0; pos < seq_len; pos++)
239 {
240 if (afp->line[seq_start+pos] == '-') continue;
241 if (isupper(afp->line[seq_start+pos])) {
242 if (msa->rf[alen+pos] == '.') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected upper case residue (#%d on line)", (int) pos+1);
243 msa->rf[alen+pos] = 'x';
244 }
245 if (islower(afp->line[seq_start+pos])) {
246 if (msa->rf[alen+pos] == 'x') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected lower case residue (#%d on line)", (int) pos+1);
247 msa->rf[alen+pos] = '.';
248 }
249 }
250
251 /* Store the sequence name. */
252 if (nblocks == 0) {
253 /* make sure we have room for another sequence */
254 if (idx >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
255 if ( (status = esl_msa_SetSeqName(msa, idx, afp->line+name_start, name_len)) != eslOK) goto ERROR;
256 } else {
257 if (! esl_memstrcmp(afp->line+name_start, name_len, msa->sqname[idx]))
258 ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, afp->line+name_start);
259 }
260
261 /* Append the sequence. */
262 cur_alen = alen;
263 if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &(cur_alen), afp->line+seq_start, seq_len); }
264 if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), afp->line+seq_start, seq_len); }
265 if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
266 else if (status != eslOK) goto ERROR;
267 if (cur_alen - alen != seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");
268
269 /* get next line. if it's blank, or if we're EOF, we're done with the block */
270 idx++;
271 status = esl_msafile_GetLine(afp, NULL, NULL);
272 } while (status == eslOK && esl_memspn(afp->line, afp->n, " \t") < afp->n); /* blank line ends a block. */
273 if (status != eslOK && status != eslEOF) goto ERROR;
274 /* End of one block */
275
276 if (nblocks == 0) nseq = idx;
277 else if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
278 alen += block_seq_len;
279 nblocks++;
280
281 /* skip blank lines to start of next block, if any */
282 while ( (status = esl_msafile_GetLine(afp, NULL, NULL)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
283 } while (status == eslOK);
284 if (status != eslEOF) goto ERROR;
285
286 msa->nseq = nseq;
287 msa->alen = alen;
288 if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
289 *ret_msa = msa;
290 return eslOK;
291
292 ERROR:
293 if (msa) esl_msa_Destroy(msa);
294 *ret_msa = NULL;
295 return status;
296 }
297
298
299 /* Function: esl_msafile_psiblast_Write()
300 * Synopsis: Write an MSA to a stream in PSI-BLAST format
301 *
302 * Purpose: Write alignment <msa> in NCBI PSI-BLAST format to
303 * stream <fp>.
304 *
305 * The <msa> should have a valid reference line <msa->rf>,
306 * with alphanumeric characters marking consensus (match)
307 * columns, and non-alphanumeric characters marking
308 * nonconsensus (insert) columns. If it does not have RF
309 * annotation, then the first sequence in the <msa>
310 * defines the "consensus".
311 *
312 * PSI-BLAST format allows only one symbol ('-') for gaps,
313 * and cannot represent missing data symbols (Easel's
314 * '~'). Any missing data symbols are converted to gaps.
315 *
316 * Args: fp - open output stream
317 * msa - MSA to write
318 *
319 * Returns: <eslOK> on success.
320 *
321 * Throws: <eslEMEM> on allocation failure.
322 * <eslEWRITE> on any system write failure, such as filled disk.
323 */
324 int
esl_msafile_psiblast_Write(FILE * fp,const ESL_MSA * msa)325 esl_msafile_psiblast_Write(FILE *fp, const ESL_MSA *msa)
326 {
327 char *buf = NULL;
328 int cpl = 60;
329 int acpl;
330 int i;
331 int sym;
332 int64_t pos, bpos;
333 int maxnamewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq);
334 int is_consensus;
335 int is_residue;
336 int status;
337
338 ESL_ALLOC(buf, sizeof(char) * (cpl+1));
339
340 for (pos = 0; pos < msa->alen; pos += cpl)
341 {
342 for (i = 0; i < msa->nseq; i++)
343 {
344 acpl = (msa->alen - pos > cpl)? cpl : msa->alen - pos;
345
346 if (msa->abc)
347 {
348 for (bpos = 0; bpos < acpl; bpos++)
349 {
350 sym = msa->abc->sym[msa->ax[i][pos + bpos + 1]];
351 is_residue = esl_abc_XIsResidue(msa->abc, msa->ax[i][pos+bpos+1]);
352 if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE);
353 else is_consensus = (esl_abc_XIsResidue(msa->abc, msa->ax[0][pos+bpos+1]) ? TRUE : FALSE);
354
355 if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); }
356 else { buf[bpos] = (is_residue ? tolower(sym) : '-'); }
357 }
358 }
359
360 if (! msa->abc)
361 {
362 for (bpos = 0; bpos < acpl; bpos++)
363 {
364 sym = msa->aseq[i][pos + bpos];
365 is_residue = isalnum(sym);
366 if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE);
367 else is_consensus = (isalnum(msa->aseq[0][pos+bpos]) ? TRUE : FALSE);
368
369 if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); }
370 else { buf[bpos] = (is_residue ? tolower(sym) : '-'); }
371 }
372 }
373 buf[acpl] = '\0';
374 if (fprintf(fp, "%-*s %s\n", maxnamewidth, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed");
375 } /* end loop over sequences */
376
377 if (pos + cpl < msa->alen)
378 { if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed"); }
379 }
380 free(buf);
381 return eslOK;
382
383 ERROR:
384 if (buf) free(buf);
385 return status;
386 }
387 /*----------- end, API for i/o of psi-blast format --------------*/
388
389
390
391 /*****************************************************************
392 * 2. Unit tests.
393 *****************************************************************/
394 #ifdef eslMSAFILE_PSIBLAST_TESTDRIVE
395 static void
utest_write_good1(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)396 utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
397 {
398 fputs("MYG_PHYCA --------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\n", ofp);
399 fputs("GLB5_PETMA pivdtgsvApLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
400 fputs("HBB_HUMAN --------VhLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\n", ofp);
401 fputs("HBA_HUMAN --------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\n", ofp);
402 fputs("\n", ofp);
403 fputs("MYG_PHYCA EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEAELKPLAQSHATKHKIPIKYLEFIS\n", ofp);
404 fputs("GLB5_PETMA ADQLKKSADVRWHAERIINAVNDAVASMDDtekMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
405 fputs("HBB_HUMAN PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\n", ofp);
406 fputs("HBA_HUMAN -DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\n", ofp);
407 fputs("\n", ofp);
408 fputs("MYG_PHYCA EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
409 fputs("GLB5_PETMA AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
410 fputs("HBB_HUMAN NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
411 fputs("HBA_HUMAN HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
412
413 *ret_alphatype = eslAMINO;
414 *ret_nseq = 4;
415 *ret_alen = 165;
416 }
417
418 static void
utest_write_good2(FILE * ofp,int * ret_alphatype,int * ret_nseq,int * ret_alen)419 utest_write_good2(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
420 {
421 fputs("tRNA2 UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGG-AGACCGGGGUUCGACU\n", ofp);
422 fputs("tRNA3 UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCcAGAUCGGGGUUCAAUU\n", ofp);
423 fputs("tRNA5 GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGaGGUCAUCGGUUCGAUU\n", ofp);
424 fputs("tRNA1 GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGaGGUCCUGUGUUCGAUC\n", ofp);
425 fputs("tRNA4 GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUuGGUCCUUAGUUCGAUC\n", ofp);
426 fputs("\n", ofp);
427 fputs("tRNA2 CCCCGUAUCGGAG\n", ofp);
428 fputs("tRNA3 CCCCGUCGCGGAG\n", ofp);
429 fputs("tRNA5 CCGGUUGCGUCCA\n", ofp);
430 fputs("tRNA1 CACAGAAUUCGCA\n", ofp);
431 fputs("tRNA4 CUGAGUGCGAGCU\n", ofp);
432 *ret_alphatype = eslRNA;
433 *ret_nseq = 5;
434 *ret_alen = 73;
435 }
436
437 static void
utest_goodfile(char * filename,int testnumber,int expected_alphatype,int expected_nseq,int expected_alen)438 utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
439 {
440 ESL_ALPHABET *abc = NULL;
441 ESL_MSAFILE *afp = NULL;
442 ESL_MSA *msa1 = NULL;
443 ESL_MSA *msa2 = NULL;
444 char tmpfile1[32] = "esltmpXXXXXX";
445 char tmpfile2[32] = "esltmpXXXXXX";
446 FILE *ofp = NULL;
447 int status;
448
449 /* guessing both the format and the alphabet should work: this is a digital open */
450 /* PSIBLAST format is autodetected as SELEX, which is fine - selex parser is more general */
451 if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: digital open", testnumber);
452 if (afp->format != eslMSAFILE_SELEX) esl_fatal("psiblast good file test %d failed: format autodetection", testnumber);
453 if (abc->type != expected_alphatype) esl_fatal("psiblast good file test %d failed: alphabet autodetection", testnumber);
454 afp->format = eslMSAFILE_PSIBLAST;
455
456 /* This is a digital read, using <abc>. */
457 if ( (status = esl_msafile_psiblast_Read(afp, &msa1)) != eslOK) esl_fatal("psiblast good file test %d failed: msa read, digital", testnumber);
458 if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("psiblast good file test %d failed: nseq/alen", testnumber);
459 if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa1 invalid", testnumber);
460 esl_msafile_Close(afp);
461
462 /* write it back out to a new tmpfile (digital write) */
463 if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation", testnumber);
464 if ( (status = esl_msafile_psiblast_Write(ofp, msa1)) != eslOK) esl_fatal("psiblast good file test %d failed: msa write, digital", testnumber);
465 fclose(ofp);
466
467 /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
468 if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: text mode open", testnumber);
469 if ( (status = esl_msafile_psiblast_Read(afp, &msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: msa read, text", testnumber);
470 if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("psiblast good file test %d failed: nseq/alen", testnumber);
471 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa2 invalid", testnumber);
472 esl_msafile_Close(afp);
473
474 /* write it back out to a new tmpfile (text write) */
475 if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation", testnumber);
476 if ( (status = esl_msafile_psiblast_Write(ofp, msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: msa write, text", testnumber);
477 fclose(ofp);
478 esl_msa_Destroy(msa2);
479
480 /* open and read it in digital mode */
481 if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital mode open", testnumber);
482 if ( (status = esl_msafile_psiblast_Read(afp, &msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital msa read", testnumber);
483 if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa2 invalid", testnumber);
484 esl_msafile_Close(afp);
485
486 /* this msa <msa2> should be identical to <msa1> */
487 if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("psiblast good file test %d failed: msa compare", testnumber);
488
489 remove(tmpfile1);
490 remove(tmpfile2);
491 esl_msa_Destroy(msa1);
492 esl_msa_Destroy(msa2);
493 esl_alphabet_Destroy(abc);
494 }
495
496 static void
write_test_msas(FILE * ofp1,FILE * ofp2)497 write_test_msas(FILE *ofp1, FILE *ofp2)
498 {
499 fprintf(ofp1, "\n");
500 fprintf(ofp1, "seq1 --ACDEFGHIKLMNPQRSTVWY\n");
501 fprintf(ofp1, "seq2 --ACDEFGHIKLMNPQRSTV-- \n");
502 fprintf(ofp1, "seq3 aaACDEFGHIKLMNPQRSTV-- \n");
503 fprintf(ofp1, "seq4 --ACDEFGHIKLMNPQRSTVWY \n");
504 fprintf(ofp1, "\n");
505 fprintf(ofp1, "seq1 ACDEFGHIKLMNPQRSTVWY--\n");
506 fprintf(ofp1, "seq2 ACDEFGHIKLMNPQRSTVWYyy\n");
507 fprintf(ofp1, "seq3 ACDEFGHIKLMNPQRSTVWY--\n");
508 fprintf(ofp1, "seq4 ACDEFGHIKLMNPQRSTVWY--\n");
509 fprintf(ofp1, "\n");
510
511 fprintf(ofp2, "# STOCKHOLM 1.0\n");
512 fprintf(ofp2, "\n");
513 fprintf(ofp2, "#=GC RF ..xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..\n");
514 fprintf(ofp2, "seq1 --ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY--\n");
515 fprintf(ofp2, "seq2 --ACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWYyy\n");
516 fprintf(ofp2, "seq3 aaACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWY--\n");
517 fprintf(ofp2, "seq4 --ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY--\n");
518 fprintf(ofp2, "//\n");
519 }
520
521 static void
read_test_msas_digital(char * pbfile,char * stkfile)522 read_test_msas_digital(char *pbfile, char *stkfile)
523 {
524 char msg[] = "PSIBLAST msa digital read unit test failed";
525 ESL_ALPHABET *abc = NULL;
526 ESL_MSAFILE *afp1 = NULL;
527 ESL_MSAFILE *afp2 = NULL;
528 ESL_MSA *msa1, *msa2, *msa3, *msa4;
529 FILE *pbfp, *stkfp;
530 char pbfile2[32] = "esltmppb2XXXXXX";
531 char stkfile2[32] = "esltmpstk2XXXXXX";
532
533 if ( esl_msafile_Open(&abc, pbfile, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
534 if ( !abc || abc->type != eslAMINO) esl_fatal(msg);
535 if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
536 if ( esl_msafile_psiblast_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
537 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
538 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
539
540 if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
541 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
542
543 esl_msafile_Close(afp2);
544 esl_msafile_Close(afp1);
545
546 /* Now write stk to psiblast file, and vice versa; then retest */
547 if ( esl_tmpfile_named(pbfile2, &pbfp) != eslOK) esl_fatal(msg);
548 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
549 if ( esl_msafile_psiblast_Write (pbfp, msa2) != eslOK) esl_fatal(msg);
550 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
551 fclose(pbfp);
552 fclose(stkfp);
553 if ( esl_msafile_Open(&abc, pbfile2, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
554 if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
555 if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
556 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
557 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
558
559 remove(pbfile2);
560 remove(stkfile2);
561 esl_msafile_Close(afp2);
562 esl_msafile_Close(afp1);
563
564 esl_msa_Destroy(msa1);
565 esl_msa_Destroy(msa2);
566 esl_msa_Destroy(msa3);
567 esl_msa_Destroy(msa4);
568 esl_alphabet_Destroy(abc);
569 }
570
571 static void
read_test_msas_text(char * pbfile,char * stkfile)572 read_test_msas_text(char *pbfile, char *stkfile)
573 {
574 char msg[] = "PSIBLAST msa text-mode read unit test failed";
575 ESL_MSAFILE *afp1 = NULL;
576 ESL_MSAFILE *afp2 = NULL;
577 ESL_MSA *msa1, *msa2, *msa3, *msa4;
578 FILE *pbfp, *stkfp;
579 char pbfile2[32] = "esltmppb2XXXXXX";
580 char stkfile2[32] = "esltmpstk2XXXXXX";
581
582 /* vvvv-- everything's the same as the digital utest except these NULLs */
583 if ( esl_msafile_Open(NULL, pbfile, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
584 if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
585 if ( esl_msafile_psiblast_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
586 if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
587 if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
588 if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
589 if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
590 esl_msafile_Close(afp2);
591 esl_msafile_Close(afp1);
592
593 if ( esl_tmpfile_named(pbfile2, &pbfp) != eslOK) esl_fatal(msg);
594 if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
595 if ( esl_msafile_psiblast_Write (pbfp, msa2) != eslOK) esl_fatal(msg);
596 if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
597 fclose(pbfp);
598 fclose(stkfp);
599 if ( esl_msafile_Open(NULL, pbfile2, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
600 if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
601 if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
602 if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
603 if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
604
605 remove(pbfile2);
606 remove(stkfile2);
607 esl_msafile_Close(afp2);
608 esl_msafile_Close(afp1);
609
610 esl_msa_Destroy(msa1);
611 esl_msa_Destroy(msa2);
612 esl_msa_Destroy(msa3);
613 esl_msa_Destroy(msa4);
614 }
615 #endif /*eslMSAFILE_PSIBLAST_TESTDRIVE*/
616 /*---------------------- end, unit tests ------------------------*/
617
618
619 /*****************************************************************
620 * 3. Test driver.
621 *****************************************************************/
622 #ifdef eslMSAFILE_PSIBLAST_TESTDRIVE
623 /* compile: gcc -g -Wall -I. -L. -o esl_msafile_psiblast_utest -DeslMSAFILE_PSIBLAST_TESTDRIVE esl_msafile_psiblast.c -leasel -lm
624 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_psiblast_utest -DeslMSAFILE_PSIBLAST_TESTDRIVE esl_msafile_psiblast.c -leasel -lm
625 * run: ./esl_msafile_psiblast_utest
626 */
627 #include "esl_config.h"
628
629 #include <stdio.h>
630
631 #include "easel.h"
632 #include "esl_getopts.h"
633 #include "esl_random.h"
634 #include "esl_msafile.h"
635 #include "esl_msafile_psiblast.h"
636
637 static ESL_OPTIONS options[] = {
638 /* name type default env range togs reqs incomp help docgrp */
639 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
640 { 0,0,0,0,0,0,0,0,0,0},
641 };
642 static char usage[] = "[-options]";
643 static char banner[] = "test driver for PSIBLAST MSA format module";
644
645 int
main(int argc,char ** argv)646 main(int argc, char **argv)
647 {
648 char msg[] = "PSI-BLAST MSA i/o module test driver failed";
649 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
650 char pbfile[32] = "esltmppbXXXXXX";
651 char stkfile[32] = "esltmpstkXXXXXX";
652 FILE *pbfp, *stkfp;
653 int testnumber;
654 int ngoodtests = 2;
655 char tmpfile[32];
656 FILE *ofp;
657 int expected_alphatype;
658 int expected_nseq;
659 int expected_alen;
660
661 if ( esl_tmpfile_named(pbfile, &pbfp) != eslOK) esl_fatal(msg);
662 if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
663 write_test_msas(pbfp, stkfp);
664 fclose(pbfp);
665 fclose(stkfp);
666
667 read_test_msas_digital(pbfile, stkfile);
668 read_test_msas_text (pbfile, stkfile);
669
670 /* Various "good" files that should be parsed correctly */
671 for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
672 {
673 strcpy(tmpfile, "esltmpXXXXXX");
674 if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
675 switch (testnumber) {
676 case 1: utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
677 case 2: utest_write_good2 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
678 }
679 fclose(ofp);
680 utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
681 remove(tmpfile);
682 }
683
684 remove(pbfile);
685 remove(stkfile);
686 esl_getopts_Destroy(go);
687 return 0;
688 }
689 #endif /*eslMSAFILE_PSIBLAST_TESTDRIVE*/
690 /*--------------------- end, test driver ------------------------*/
691
692
693
694
695 /*****************************************************************
696 * 4. Examples.
697 *****************************************************************/
698
699 #ifdef eslMSAFILE_PSIBLAST_EXAMPLE
700 /* A full-featured example of reading/writing an MSA in PSIBLAST format.
701 gcc -g -Wall -o esl_msafile_psiblast_example -I. -L. -DeslMSAFILE_PSIBLAST_EXAMPLE esl_msafile_psiblast.c -leasel -lm
702 ./esl_msafile_psiblast_example <msafile>
703 */
704 /*::cexcerpt::msafile_psiblast_example::begin::*/
705 #include <stdio.h>
706
707 #include "easel.h"
708 #include "esl_alphabet.h"
709 #include "esl_getopts.h"
710 #include "esl_msa.h"
711 #include "esl_msafile.h"
712 #include "esl_msafile_psiblast.h"
713
714 static ESL_OPTIONS options[] = {
715 /* name type default env range toggles reqs incomp help docgroup*/
716 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
717 { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "override autodetection; force PSIBLAST format", 0 },
718 { "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
719 { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
720 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
721 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
722 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
723 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
724 };
725 static char usage[] = "[-options] <msafile>";
726 static char banner[] = "example of guessing, reading, writing PSIBLAST format";
727
728 int
main(int argc,char ** argv)729 main(int argc, char **argv)
730 {
731 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
732 char *filename = esl_opt_GetArg(go, 1);
733 int infmt = eslMSAFILE_UNKNOWN;
734 ESL_ALPHABET *abc = NULL;
735 ESL_MSAFILE *afp = NULL;
736 ESL_MSA *msa = NULL;
737 int status;
738
739 if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_PSIBLAST; /* override format autodetection */
740
741 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
742 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
743 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
744
745 /* Text mode: pass NULL for alphabet.
746 * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
747 */
748 if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
749 else status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
750 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
751
752 if ((status = esl_msafile_psiblast_Read(afp, &msa)) != eslOK)
753 esl_msafile_ReadFailure(afp, status);
754
755 printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
756 printf("# of seqs: %d\n", msa->nseq);
757 printf("# of cols: %d\n", (int) msa->alen);
758 printf("\n");
759
760 if (! esl_opt_GetBoolean(go, "-q"))
761 esl_msafile_psiblast_Write(stdout, msa);
762
763 esl_msa_Destroy(msa);
764 esl_msafile_Close(afp);
765 if (abc) esl_alphabet_Destroy(abc);
766 esl_getopts_Destroy(go);
767 exit(0);
768 }
769 /*::cexcerpt::msafile_psiblast_example::end::*/
770 #endif /*eslMSAFILE_PSIBLAST_EXAMPLE*/
771
772
773 #ifdef eslMSAFILE_PSIBLAST_EXAMPLE2
774 /* A minimal example. Read PSIBLAST format MSA, in text mode.
775 gcc -g -Wall -o esl_msafile_psiblast_example2 -I. -L. -DeslMSAFILE_PSIBLAST_EXAMPLE2 esl_msafile_psiblast.c -leasel -lm
776 ./esl_msafile_psiblast_example2 <msafile>
777 */
778
779 /*::cexcerpt::msafile_psiblast_example2::begin::*/
780 #include <stdio.h>
781
782 #include "easel.h"
783 #include "esl_msa.h"
784 #include "esl_msafile.h"
785 #include "esl_msafile_psiblast.h"
786
787 int
main(int argc,char ** argv)788 main(int argc, char **argv)
789 {
790 char *filename = argv[1];
791 int fmt = eslMSAFILE_PSIBLAST;
792 ESL_MSAFILE *afp = NULL;
793 ESL_MSA *msa = NULL;
794 int status;
795
796 if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
797 if ( (status = esl_msafile_psiblast_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status);
798
799 printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
800
801 esl_msafile_psiblast_Write(stdout, msa);
802
803 esl_msa_Destroy(msa);
804 esl_msafile_Close(afp);
805 exit(0);
806 }
807 /*::cexcerpt::msafile_psiblast_example2::end::*/
808 #endif /*eslMSAFILE_PSIBLAST_EXAMPLE2*/
809 /*--------------------- end of examples -------------------------*/
810