1 /* Unaligned ascii sequence file i/o.
2 *
3 * Contents:
4 * 1. An <ESL_SQFILE> object, in text mode.
5 * 2. An <ESL_SQFILE> object, in digital mode.
6 * 3. Miscellaneous routines.
7 * 4. Sequence reading (sequential).
8 * 5. Sequence/subsequence fetching, random access
9 * 6. Internal routines shared by parsers.
10 * 7. Internal routines for EMBL format (including UniProt, TrEMBL)
11 * 8. Internal routines for GenBank format
12 * 9. Internal routines for FASTA format
13 * 10. Internal routines for daemon format
14 * 11. Internal routines for HMMPGMD format
15 *
16 * This module shares remote evolutionary homology with Don Gilbert's
17 * seminal, public domain ReadSeq package, though the last common
18 * ancestor was circa 1991 and no recognizable vestiges are likely to
19 * remain. Thanks Don!
20 *
21 */
22 #include "esl_config.h"
23
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <ctype.h>
28 #include <unistd.h>
29
30 #include "easel.h"
31 #include "esl_alphabet.h"
32 #include "esl_msa.h"
33 #include "esl_msafile.h"
34 #include "esl_sqio.h"
35 #include "esl_sq.h"
36 #include "esl_ssi.h"
37
38 /* format specific routines */
39 static int sqascii_GuessFileFormat(ESL_SQFILE *sqfp, int *ret_fmt);
40 static int sqascii_Position (ESL_SQFILE *sqfp, off_t offset);
41 static void sqascii_Close (ESL_SQFILE *sqfp);
42 static int sqascii_SetDigital (ESL_SQFILE *sqfp, const ESL_ALPHABET *abc);
43 static int sqascii_GuessAlphabet (ESL_SQFILE *sqfp, int *ret_type);
44 static int sqascii_Read (ESL_SQFILE *sqfp, ESL_SQ *sq);
45 static int sqascii_ReadInfo (ESL_SQFILE *sqfp, ESL_SQ *sq);
46 static int sqascii_ReadSequence (ESL_SQFILE *sqfp, ESL_SQ *sq);
47 static int sqascii_ReadWindow (ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq);
48 static int sqascii_ReadBlock (ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target);
49 static int sqascii_Echo (ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp);
50
51 static int sqascii_IsRewindable (const ESL_SQFILE *sqfp);
52 static const char *sqascii_GetError (const ESL_SQFILE *sqfp);
53
54 static int sqascii_OpenSSI (ESL_SQFILE *sqfp, const char *ssifile_hint);
55 static int sqascii_PositionByKey (ESL_SQFILE *sqfp, const char *key);
56 static int sqascii_PositionByNumber(ESL_SQFILE *sqfp, int which);
57 static int sqascii_Fetch (ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq);
58 static int sqascii_FetchInfo (ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq);
59 static int sqascii_FetchSubseq (ESL_SQFILE *sqfp, const char *source, int64_t start, int64_t end, ESL_SQ *sq);
60
61 /* Internal routines shared by parsers. */
62 static int loadmem (ESL_SQFILE *sqfp);
63 static int loadbuf (ESL_SQFILE *sqfp);
64 static int nextchar (ESL_SQFILE *sqfp, char *ret_c);
65 static int seebuf (ESL_SQFILE *sqfp, int64_t maxn, int64_t *opt_nres, int64_t *opt_endpos);
66 static void addbuf (ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nres);
67 static void skipbuf (ESL_SQFILE *sqfp, int64_t nskip);
68 static int read_nres(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nskip, int64_t nres, int64_t *opt_actual_nres);
69 static int skip_whitespace(ESL_SQFILE *sqfp);
70
71 /* EMBL format; also UniProt, TrEMBL */
72 static void config_embl(ESL_SQFILE *sqfp);
73 static void inmap_embl (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
74 static int header_embl(ESL_SQFILE *sqfp, ESL_SQ *sq);
75 static int skip_embl (ESL_SQFILE *sqfp, ESL_SQ *sq);
76 static int end_embl (ESL_SQFILE *sqfp, ESL_SQ *sq);
77
78 /* GenBank format; also DDBJ */
79 static void config_genbank(ESL_SQFILE *sqfp);
80 static void inmap_genbank (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
81 static int header_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq);
82 static int skip_genbank (ESL_SQFILE *sqfp, ESL_SQ *sq);
83 static int end_genbank (ESL_SQFILE *sqfp, ESL_SQ *sq);
84
85 /* FASTA format */
86 static void config_fasta(ESL_SQFILE *sqfp);
87 static void inmap_fasta (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
88 static int header_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq);
89 static int skip_fasta (ESL_SQFILE *sqfp, ESL_SQ *sq);
90 static int end_fasta (ESL_SQFILE *sqfp, ESL_SQ *sq);
91
92 /* daemon format */
93 static void config_daemon(ESL_SQFILE *sqfp);
94 static void inmap_daemon (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
95 static int end_daemon (ESL_SQFILE *sqfp, ESL_SQ *sq);
96
97 /* HMMPGMD format */
98 static int fileheader_hmmpgmd(ESL_SQFILE *sqfp);
99
100
101 /*****************************************************************
102 *# 1. An <ESL_SQFILE> object, in text mode.
103 *****************************************************************/
104
105 /* Function: esl_sqascii_Open()
106 * Synopsis: Open a sequence file for reading.
107 *
108 * Purpose: Open a sequence file <filename> for reading.
109 * The opened <ESL_SQFILE> is returned through <ret_sqfp>.
110 *
111 * The format of the file is asserted to be <format> (for
112 * example, <eslSQFILE_FASTA>). If <format> is
113 * <eslSQFILE_UNKNOWN> then the routine attempts to
114 * autodetect the file format.
115 *
116 * There are two special cases for <filename>. If
117 * <filename> is "-", the sequence data are read from a
118 * <STDIN> pipe. If <filename> ends in ".gz", the file is
119 * assumed to be compressed with <gzip>, and it is opened
120 * by a pipe from <gzip -dc>. Reading gzip files only works
121 * on POSIX-compliant systems that have pipes
122 * (specifically, the POSIX.2 popen() call).
123 *
124 * Returns: <eslOK> on success, and <*ret_sqfp> points to a new
125 * open <ESL_SQFILE>. Caller deallocates this object with
126 * <esl_sqfile_Close()>.
127 *
128 * Returns <eslENOTFOUND> if <filename> can't be found or
129 * opened. Returns <eslEFORMAT> if the file is empty, or
130 * if autodetection is attempted and the format can't be
131 * determined. On any error condition, <*ret_sqfp> is
132 * returned NULL.
133 *
134 * Throws: <eslEMEM> on allocation failure.
135 */
136 int
esl_sqascii_Open(char * filename,int format,ESL_SQFILE * sqfp)137 esl_sqascii_Open(char *filename, int format, ESL_SQFILE *sqfp)
138 {
139 int status;/* return status from an ESL call */
140 int n;
141
142 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
143
144 /* before we go any further, make sure we can handle the format */
145 if (format == eslSQFILE_NCBI) return eslENOTFOUND;
146
147 /* Default initializations */
148 ascii->fp = NULL;
149 ascii->do_gzip = FALSE;
150 ascii->do_stdin = FALSE;
151 ascii->do_buffer = FALSE;
152
153 ascii->mem = NULL;
154 ascii->allocm = 0;
155 ascii->mn = 0;
156 ascii->mpos = 0;
157 ascii->moff = -1;
158 ascii->is_recording = FALSE;
159
160 ascii->buf = NULL;
161 ascii->boff = 0;
162 ascii->balloc = 0;
163 ascii->nc = 0;
164 ascii->bpos = 0;
165 ascii->L = 0;
166 ascii->linenumber = 1;
167
168 ascii->bookmark_offset = 0;
169 ascii->bookmark_linenum = 0;
170
171 ascii->is_linebased = FALSE;
172 ascii->eof_is_ok = FALSE;
173 ascii->parse_header = NULL;
174 ascii->skip_header = NULL;
175 ascii->parse_end = NULL;
176
177 ascii->afp = NULL;
178 ascii->msa = NULL;
179 ascii->idx = -1;
180
181 ascii->ssifile = NULL;
182 ascii->rpl = -1; /* -1 = not set yet */
183 ascii->bpl = -1; /* (ditto) */
184 ascii->prvrpl = -1; /* (ditto) */
185 ascii->prvbpl = -1; /* (ditto) */
186 ascii->currpl = -1;
187 ascii->curbpl = -1;
188 ascii->ssi = NULL;
189
190 /* MSA formats are handled entirely by msafile module -
191 * let it handle stdin, .gz, etc
192 */
193 if (! esl_sqio_IsAlignment(format))
194 {
195 if (strcmp(filename, "-") == 0) /* stdin special case */
196 {
197 ascii->fp = stdin;
198 ascii->do_stdin = TRUE;
199 }
200 else
201 { /* Check the current working directory first. */
202 if ((ascii->fp = fopen(filename, "r")) == NULL) {
203 status = eslENOTFOUND;
204 goto ERROR;
205 }
206 }
207
208 /* Deal with the .gz special case: to popen(), "success" only means
209 * it found and executed gzip -dc. If gzip -dc doesn't find our
210 * file, popen() still blithely returns success, so we have to be
211 * sure the file exists. That's why we fopen()'ed it above, only to
212 * close it and popen() it here.
213 */
214 #ifdef HAVE_POPEN
215 n = strlen(filename);
216 if (n > 3 && strcmp(filename+n-3, ".gz") == 0)
217 {
218 char *cmd;
219 fclose(ascii->fp);
220 ESL_ALLOC(cmd, sizeof(char) * (n+1+strlen("gzip -dc ")));
221 sprintf(cmd, "gzip -dc %s", filename);
222 ascii->fp = popen(cmd, "r");
223 if (ascii->fp == NULL) { status = eslENOTFOUND; goto ERROR; }
224 ascii->do_gzip = TRUE;
225 free(cmd);
226 }
227 #endif /*HAVE_POPEN*/
228
229 /* If we don't know the format yet, try to autodetect it now. */
230 if (format == eslSQFILE_UNKNOWN)
231 {
232 status = sqascii_GuessFileFormat(sqfp, &format);
233 if (status == eslOK) sqfp->format = format;
234 else if (status != eslEFORMAT) goto ERROR; /* <format> might still be eslSQFILE_UNKNOWN, for MSA files */
235 }
236
237 /* If the format is still unknown, it may be an MSA file. The
238 * msafile module is capable of autodetecting format even in a .gz
239 * or stdin pipe, but the stuff above has already read from these
240 * nonrewindable sources, trying to guess an unaligned format. We
241 * could open a second .gz pipe, but that's ugly; and in any case,
242 * we can't rewind stdin. Eventually, this will get resolved, by
243 * having sqio open an ESL_BUFFER, then doing an
244 * esl_msafile_OpenBuffer() if we need to hand control to the
245 * msafile module. For now, sqio is already documented to be
246 * unable to autodetect MSA file formats in stdin or .gz pipes,
247 * so leave it that way.
248 */
249 if (format == eslSQFILE_UNKNOWN && (ascii->do_gzip || ascii->do_stdin))
250 { status = eslEFORMAT; goto ERROR; }
251 }
252
253 /* If format is definitely an MSA, open it through the msafile interface.
254 * Or, if format is still unknown, try to open the file as an MSA file,
255 * using msafile autodetection.
256 */
257 if (format == eslSQFILE_UNKNOWN || esl_sqio_IsAlignment(format))
258 {
259 status = esl_msafile_Open(NULL, filename, NULL, format, NULL, &(ascii->afp));
260 if (status != eslOK) { status = eslEFORMAT; goto ERROR; } /* This was our last attempt. Failure to open == failure to detect format */
261 sqfp->format = format = ascii->afp->format;
262 }
263 if (format == eslSQFILE_UNKNOWN) { status = eslEFORMAT; goto ERROR; }
264
265
266 /* Configure the <sqfp>'s parser and inmaps for this format. */
267 if (!esl_sqio_IsAlignment(format))
268 {
269 switch (format) {
270 case eslSQFILE_EMBL: config_embl(sqfp); inmap_embl(sqfp, NULL); break;
271 case eslSQFILE_UNIPROT: config_embl(sqfp); inmap_embl(sqfp, NULL); break;
272 case eslSQFILE_GENBANK: config_genbank(sqfp); inmap_genbank(sqfp, NULL); break;
273 case eslSQFILE_DDBJ: config_genbank(sqfp); inmap_genbank(sqfp, NULL); break;
274 case eslSQFILE_FASTA: config_fasta(sqfp); inmap_fasta(sqfp, NULL); break;
275 case eslSQFILE_DAEMON: config_daemon(sqfp); inmap_daemon(sqfp, NULL); break;
276 case eslSQFILE_HMMPGMD: config_fasta(sqfp); inmap_fasta(sqfp, NULL); break;
277 default:status = eslEFORMAT; goto ERROR;
278 }
279
280 /* Preload the first line or chunk of file. */
281 status = loadbuf(sqfp);
282 if (status == eslEOF) { status = eslEFORMAT; goto ERROR; }
283 else if (status != eslOK) { goto ERROR; }
284
285 /* hmmpgmd is a special case: we need to skip first line before parsing it.
286 * generalize that a little: this could be a section for parsing a file header,
287 * and leaving the buf positioned at the first char of the first record
288 * (just as expected if there's no file header)
289 */
290 switch (format) {
291 case eslSQFILE_HMMPGMD: status = fileheader_hmmpgmd(sqfp); break;
292 default: status = eslOK; break;
293 }
294 if (status != eslOK) goto ERROR;
295 }
296 else
297 {
298 ascii->is_linebased = TRUE;
299 ascii->eof_is_ok = FALSE; /* no-op for msa's */
300 ascii->parse_header = NULL; /* no-op for msa's */
301 ascii->skip_header = NULL; /* no-op for msa's */
302 ascii->parse_end = NULL; /* no-op for msa's */
303 }
304
305 /* initialize the function pointers for the ascii routines */
306 sqfp->position = &sqascii_Position;
307 sqfp->close = &sqascii_Close;
308
309 sqfp->set_digital = &sqascii_SetDigital;
310 sqfp->guess_alphabet = &sqascii_GuessAlphabet;
311
312 sqfp->is_rewindable = &sqascii_IsRewindable;
313
314 sqfp->read = &sqascii_Read;
315 sqfp->read_info = &sqascii_ReadInfo;
316 sqfp->read_seq = &sqascii_ReadSequence;
317 sqfp->read_window = &sqascii_ReadWindow;
318 sqfp->echo = &sqascii_Echo;
319
320 sqfp->read_block = &sqascii_ReadBlock;
321
322 sqfp->open_ssi = &sqascii_OpenSSI;
323 sqfp->pos_by_key = &sqascii_PositionByKey;
324 sqfp->pos_by_number = &sqascii_PositionByNumber;
325
326 sqfp->fetch = &sqascii_Fetch;
327 sqfp->fetch_info = &sqascii_FetchInfo;
328 sqfp->fetch_subseq = &sqascii_FetchSubseq;
329 sqfp->get_error = &sqascii_GetError;
330
331 return eslOK;
332
333 ERROR:
334 sqascii_Close(sqfp);
335 return status;
336 }
337
338
339 /* Function: sqascii_GuessFileFormat()
340 * Synopsis: Guess the format of an open <ESL_SQFILE>.
341 *
342 * Purpose: Try to guess the sequence file format of <sqfp>, and
343 * return the format code in <*ret_fmt>.
344 *
345 * First we attempt to guess based on the <filename>'s
346 * suffix. <*.fa> is assumed to be in FASTA format; <*.gb>
347 * is assumed to be in GenBank format.
348 *
349 * If that fails, we attempt to guess based on peeking at
350 * the first nonblank line of <filename>. If the line
351 * starts with $>$, we assume FASTA format; if the line
352 * starts with <ID>, we assume EMBL format; if the line
353 * starts with <LOCUS> or it contains the string <Genetic
354 * Sequence Data Bank> we assume GenBank format.
355 *
356 * If that fails too, return an <eslEFORMAT> error, and
357 * <*ret_fmt> is set to <eslSQFILE_UNKNOWN>.
358 *
359 * Returns: <eslOK> on success, and <*ret_fmt> contains
360 * a valid sequence file format code, such as
361 * <eslSQFILE_FASTA>.
362 *
363 * Returns <eslEFORMAT> if we opened <filename> but it
364 * contains no nonblank lines, or if we peeked at the first
365 * nonblank line and still couldn't guess the format;
366 * <*ret_fmt> is then <eslSQFILE_UNKNOWN>.
367 *
368 * Throws: <eslEMEM> on allocation failure.
369 */
370 static int
sqascii_GuessFileFormat(ESL_SQFILE * sqfp,int * ret_fmt)371 sqascii_GuessFileFormat(ESL_SQFILE *sqfp, int *ret_fmt)
372 {
373 int n = strlen(sqfp->filename);
374 const char *sfx = NULL;
375 int is_gzip = FALSE;
376 int nsfx;
377 int status;
378
379 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
380
381 /* On any premature exit, *ret_fmt is eslSQFILE_UNKNOWN */
382 *ret_fmt = eslSQFILE_UNKNOWN;
383
384 /* Is <filename> gzip'ed? Look at suffix. */
385 if (n > 3 && strcmp(sqfp->filename+n-3, ".gz") == 0) is_gzip = TRUE;
386
387 /* Locate the suffix that might indicate format (ignore .gz) */
388 for (nsfx = 1, sfx = sqfp->filename + n - 1 - (is_gzip ? 3 : 0);
389 sfx != sqfp->filename && *sfx != '.';
390 sfx--)
391 nsfx++;
392
393 /* now sfx points either to filename (we didn't find a suffix) or to the . of the suffix,
394 * and nsfx is the suffix length inclusive of the .
395 */
396
397 /* Attempt to guess file format based on file name suffix. */
398 if (nsfx && strcmp(sfx, ".fa") == 0) { *ret_fmt = eslSQFILE_FASTA; return eslOK; }
399 else if (nsfx && strcmp(sfx, ".gb") == 0) { *ret_fmt = eslSQFILE_GENBANK; return eslOK; }
400
401 /* If that didn't work, we'll have a peek at the stream;
402 * turn recording on, and set for line based input.
403 */
404 if (ascii->is_recording == -1) ESL_EXCEPTION(eslEINVAL, "sq file already too advanced");
405 ascii->is_recording = TRUE;
406 ascii->is_linebased = TRUE;
407 loadbuf(sqfp);/* now ascii->buf is a line of the file */
408
409 /* get first nonblank line */
410 while (esl_str_IsBlank(ascii->buf)) {
411 status = loadbuf(sqfp);
412 if (status == eslEOF) ESL_XFAIL(eslEFORMAT, ascii->errbuf, "No data found in file");
413 else if (status != eslOK) goto ERROR;
414 }
415
416 /* formats that can be determined from the first line: */
417 if (*(ascii->buf) == '>') *ret_fmt = eslSQFILE_FASTA;
418 else if (strncmp(ascii->buf, "ID ", 5) == 0) *ret_fmt = eslSQFILE_EMBL;
419 else if (strncmp(ascii->buf, "LOCUS ", 8) == 0) *ret_fmt = eslSQFILE_GENBANK;
420 else if (strstr(ascii->buf, "Genetic Sequence Data Bank") != NULL) *ret_fmt = eslSQFILE_GENBANK;
421
422 /* reset the sqfp */
423 ascii->mpos = 0;
424 ascii->is_recording = FALSE;
425 ascii->is_linebased = FALSE;
426 free(ascii->buf);
427 ascii->buf = NULL;
428 ascii->balloc = 0;
429 return (*ret_fmt == eslSQFILE_UNKNOWN) ? eslEFORMAT : eslOK;
430
431 ERROR:
432 ascii->mpos = 0;
433 ascii->is_recording = FALSE;
434 ascii->is_linebased = FALSE;
435 if (ascii->buf != NULL) { free(ascii->buf); ascii->balloc = 0; }
436 return status;
437 }
438
439 /* Function: sqascii_Position()
440 * Synopsis: Reposition an open sequence file to an offset.
441 *
442 * Purpose: Reposition an open <sqfp> to offset <offset>.
443 * <offset> would usually be the first byte of a
444 * desired sequence record.
445 *
446 * Only normal sequence files can be positioned to a
447 * nonzero offset. If <sqfp> corresponds to a standard
448 * input stream or gzip -dc stream, it may not be
449 * repositioned. If <sqfp> corresponds to a multiple
450 * sequence alignment file, the only legal <offset>
451 * is 0, to rewind the file to the beginning and
452 * be able to read the entire thing again.
453 *
454 * After <esl_sqfile_Position()> is called on a nonzero
455 * <offset>, <sqfp->linenumber> and other bookkeeping
456 * information is unknown. If caller knows it, it should
457 * set it explicitly.
458 *
459 * See the SSI module for manipulating offsets and indices.
460 *
461 * Returns: <eslOK> on success;
462 * <eslEOF> if no data can be read from this position.
463 *
464 * Throws: <eslESYS> if the fseeko() or fread() call fails.
465 * <eslEMEM> on (re-)allocation failure.
466 * <eslEINVAL> if the <sqfp> is not positionable.
467 * <eslENOTFOUND> if in trying to rewind an alignment file
468 * by closing and reopening it, the open fails.
469 * On errors, the state of <sqfp> is indeterminate, and
470 * it should not be used again.
471 */
472 static int
sqascii_Position(ESL_SQFILE * sqfp,off_t offset)473 sqascii_Position(ESL_SQFILE *sqfp, off_t offset)
474 {
475 int status;
476
477 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
478
479 if (ascii->do_stdin) ESL_EXCEPTION(eslEINVAL, "can't Position() in standard input");
480 if (ascii->do_gzip) ESL_EXCEPTION(eslEINVAL, "can't Position() in a gzipped file");
481 if (offset < 0) ESL_EXCEPTION(eslEINVAL, "bad offset");
482 if (offset > 0 && ascii->afp != NULL) ESL_EXCEPTION(eslEINVAL, "can't use esl_sqfile_Position() w/ nonzero offset on MSA file");
483
484 if (esl_sqio_IsAlignment(sqfp->format))
485 {/* msa file: close and reopen. maybe sometime we'll have esl_msafile_Rewind() */
486 /* we have already verified that offset==0 for MSA file */
487 esl_msafile_Close(ascii->afp);
488 if (ascii->msa != NULL) esl_msa_Destroy(ascii->msa);
489 ascii->afp = NULL;
490 ascii->msa = NULL;
491 ascii->idx = 0;
492
493 /* we know we successfully opened it the first time, so a
494 failure to reopen is an exception, not a user-reportable
495 normal error. ENOTFOUND is the only normal error;
496 EFORMAT error can't occur because we know the format and
497 don't use autodetection.
498 */
499 status = esl_msafile_Open(NULL, sqfp->filename, NULL, sqfp->format, NULL, &(ascii->afp));
500 if (status == eslENOTFOUND) ESL_EXCEPTION(eslENOTFOUND, "failed to reopen alignment file");
501 else if (status != eslOK) return status;
502 }
503 else/* normal case: unaligned sequence file */
504 {
505 if (fseeko(ascii->fp, offset, SEEK_SET) != 0) ESL_EXCEPTION(eslESYS, "fseeko() failed");
506
507 ascii->currpl = -1;
508 ascii->curbpl = -1;
509 ascii->prvrpl = -1;
510 ascii->prvbpl = -1;
511 ascii->linenumber = (offset == 0) ? 1 : -1; /* -1 is "unknown" */
512 ascii->L = -1;
513 ascii->mpos = ascii->mn;/* this forces loadbuf to load new data */
514 if ((status = loadbuf(sqfp)) != eslOK) return status;
515 }
516 return eslOK;
517 }
518
519
520
521 /* Function: sqascii_Close()
522 * Synopsis: Close a sequence file.
523 *
524 * Purpose: Closes an open <sqfp>.
525 *
526 * Returns: (void).
527 */
528 static void
sqascii_Close(ESL_SQFILE * sqfp)529 sqascii_Close(ESL_SQFILE *sqfp)
530 {
531 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
532
533 #ifdef HAVE_POPEN
534 if (ascii->do_gzip) pclose(ascii->fp);
535 else
536 #endif
537 if (! ascii->do_stdin && ascii->fp != NULL) fclose(ascii->fp);
538
539 if (ascii->ssifile != NULL) free(ascii->ssifile);
540 if (ascii->mem != NULL) free(ascii->mem);
541 if (ascii->balloc > 0) free(ascii->buf);
542 if (ascii->ssi != NULL) esl_ssi_Close(ascii->ssi);
543
544 if (ascii->afp != NULL) esl_msafile_Close(ascii->afp);
545 if (ascii->msa != NULL) esl_msa_Destroy(ascii->msa);
546
547 ascii->do_gzip = FALSE;
548 ascii->do_stdin = FALSE;
549
550 ascii->fp = NULL;
551
552 ascii->ssifile = NULL;
553 ascii->mem = NULL;
554
555 ascii->balloc = 0;
556 ascii->buf = NULL;
557
558 ascii->ssi = NULL;
559
560 ascii->afp = NULL;
561 ascii->msa = NULL;
562
563 return;
564 }
565 /*------------------- ESL_SQFILE open/close -----------------------*/
566
567
568 /*****************************************************************
569 *# 2. An <ESL_SQFILE> object, in digital mode [with <alphabet>]
570 *****************************************************************/
571
572 /* Function: sqascii_SetDigital()
573 * Synopsis: Set an open <ESL_SQFILE> to read in digital mode.
574 *
575 * Purpose: Given an <ESL_SQFILE> that's already been opened,
576 * configure it to expect subsequent input to conform
577 * to the digital alphabet <abc>.
578 *
579 * Calling <esl_sqfile_Open(); esl_sqfile_SetDigital()> is
580 * equivalent to <esl_sqfile_OpenDigital()>. The two-step
581 * version is useful when you need a
582 * <esl_sqfile_GuessAlphabet()> call in between, guessing
583 * the file's alphabet in text mode before you set it to
584 * digital mode.
585 *
586 * Returns: <eslOK> on success.
587 */
588 static int
sqascii_SetDigital(ESL_SQFILE * sqfp,const ESL_ALPHABET * abc)589 sqascii_SetDigital(ESL_SQFILE *sqfp, const ESL_ALPHABET *abc)
590 {
591 int status = eslOK;
592
593 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
594
595 if (!esl_sqio_IsAlignment(sqfp->format))
596 {
597 switch (sqfp->format) {
598 case eslSQFILE_EMBL: inmap_embl(sqfp, abc->inmap); break;
599 case eslSQFILE_UNIPROT: inmap_embl(sqfp, abc->inmap); break;
600 case eslSQFILE_GENBANK: inmap_genbank(sqfp, abc->inmap); break;
601 case eslSQFILE_DDBJ: inmap_genbank(sqfp, abc->inmap); break;
602 case eslSQFILE_FASTA: inmap_fasta(sqfp, abc->inmap); break;
603 case eslSQFILE_DAEMON: inmap_daemon(sqfp, abc->inmap); break;
604
605 default: status = eslEFORMAT; break;
606 }
607 }
608 else
609 esl_msafile_SetDigital(ascii->afp, abc);
610
611 return status;
612 }
613
614 /* Function: sqascii_GuessAlphabet()
615 * Synopsis: Guess the alphabet of an open <ESL_SQFILE>.
616 *
617 * Purpose: After opening <sqfp>, attempt to guess what alphabet
618 * its sequences are in, by inspecting the first sequence
619 * in the file, and return this alphabet type in <*ret_type>.
620 *
621 * Returns: <eslOK> on success, and <*ret_type> is set to <eslDNA>,
622 * <eslRNA>, or <eslAMINO>.
623 *
624 * Returns <eslENOALPHABET> and sets <*ret_type> to
625 * <eslUNKNOWN> if the first sequence (or alignment)
626 * in the file contains no more than ten residues total,
627 * or if its alphabet cannot be guessed (i.e. it contains
628 * IUPAC degeneracy codes, but no amino acid specific
629 * residues).
630 *
631 * Returns <eslEFORMAT> if a parse error is encountered in
632 * trying to read the sequence file. <ascii->errbuf> is set
633 * to a useful error message if this occurs,
634 * <sqfp->linenumber> is the line on which the error
635 * occurred, and <*ret_type> is set to <eslUNKNOWN>.
636 *
637 * Returns <eslENODATA> and sets <*ret_type> to <eslUNKNOWN>
638 * if the file appears to be empty.
639 *
640 * Throws: <eslEMEM> on allocation error;
641 * <eslEINCONCEIVABLE> on unimaginable internal errors.
642 */
643 static int
sqascii_GuessAlphabet(ESL_SQFILE * sqfp,int * ret_type)644 sqascii_GuessAlphabet(ESL_SQFILE *sqfp, int *ret_type)
645 {
646 ESL_SQ *sq = NULL;
647 int status;
648
649 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
650
651 /* Special case: for MSA files, hand this off to msafile_GuessAlphabet. */
652 if (esl_sqio_IsAlignment(sqfp->format)) return esl_msafile_GuessAlphabet(ascii->afp, ret_type);
653
654 /* set the sqfp to record; we'll rewind afterwards and use the recording */
655 ascii->is_recording = TRUE;
656
657 if ((sq = esl_sq_Create()) == NULL) { status = eslEMEM; goto ERROR; }
658
659 status = sqascii_ReadWindow(sqfp, 0, 4000, sq);
660 if (status == eslEOF) { status = eslENODATA; goto ERROR; }
661 else if (status != eslOK) goto ERROR;
662
663 if ((status = esl_sq_GuessAlphabet(sq, ret_type)) != eslOK) goto ERROR;
664
665 /* reset the sqfp, so it uses the recording next */
666 ascii->mpos = 0;
667 ascii->linenumber = 1;
668 ascii->is_recording = FALSE;
669 if ((status = loadbuf(sqfp)) != eslOK) ESL_EXCEPTION(status, "buffer load failed, but shouldn't have");
670 esl_sq_Destroy(sq);
671 return eslOK;
672
673 ERROR:
674 esl_sq_Destroy(sq);
675 *ret_type = eslUNKNOWN;
676 return status;
677 }
678 /*-------------- end, digital mode ESL_SQFILE -------------------*/
679
680
681
682
683 /*****************************************************************
684 *# 3. Miscellaneous routines
685 *****************************************************************/
686
687 /* Function: sqascii_IsRewindable()
688 * Synopsis: Return <TRUE> if <sqfp> can be rewound.
689 *
690 * Purpose: Returns <TRUE> if <sqfp> can be rewound (positioned
691 * to an offset of zero), in order to read it a second
692 * time.
693 */
694 static int
sqascii_IsRewindable(const ESL_SQFILE * sqfp)695 sqascii_IsRewindable(const ESL_SQFILE *sqfp)
696 {
697 if (sqfp->data.ascii.do_gzip == TRUE) return FALSE;
698 if (sqfp->data.ascii.do_stdin == TRUE) return FALSE;
699 return TRUE;
700 }
701
702 /* Function: sqascii_GetError()
703 * Synopsis: Return <TRUE> if <sqfp> can be rewound.
704 *
705 * Purpose: Returns <TRUE> if <sqfp> can be rewound (positioned
706 * to an offset of zero), in order to read it a second
707 * time.
708 */
709 static const char *
sqascii_GetError(const ESL_SQFILE * sqfp)710 sqascii_GetError(const ESL_SQFILE *sqfp)
711 {
712 return sqfp->data.ascii.errbuf;
713 }
714
715
716 /*****************************************************************
717 *# 4. Sequence reading (sequential)
718 *****************************************************************/
719
720 /* Function: sqascii_Read()
721 * Synopsis: Read the next sequence from a file.
722 *
723 * Purpose: Reads the next sequence from open sequence file <sqfp> into
724 * <sq>. Caller provides an allocated and initialized <s>, which
725 * will be internally reallocated if its space is insufficient.
726 *
727 * Returns: <eslOK> on success; the new sequence is stored in <s>.
728 *
729 * Returns <eslEOF> when there is no sequence left in the
730 * file (including first attempt to read an empty file).
731 *
732 * Returns <eslEFORMAT> if there's a problem with the format,
733 * such as an illegal character; the line number that the parse
734 * error occurs on is in <sqfp->linenumber>, and an informative
735 * error message is placed in <ascii->errbuf>.
736 *
737 * Throws: <eslEMEM> on allocation failure;
738 * <eslEINCONCEIVABLE> on internal error.
739 */
740 static int
sqascii_Read(ESL_SQFILE * sqfp,ESL_SQ * sq)741 sqascii_Read(ESL_SQFILE *sqfp, ESL_SQ *sq)
742 {
743 int status;
744 int64_t epos;
745 int64_t n;
746
747 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
748
749 if (esl_sqio_IsAlignment(sqfp->format))
750 {
751 ESL_SQ *tmpsq = NULL;
752 if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
753 { /* we need to load a new alignment? */
754 esl_msa_Destroy(ascii->msa);
755 status = esl_msafile_Read(ascii->afp, &(ascii->msa));
756 if (status == eslEFORMAT)
757 { /* oops, a parse error; upload the error info from afp to sqfp */
758 ascii->linenumber = ascii->afp->linenumber;
759 strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
760 return eslEFORMAT;
761 }
762 if (status != eslOK) return status;
763 ascii->idx = 0;
764 }
765
766 /* grab next seq from alignment */
767 /* this is inefficient; it goes via a temporarily allocated copy of the sequence */
768 if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) return status;
769 esl_sq_GrowTo(sq, tmpsq->n);
770 esl_sq_Copy(tmpsq, sq);
771 esl_sq_Destroy(tmpsq);
772 ascii->idx++;
773
774 sq->start = 1;
775 sq->end = sq->n;
776 sq->C = 0;
777 sq->W = sq->n;
778 sq->L = sq->n;
779 return eslOK;
780 }
781
782 /* Main case: read next seq from sqfp's stream */
783 if (ascii->nc == 0) return eslEOF;
784 if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EMEM, EOF, EFORMAT */
785
786 do {
787 if ((status = seebuf(sqfp, -1, &n, &epos)) == eslEFORMAT) return status;
788 if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM;
789 addbuf(sqfp, sq, n);
790 ascii->L += n;
791 sq->eoff = ascii->boff + epos - 1;
792 if (status == eslEOD) break;
793 } while ((status = loadbuf(sqfp)) == eslOK);
794
795 if (status == eslEOF)
796 {
797 if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
798 if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
799 }
800 else if (status == eslEOD)
801 {
802 ascii->bpos = epos;
803 if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
804 }
805 else if (status != eslOK) return status;
806
807 if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
808 else sq->seq[sq->n] = '\0';
809 sq->start = 1;
810 sq->end = sq->n;
811 sq->C = 0;
812 sq->W = sq->n;
813 sq->L = sq->n;
814 return eslOK;
815 }
816
817
818 /* Function: sqascii_ReadInfo()
819 * Synopsis: Read sequence info, but not the sequence itself.
820 *
821 * Purpose: Read the next sequence from open sequence file <sqfp>,
822 * but don't store the sequence (or secondary structure).
823 * Upon successful return, <s> holds all the available
824 * information about the sequence -- its name, accession,
825 * description, and overall length <sq->L>.
826 *
827 * This is useful for indexing sequence files, where
828 * individual sequences might be ginormous, and we'd rather
829 * avoid reading complete seqs into memory.
830 *
831 * Returns: <eslOK> on success.
832 *
833 * Throws: <eslEMEM> on allocation error.
834 */
835 static int
sqascii_ReadInfo(ESL_SQFILE * sqfp,ESL_SQ * sq)836 sqascii_ReadInfo(ESL_SQFILE *sqfp, ESL_SQ *sq)
837 {
838 int status;
839 int64_t epos;
840 int64_t n;
841
842 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
843
844 if (esl_sqio_IsAlignment(sqfp->format))
845 {
846 ESL_SQ *tmpsq = NULL;
847 if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
848 { /* we need to load a new alignment? */
849 esl_msa_Destroy(ascii->msa);
850 status = esl_msafile_Read(ascii->afp, &(ascii->msa));
851 if (status == eslEFORMAT)
852 { /* oops, a parse error; upload the error info from afp to sqfp */
853 ascii->linenumber = ascii->afp->linenumber;
854 strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
855 return eslEFORMAT;
856 }
857 if (status != eslOK) return status;
858 ascii->idx = 0;
859 }
860
861 /* grab next seq from alignment */
862 /* this is inefficient; it goes via a temporarily allocated copy of the sequence */
863 if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) return status;
864 if (tmpsq->dsq != NULL) tmpsq->dsq[1] = eslDSQ_SENTINEL;
865 else tmpsq->seq[0] = '\0';
866 esl_sq_Copy(tmpsq, sq);
867 esl_sq_Destroy(tmpsq);
868 ascii->idx++;
869
870 if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
871 else sq->seq[0] = '\0';
872 if (sq->ss != NULL) { free(sq->ss); sq->ss = NULL; }
873
874 sq->n = 0;
875 sq->start = 0;
876 sq->end = 0;
877 sq->C = 0;
878 sq->W = 0;
879 return eslOK;
880 }
881
882 if (ascii->nc == 0) return eslEOF;
883 if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
884
885 ascii->L = 0;
886 do {
887 status = seebuf(sqfp, -1, &n, &epos);
888 ascii->L += n;
889 sq->eoff = ascii->boff + epos - 1;
890 if (status == eslEFORMAT) return status;
891 if (status == eslEOD) break;
892 } while ((status = loadbuf(sqfp)) == eslOK);
893
894 if (status == eslEOF)
895 {
896 if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
897 }
898 else if (status == eslEOD)
899 {
900 ascii->bpos = epos;
901 if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
902 }
903 else if (status != eslOK) return status;
904 sq->L = ascii->L;
905
906 /* Set coord system for an info-only ESL_SQ */
907 if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
908 else sq->seq[0] = '\0';
909 if (sq->ss != NULL) { free(sq->ss); sq->ss = NULL; }
910 sq->n = 0;
911 sq->start = 0;
912 sq->end = 0;
913 sq->C = 0;
914 sq->W = 0;
915 return eslOK;
916 }
917
918
919 /* Function: sqascii_ReadSequence()
920 * Synopsis: Read the next sequence from a file.
921 *
922 * Purpose: Reads the next sequence from open sequence file <sqfp> into
923 * <sq>. Caller provides an allocated and initialized <s>, which
924 * will be internally reallocated if its space is insufficient.
925 *
926 * Returns: <eslOK> on success; the new sequence is stored in <s>.
927 *
928 * Returns <eslEOF> when there is no sequence left in the
929 * file (including first attempt to read an empty file).
930 *
931 * Returns <eslEFORMAT> if there's a problem with the format,
932 * such as an illegal character; the line number that the parse
933 * error occurs on is in <sqfp->linenumber>, and an informative
934 * error message is placed in <ascii->errbuf>.
935 *
936 * Throws: <eslEMEM> on allocation failure;
937 * <eslEINCONCEIVABLE> on internal error.
938 */
939 static int
sqascii_ReadSequence(ESL_SQFILE * sqfp,ESL_SQ * sq)940 sqascii_ReadSequence(ESL_SQFILE *sqfp, ESL_SQ *sq)
941 {
942 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
943 int64_t epos;
944 int64_t n;
945 int status;
946
947 if (esl_sqio_IsAlignment(sqfp->format))
948 {
949 ESL_SQ *tmpsq = NULL;
950 if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
951 { /* we need to load a new alignment? */
952 esl_msa_Destroy(ascii->msa);
953 status = esl_msafile_Read(ascii->afp, &(ascii->msa));
954 if (status == eslEFORMAT)
955 { /* oops, a parse error; upload the error info from afp to sqfp */
956 ascii->linenumber = ascii->afp->linenumber;
957 strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
958 return eslEFORMAT;
959 }
960 if (status != eslOK) return status;
961 ascii->idx = 0;
962 }
963
964 /* grab next seq from alignment */
965 /* this is inefficient; it goes via a temporarily allocated copy of the sequence */
966 status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq); // eslEMEM | eslEOD
967 if (status != eslOK) return status;
968
969 esl_sq_GrowTo(sq, tmpsq->n);
970 esl_sq_Copy(tmpsq, sq);
971 esl_sq_Destroy(tmpsq);
972 ascii->idx++;
973
974 sq->start = 1;
975 sq->end = sq->n;
976 sq->C = 0;
977 sq->W = sq->n;
978 sq->L = sq->n;
979 return eslOK;
980 }
981
982 /* Main case: read next seq from sqfp's stream */
983 if (ascii->nc == 0) return eslEOF;
984 if ((status = ascii->skip_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
985
986 do {
987 if ((status = seebuf(sqfp, -1, &n, &epos)) == eslEFORMAT) return status;
988 if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM;
989 addbuf(sqfp, sq, n);
990 ascii->L += n;
991 sq->eoff = ascii->boff + epos - 1;
992 if (status == eslEOD) break;
993 } while ((status = loadbuf(sqfp)) == eslOK);
994
995 if (status == eslEOF)
996 {
997 if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
998 if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
999 }
1000 else if (status == eslEOD)
1001 {
1002 ascii->bpos = epos;
1003 if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
1004 }
1005 else if (status != eslOK) return status;
1006
1007 if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
1008 else sq->seq[sq->n] = '\0';
1009 sq->start = 1;
1010 sq->end = sq->n;
1011 sq->C = 0;
1012 sq->W = sq->n;
1013 sq->L = sq->n;
1014 return eslOK;
1015 }
1016
1017
1018 /* Function: sqascii_ReadWindow()
1019 * Synopsis: Read next window of sequence.
1020 *
1021 * Purpose: Read a next window of <W> residues from open file <sqfp>,
1022 * keeping <C> residues from the previous window as
1023 * context, and keeping previous annotation in the <sq>
1024 * as before.
1025 *
1026 * If this is the first window of a new sequence record,
1027 * <C> is ignored (there's no previous context yet), and
1028 * the annotation fields of the <sq> (name, accession, and
1029 * description) are initialized by reading the sequence
1030 * record's header. This is the only time the annotation
1031 * fields are initialized.
1032 *
1033 * On return, <sq->dsq[]> contains the window and its
1034 * context; residues <1..sq->C> are the previous context,
1035 * and residues <sq->C+1..sq->n> are the new window. The
1036 * start and end coordinates of the whole <dsq[1..n]>
1037 * (including context) in the original source sequence are
1038 * <sq->start..sq->end>. (Or, for text mode sequences,
1039 * <sq->seq[0..sq->C-1,sq->C..sq->n-1]>, while <start> and
1040 * <end> coords are still <1..L>.)
1041 *
1042 * When a sequence record is completed and no more data
1043 * remain, <eslEOD> is returned, with an ``info'' <sq>
1044 * structure (containing the annotation and the total
1045 * sequence length <L>, but no sequence). (The total
1046 * sequence length <L> is unknown in <sq> until this
1047 * <eslEOD> return.)
1048 *
1049 * The caller may then do one of two things before calling
1050 * <esl_sq_ReadWindow()> again; it can reset the sequence
1051 * with <esl_sq_Reuse()> to continue reading the next
1052 * sequence in the file, or it can set a negative <W> as a
1053 * signal to read windows from the reverse complement
1054 * (Crick) strand. Reverse complement reading only works
1055 * for nucleic acid sequence.
1056 *
1057 * If you read the reverse complement strand, you must read
1058 * the whole thing, calling <esl_sqio_ReadWindow()> with
1059 * negative <W> windows until <eslEOD> is returned again
1060 * with an empty (info-only) <sq> structure. When that
1061 * <EOD> is reached, the <sqfp> is repositioned at the
1062 * start of the next sequence record; the caller should now
1063 * <Reuse()> the <sq>, and the next <esl_sqio_ReadWindow()>
1064 * call must have a positive <W>, corresponding to starting
1065 * to read the Watson strand of the next sequence.
1066 *
1067 * Note that the <ReadWindow()> interface is designed for
1068 * an idiom of sequential reading of complete sequences in
1069 * overlapping windows, possibly on both strands; if you
1070 * want more freedom to move around in the sequence
1071 * grabbing windows in another order, you can use the
1072 * <FetchSubseq()> interface.
1073 *
1074 * Reading the reverse complement strand requires file
1075 * repositioning, so it will not work on non-repositionable
1076 * streams like gzipped files or a stdin pipe. Moreover,
1077 * for reverse complement input to be efficient, the
1078 * sequence file should have consistent line lengths,
1079 * suitable for SSI's fast subsequence indexing.
1080 *
1081 * Returns: <eslOK> on success; <sq> now contains next window of
1082 * sequence, with at least 1 new residue. The number
1083 * of new residues is <sq->W>; <sq->C> residues are
1084 * saved from the previous window. Caller may now
1085 * process residues <sq->dsq[sq->C+1]..sq->dsq[sq->n]>.
1086 *
1087 * <eslEOD> if no new residues were read for this sequence
1088 * and strand, and <sq> now contains an empty info-only
1089 * structure (annotation and <L> are valid). Before calling
1090 * <esl_sqio_ReadWindow()> again, caller will either want
1091 * to make <W> negative (to start reading the Crick strand
1092 * of the current sequence), or it will want to reset the
1093 * <sq> (with <esl_sq_Reuse()>) to go on the next sequence.
1094 *
1095 * <eslEOF> if we've already returned <eslEOD> before to
1096 * signal the end of the previous seq record, and moreover,
1097 * there's no more sequence records in the file.
1098 *
1099 * <eslEINVAL> if an invalid residue is found in the
1100 * sequence, or if you attempt to take the reverse
1101 * complement of a sequence that can't be reverse
1102 * complemented.
1103 *
1104 * Throws: <eslESYNTAX> if you try to read a reverse window before
1105 * you've read forward strand.
1106 *
1107 * <eslECORRUPT> if something goes awry internally in the
1108 * coordinate system.
1109 *
1110 * <eslEMEM> on allocation error.
1111 */
1112 static int
sqascii_ReadWindow(ESL_SQFILE * sqfp,int C,int W,ESL_SQ * sq)1113 sqascii_ReadWindow(ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq)
1114 {
1115 int actual_start;
1116 int64_t nres;
1117 int64_t line;
1118 off_t offset;
1119 int status;
1120 ESL_SQ *tmpsq = NULL;
1121
1122 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1123
1124 if (esl_sqio_IsAlignment(sqfp->format))
1125 {
1126 /* special: if we're initializing a revcomp window read, back ascii->idx up one */
1127 if (W < 0 && sq->start == 0) ascii->idx--;
1128
1129 if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
1130 { /* need new alignment? */
1131 esl_msa_Destroy(ascii->msa);
1132 status = esl_msafile_Read(ascii->afp, &(ascii->msa));
1133 if (status == eslEFORMAT)
1134 { /* oops, a parse error; upload the error info from afp to sqfp */
1135 ascii->linenumber = ascii->afp->linenumber;
1136 strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
1137 return eslEFORMAT;
1138 }
1139 else if (status != eslOK) goto ERROR;
1140 ascii->idx = 0;
1141 }
1142
1143 /* grab appropriate seq from alignment into tmpsq */
1144 if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) goto ERROR;
1145
1146 /*by default, tmpsq is an ascii sequence, convert it to digital if that's what sq is*/
1147 if (sq->seq == NULL &&
1148 (( status = esl_sq_Digitize(sq->abc, tmpsq)) != eslOK))
1149 goto ERROR;
1150
1151 /* Figure out tmpsq coords we'll put in sq */
1152 if (W > 0)
1153 {/* forward strand */
1154 sq->C = ESL_MIN(sq->n, C);
1155 sq->start = sq->end - sq->C + 1;
1156 sq->end = ESL_MIN(tmpsq->L, sq->end + W);
1157 sq->n = sq->end - sq->start + 1;
1158 sq->W = sq->n - sq->C;
1159 }
1160 else
1161 {/* reverse strand */
1162 if (sq->L == -1) ESL_XEXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand");
1163
1164 sq->C = ESL_MIN(sq->n, sq->end + C - 1);
1165 sq->end = (sq->start == 0 ? sq->L : sq->end + sq->C - 1);
1166 sq->start = ESL_MAX(1, sq->end + W - sq->C - 1);
1167 sq->n = sq->end - sq->start + 1;
1168 sq->W = sq->n - sq->C;
1169 }
1170
1171 if (sq->W == 0)/* no new sequence? that's the EOD case */
1172 {
1173 sq->start = 0;
1174 sq->end = 0;
1175 sq->C = 0;
1176 sq->W = 0;
1177 sq->n = 0;
1178 sq->L = tmpsq->L;
1179 if (sq->dsq) sq->dsq[1] = eslDSQ_SENTINEL;
1180 else if (sq->seq) sq->seq[0] = '\0';
1181
1182 ascii->idx++;
1183 esl_sq_Destroy(tmpsq);
1184 return eslEOD;
1185 }
1186
1187 /* Copy the sequence frag. */
1188 if (tmpsq->ss != NULL && sq->ss == NULL) ESL_ALLOC(sq->ss, sizeof(char) * (sq->salloc)); /* this *must* be for salloc */
1189 esl_sq_GrowTo(sq, sq->n);
1190 if (tmpsq->seq != NULL)
1191 {/* text mode */
1192 memcpy(sq->seq, tmpsq->seq + sq->start - 1, sizeof(char) * sq->n);
1193 sq->seq[sq->n] = '\0';
1194 if (tmpsq->ss != NULL) {
1195 memcpy(sq->ss, tmpsq->ss + sq->start - 1, sizeof(char) * sq->n);
1196 sq->ss[sq->n] = '\0';
1197 }
1198 }
1199 else
1200 {
1201 memcpy(sq->dsq + 1, tmpsq->dsq + sq->start, sizeof(ESL_DSQ) * sq->n);
1202 sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
1203 if (tmpsq->ss != NULL) {
1204 memcpy(sq->ss + 1, tmpsq->ss + sq->start, sizeof(char) * sq->n);
1205 sq->ss[sq->n+1] = '\0';
1206 }
1207 }
1208 if (W < 0 && (status = esl_sq_ReverseComplement(sq)) != eslOK)
1209 ESL_XFAIL(eslEINVAL, ascii->errbuf, "Can't reverse complement that sequence window");
1210
1211 /* Copy annotation */
1212 if ((status = esl_sq_SetName (sq, tmpsq->name)) != eslOK) goto ERROR;
1213 if ((status = esl_sq_SetSource (sq, tmpsq->name)) != eslOK) goto ERROR;
1214 if ((status = esl_sq_SetAccession(sq, tmpsq->acc)) != eslOK) goto ERROR;
1215 if ((status = esl_sq_SetDesc (sq, tmpsq->desc)) != eslOK) goto ERROR;
1216 sq->roff = -1;
1217 sq->doff = -1;
1218 sq->eoff = -1;
1219 sq->hoff = -1;
1220
1221 esl_sq_Destroy(tmpsq);
1222 return eslOK;
1223 }
1224
1225 /* Now for the normal case: we're reading a normal unaligned seq file, not an alignment. */
1226 /* Negative W indicates reverse complement direction */
1227 if (W < 0)
1228 {
1229 if (sq->L == -1) ESL_EXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand");
1230
1231 if (sq->end == 1 || sq->L == 0)
1232 { /* last end == 1 means last window was the final one on reverse strand,
1233 * so we're EOD; jump back to last forward position.
1234 *
1235 * Also check for the unusual case of sq->L == 0, a completely empty sequence:
1236 * in that case, immediately return eslEOD.
1237 */
1238 if (ascii->bookmark_offset > 0)
1239 {
1240 if (esl_sqfile_Position(sqfp, ascii->bookmark_offset) != eslOK)
1241 ESL_EXCEPTION(eslECORRUPT, "Failed to reposition seq file at last forward bookmark");
1242 ascii->linenumber = ascii->bookmark_linenum;
1243 }
1244 else
1245 ascii->nc = 0; /* signals EOF */
1246
1247 ascii->bookmark_offset = 0;
1248 ascii->bookmark_linenum = 0;
1249
1250 sq->start = 0;
1251 sq->end = 0;
1252 sq->C = 0;
1253 sq->W = 0;
1254 sq->n = 0;
1255 /* sq->L stays as it is */
1256 if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
1257 else sq->seq[0] = '\0';
1258 return eslEOD;
1259 }
1260
1261 /* If sq->start == 0, we haven't read any reverse windows yet;
1262 * init reading from sq->L
1263 */
1264 W = -W;
1265 if (sq->start == 0)
1266 {
1267 sq->start = ESL_MAX(1, (sq->L - W + 1));
1268 sq->end = sq->L;
1269 sq->C = 0;
1270 sq->W = sq->end - sq->start + 1;
1271 ascii->curbpl = -1;
1272 ascii->currpl = -1;
1273 ascii->prvbpl = -1;
1274 ascii->prvrpl = -1;
1275 ascii->linenumber = -1;
1276 ascii->L = -1;
1277 }
1278 else
1279 { /* Else, we're continuing to next window; prv was <end>..<start> */
1280 sq->C = ESL_MIN(C, sq->L - sq->end + 1); /* based on prev window's end */
1281 sq->end = sq->end + sq->C - 1; /* also based on prev end */
1282 sq->start = ESL_MAX(1, (sq->end - W - sq->C + 1));
1283 sq->W = sq->end - sq->start + 1 - sq->C;
1284 }
1285
1286 /* Now position for a subseq fetch of <start..end> on fwd strand, using SSI offset calc */
1287 ESL_DASSERT1(( sq->doff != 0 ));
1288 if (ascii->bpl == 0 || ascii->rpl == 0) /* no help; brute force resolution. */
1289 {
1290 offset = sq->doff;
1291 actual_start = 1;
1292 }
1293 else if (ascii->bpl == ascii->rpl+1) /* residue resolution */
1294 {
1295 line = (sq->start-1) / ascii->rpl; /* data line #0.. that <end> is on */
1296 offset = sq->doff + line * ascii->bpl + (sq->start-1)%ascii->rpl;
1297 actual_start = sq->start;
1298 }
1299 else /* line resolution */
1300 {
1301 line = (sq->start-1) / ascii->rpl; /* data line #0.. that <end> is on */
1302 offset = sq->doff + line * ascii->bpl;
1303 actual_start = 1 + line * ascii->rpl;
1304 }
1305
1306 if (esl_sqfile_Position(sqfp, offset) != eslOK)
1307 ESL_EXCEPTION(eslECORRUPT, "Failed to reposition seq file for reverse window read");
1308
1309 /* grab the subseq and rev comp it */
1310 if ((status = esl_sq_GrowTo(sq, sq->C+sq->W)) != eslOK) return status;
1311 sq->n = 0;
1312 status = read_nres(sqfp, sq, (sq->start - actual_start), (sq->end - sq->start + 1), &nres);
1313
1314 if (status != eslOK || nres < (sq->end - sq->start + 1))
1315 ESL_EXCEPTION(eslECORRUPT, "Failed to extract %d..%d", sq->start, sq->end);
1316
1317 status = esl_sq_ReverseComplement(sq);
1318 if (status == eslEINVAL) ESL_FAIL(eslEINVAL, ascii->errbuf, "can't reverse complement that seq - it's not DNA/RNA");
1319 else if (status != eslOK) return status;
1320
1321 return eslOK;
1322 }
1323
1324 /* Else, we're reading the forward strand */
1325 else
1326 { /* sq->start == 0 means we haven't read any windows on this sequence yet...
1327 * it's a new record, and we need to initialize with the header and
1328 * the first window. This is the only case that we're allowed to return
1329 * EOF from.
1330 */
1331 if (sq->start == 0)
1332 {
1333 if (ascii->nc == 0) return eslEOF;
1334 if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
1335 sq->start = 1;
1336 sq->C = 0;/* no context in first window */
1337 sq->L = -1;/* won't be known 'til EOD. */
1338 ascii->L = 0;/* init to 0, so we can count residues as we go */
1339 esl_sq_SetSource(sq, sq->name);
1340 /* the <ascii->buf> is now positioned at the start of seq data */
1341 /* ascii->linenumber is ok where it is */
1342 /* the header_*() routines initialized rpl,bpl bookkeeping at start of seq line,
1343 * and also sq->doff,roff.
1344 */
1345 }
1346 else
1347 { /* else we're reading a window other than first; slide context over. */
1348 sq->C = ESL_MIN(C, sq->n);
1349
1350 /* if the case where the window is smaller than the context and the
1351 * context is not full, it is not necessary to move the context part
1352 * of the sequence that has been read in.
1353 */
1354 if (sq->C >= C) {
1355 /* now handle the case where the context is full */
1356 if (sq->seq != NULL) memmove(sq->seq, sq->seq + sq->n - sq->C, sq->C);
1357 else memmove(sq->dsq+1, sq->dsq + sq->n - sq->C + 1, sq->C);
1358 sq->start = ascii->L - sq->C + 1;
1359 sq->n = C;
1360 }
1361 }
1362
1363 if ((status = esl_sq_GrowTo(sq, C+W)) != eslOK) return status; /* EMEM */
1364 status = read_nres(sqfp, sq, 0, W, &nres);
1365 ascii->L += nres;
1366
1367 if (status == eslEOD)
1368 { /* Forward strand is done. 0 residues were read. Return eslEOD and an empty (info) <sq>. */
1369 if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
1370
1371 sq->start = 0;
1372 sq->end = 0;
1373 sq->C = 0;
1374 sq->W = 0;
1375 sq->L = ascii->L;
1376 sq->n = 0;
1377
1378 if (ascii->nc > 0) {
1379 ascii->bookmark_offset = ascii->boff+ascii->bpos; /* remember where the next seq starts. */
1380 //ascii->bookmark_linenum = ascii->bookmark_linenum;
1381 } else {
1382 ascii->bookmark_offset = 0; /* signals for EOF, no more seqs */
1383 ascii->bookmark_linenum = 0;
1384 }
1385
1386 if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; /* erase the saved context */
1387 else sq->seq[0] = '\0';
1388 return eslEOD;
1389 }
1390 else if (status == eslOK)
1391 { /* Forward strand is still in progress. <= W residues were read. Return eslOK. */
1392 sq->end = sq->start + sq->C + nres - 1;
1393 sq->W = nres;
1394 return eslOK;
1395 }
1396 else return status;/* EFORMAT,EMEM */
1397 }
1398 /*NOTREACHED*/
1399 return eslOK;
1400
1401 ERROR:
1402 if (tmpsq != NULL) esl_sq_Destroy(tmpsq);
1403 return status;
1404 }
1405
1406 /* Function: sqascii_ReadBlock()
1407 * Synopsis: Read the next block of sequences from a file.
1408 *
1409 * Purpose: Reads a block of sequences from open sequence file <sqfp> into
1410 * <sqBlock>.
1411 *
1412 * In the case that <long_target> is false, the sequences are
1413 * expected to be protein - individual sequences won't be long
1414 * so read them in one-whole-sequence at a time. If <max_sequences> is set
1415 * to a number > 0 read <max_sequences> sequences, up to at most
1416 * MAX_RESIDUE_COUNT residues.
1417 *
1418 * If <long_target> is true, the sequences are expected to be DNA.
1419 * Because sequences in a DNA database can exceed MAX_RESIDUE_COUNT,
1420 * this function uses ReadWindow to read chunks of sequence no
1421 * larger than <max_residues>, and must allow for the possibility that a
1422 * request will be made to continue reading a partly-read
1423 * sequence. This case also respects the <max_sequences> limit.
1424 *
1425 * Returns: <eslOK> on success; the new sequence is stored in <sqBlock>.
1426 *
1427 * Returns <eslEOF> when there is no sequence left in the
1428 * file (including first attempt to read an empty file).
1429 *
1430 * Returns <eslEFORMAT> if there's a problem with the format,
1431 * such as an illegal character; the line number that the parse
1432 * error occurs on is in <sqfp->linenumber>, and an informative
1433 * error message is placed in <ascii->errbuf>.
1434 *
1435 * Throws: <eslEMEM> on allocation failure;
1436 * <eslEINCONCEIVABLE> on internal error.
1437 */
1438 static int
sqascii_ReadBlock(ESL_SQFILE * sqfp,ESL_SQ_BLOCK * sqBlock,int max_residues,int max_sequences,int long_target)1439 sqascii_ReadBlock(ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target)
1440 {
1441 int i = 0;
1442 int size = 0;
1443 int status = eslOK;
1444 ESL_SQ *tmpsq = NULL;
1445
1446 sqBlock->count = 0;
1447 if (max_sequences < 1 || max_sequences > sqBlock->listSize)
1448 max_sequences = sqBlock->listSize;
1449
1450
1451 if ( !long_target )
1452 { /* in these cases, an individual sequence won't ever be really long,
1453 so just read in a sequence at a time */
1454
1455 for (i = 0; i < max_sequences && size < MAX_RESIDUE_COUNT; ++i)
1456 {
1457 status = sqascii_Read(sqfp, sqBlock->list + i);
1458
1459 if (status != eslOK) break;
1460 size += sqBlock->list[i].n;
1461 ++sqBlock->count;
1462 }
1463 }
1464 else
1465 { /* DNA, not an alignment. Might be really long sequences */
1466
1467 if (max_residues < 1)
1468 max_residues = MAX_RESIDUE_COUNT;
1469
1470 tmpsq = esl_sq_CreateDigital(sqBlock->list->abc);
1471 //if complete flag is set to FALSE, then the prior block must have ended with a window that was a possibly
1472 //incomplete part of it's full sequence. Read another overlapping window.
1473 if (! sqBlock->complete )
1474 {
1475 //overloading C as indicator of how big C should be for this window reading action
1476 status = sqascii_ReadWindow(sqfp, sqBlock->list->C, max_residues, sqBlock->list);
1477 if (status == eslOK)
1478 {
1479 sqBlock->count = i = 1;
1480 size = sqBlock->list->n - sqBlock->list->C;
1481 sqBlock->list->L = sqfp->data.ascii.L;
1482 if (size == max_residues)
1483 { // Filled the block with a single very long window.
1484
1485 sqBlock->complete = FALSE; // default value, unless overridden below
1486 status = skip_whitespace(sqfp);
1487 if ( status != eslOK ) { // either EOD or end of buffer (EOF) was reached before the next character was seen
1488 sqBlock->complete = TRUE;
1489 status = eslOK;
1490 }
1491
1492 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1493 return status;
1494 }
1495 else
1496 {
1497 // Burn off EOD (see notes for similar entry ~25 lines below), then go fetch the next sequence
1498 esl_sq_Reuse(tmpsq);
1499 tmpsq->start = sqBlock->list->start ;
1500 tmpsq->C = 0;
1501 status = sqascii_ReadWindow(sqfp, 0, max_residues, tmpsq);
1502 if (status != eslEOD) {
1503 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1504 return status; //surprising
1505 }
1506 //sqBlock->list->L = tmpsq->L;
1507 }
1508 }
1509 else if (status == eslEOD)
1510 { // turns out there isn't any more of the sequence to read, after all
1511 }
1512 else
1513 {
1514 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1515 return status;
1516 }
1517 } // otherwise, just start at the beginning
1518
1519
1520 for ( ; i < max_sequences && size < max_residues; ++i) {
1521 /* restricted request_size is used to ensure that all blocks are pretty close to the
1522 * same size. Without it, we may either naively keep asking for max_residue windows,
1523 * which can result in a window with ~2*max_residues ... or we can end up with absurdly
1524 * short fragments at the end of blocks
1525 */
1526 int request_size = ESL_MAX(max_residues-size, max_residues * .05);
1527
1528 esl_sq_Reuse(tmpsq);
1529 esl_sq_Reuse(sqBlock->list + i);
1530
1531 status = sqascii_ReadWindow(sqfp, 0, request_size , tmpsq);
1532 esl_sq_Copy(tmpsq, sqBlock->list +i);
1533 if (status != eslOK && status != eslEOD){
1534 break;
1535 } /* end of sequences (eslEOF), or we read an empty seq (eslEOD) or error (other) */
1536 size += sqBlock->list[i].n - sqBlock->list[i].C;
1537 sqBlock->list[i].L = sqfp->data.ascii.L;
1538 ++(sqBlock->count);
1539
1540 if (size >= max_residues) {
1541 // a full window worth of sequence has been read; did we reach the end of the final sequence in the block?
1542 sqBlock->complete = FALSE; // default value, unless overridden below
1543
1544 status = skip_whitespace(sqfp);
1545 if ( status != eslOK ) { // either EOD or end of buffer (EOF) was reached before the next character was seen
1546 sqBlock->complete = TRUE;
1547 status = eslOK;
1548 }
1549
1550 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1551 return status;
1552 } else if(status == eslEOD) {
1553 /* We've read an empty sequence of length 0, rare, but
1554 * possible, and we need to be able to handle it
1555 * gracefully. Ensure L is 0, set status to eslOK and move
1556 * on, we've already incremented sqBlock->count by 1
1557 * above. This means our block may contain zero-length
1558 * sequences when we return (that is, we still add these
1559 * seqs onto the block instead of skipping them altogether).
1560 */
1561 sqBlock->list[i].L = 0; /* actually, this should already be 0... */
1562 status = eslOK;
1563 } else {
1564 /* Sequence finished, but haven't yet reached max_residues. Need to burn off the EOD value
1565 that will be returned by the next ReadWindow call. Can just use a tmp sq, after setting
1566 a couple values ReadWindow needs to see for correct processing.
1567 */
1568 esl_sq_Reuse(tmpsq);
1569 tmpsq->start = sqBlock->list[i].start ;
1570 tmpsq->C = 0;
1571 status = sqascii_ReadWindow(sqfp, 0, max_residues, tmpsq);
1572
1573 if (status != eslEOD) {
1574 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1575 return status; //surprising
1576 }
1577 //sqBlock->list[i].L = tmpsq->L;
1578 status = eslOK;
1579 }
1580 }
1581 }
1582
1583 /* EOF will be returned only in the case were no sequences were read */
1584 if (status == eslEOF && i > 0) status = eslOK;
1585
1586 sqBlock->complete = TRUE;
1587
1588 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1589
1590 return status;
1591 }
1592
1593 /* Function: sqascii_Echo()
1594 * Synopsis: Echo a sequence's record onto output stream.
1595 *
1596 * Purpose: Given a complete <sq> that we have read by some means
1597 * from an open <sqfp>; echo that sequence's record
1598 * onto the output stream <ofp>.
1599 *
1600 * This allows records to be regurgitated exactly as they
1601 * appear, rather than writing the subset of information
1602 * stored in an <ESL_SQ>. <esl-sfetch> in the miniapps uses
1603 * this, for example.
1604 *
1605 * Because this relies on repositioning the <sqfp>, it
1606 * cannot be called on non-positionable streams (stdin or
1607 * gzipped files). Because it relies on the sequence lying
1608 * in a contiguous sequence of bytes in the file, it cannot
1609 * be called on a sequence in a multiple alignment file.
1610 * Trying to do so throws an <eslEINVAL> exception.
1611 *
1612 * Returns: <eslOK> on success.
1613 *
1614 * Throws: <eslEINVAL> if <sqfp> isn't a repositionable sequence file.
1615 * <eslECORRUPT> if we run out of data, probably from bad offsets
1616 * <eslEMEM> on allocation failure.
1617 * <eslESYS> on system call failures.
1618 *
1619 *
1620 */
1621 static int
sqascii_Echo(ESL_SQFILE * sqfp,const ESL_SQ * sq,FILE * ofp)1622 sqascii_Echo(ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp)
1623 {
1624 int status;
1625 int64_t save_linenumber;
1626 int save_currpl;
1627 int save_curbpl;
1628 int save_prvrpl;
1629 int save_prvbpl;
1630 int64_t save_L;
1631 int n;
1632 int nwritten;
1633
1634 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1635
1636 if (ascii->do_stdin) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from standard input");
1637 if (ascii->do_gzip) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from a gzipped file");
1638 if (esl_sqio_IsAlignment(sqfp->format)) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from an alignment file");
1639 if (sq->roff == -1 || sq->eoff == -1) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence without disk offset info");
1640
1641 save_linenumber = ascii->linenumber;
1642 save_currpl = ascii->currpl;
1643 save_curbpl = ascii->curbpl;
1644 save_prvrpl = ascii->prvrpl;
1645 save_prvbpl = ascii->prvbpl;
1646 save_L = ascii->L;
1647
1648 status = esl_sqfile_Position(sqfp, sq->roff);
1649 if (status == eslEOF) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?");
1650 else if (status != eslOK) return status;
1651
1652 while (ascii->boff + ascii->nc <= sq->eoff)
1653 {
1654 if (fwrite(ascii->buf, sizeof(char), ascii->nc, ofp) != ascii->nc) ESL_EXCEPTION(eslESYS, "fwrite() failed");
1655 if (loadbuf(sqfp) != eslOK) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?");
1656 }
1657 n = sq->eoff - ascii->boff + 1;
1658 nwritten = fwrite(ascii->buf, sizeof(char), n, ofp);
1659 if (nwritten != n) ESL_EXCEPTION(eslESYS, "fwrite() failed");
1660
1661 status = esl_sqfile_Position(sqfp, sq->roff);
1662 if (status == eslEOF) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?");
1663 else if (status != eslOK) return status;
1664
1665 ascii->linenumber = save_linenumber;
1666 ascii->currpl = save_currpl;
1667 ascii->curbpl = save_curbpl;
1668 ascii->prvrpl = save_prvrpl;
1669 ascii->prvbpl = save_prvbpl;
1670 ascii->L = save_L;
1671 return eslOK;
1672 }
1673 /*------------------ end, sequential sequence input -------------*/
1674
1675
1676 /*****************************************************************
1677 *# 5. Sequence/subsequence fetching, random access [with <ssi>]
1678 *****************************************************************/
1679
1680 /* Function: sqascii_OpenSSI()
1681 * Synopsis: Opens an SSI index associated with a sequence file.
1682 *
1683 * Purpose: Opens an SSI index file associated with the already open
1684 * sequence file <sqfp>. If successful, the necessary
1685 * information about the open SSI file is stored internally
1686 * in <sqfp>.
1687 *
1688 * The SSI index file name is determined in one of two
1689 * ways, depending on whether a non-<NULL> <ssifile_hint>
1690 * is provided.
1691 *
1692 * If <ssifile_hint> is <NULL>, the default for
1693 * constructing the SSI filename from the sequence
1694 * filename, by using exactly the same path (if any) for
1695 * the sequence filename, and appending the suffix <.ssi>.
1696 * For example, the SSI index for <foo> is <foo.ssi>, for
1697 * <./foo.fa> is <./foo.fa.ssi>, and for
1698 * </my/path/to/foo.1.fa> is </my/path/to/foo.1.fa.ssi>.
1699 *
1700 * If <ssifile_hint> is <non-NULL>, this exact fully
1701 * qualified path is used as the SSI file name.
1702 *
1703 * Returns: <eslOK> on success, and <sqfp->ssi> is now internally
1704 * valid.
1705 *
1706 * <eslENOTFOUND> if no SSI index file is found;
1707 * <eslEFORMAT> if it's found, but appears to be in incorrect format;
1708 * <eslERANGE> if the SSI file uses 64-bit offsets but we're on
1709 * a system that doesn't support 64-bit file offsets.
1710 *
1711 * Throws: <eslEINVAL> if the open sequence file <sqfp> doesn't
1712 * correspond to a normal sequence flatfile -- we can't
1713 * random access in .gz compressed files, standard input,
1714 * or multiple alignment files that we're reading
1715 * sequentially.
1716 *
1717 * Throws <eslEMEM> on allocation error.
1718 */
1719 static int
sqascii_OpenSSI(ESL_SQFILE * sqfp,const char * ssifile_hint)1720 sqascii_OpenSSI(ESL_SQFILE *sqfp, const char *ssifile_hint)
1721 {
1722 int status;
1723
1724 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1725
1726 if (ascii->do_gzip) ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for a .gz compressed seq file");
1727 if (ascii->do_stdin) ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for standard input");
1728 if (ascii->afp != NULL) ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for sequential input from an MSA");
1729
1730 if (ssifile_hint == NULL) {
1731 if ((status = esl_strdup(sqfp->filename, -1, &(ascii->ssifile))) != eslOK) return status;
1732 if ((status = esl_strcat(&(ascii->ssifile), -1, ".ssi", 4)) != eslOK) return status;
1733 } else {
1734 if ((status = esl_strdup(ssifile_hint, -1, &(ascii->ssifile))) != eslOK) return status;
1735 }
1736
1737 return esl_ssi_Open(ascii->ssifile, &(ascii->ssi));
1738 }
1739
1740
1741
1742 /* Function: sqascii_PositionByKey()
1743 * Synopsis: Use SSI to reposition seq file to a particular sequence.
1744 *
1745 * Purpose: Reposition <sqfp> so that the next sequence we read will
1746 * be the one named (or accessioned) <key>.
1747 *
1748 * <sqfp->linenumber> is reset to be relative to the start
1749 * of the record named <key>, rather than the start of the
1750 * file.
1751 *
1752 * Returns: <eslOK> on success, and the file <sqfp> is repositioned
1753 * so that the next <esl_sqio_Read()> call will read the
1754 * sequence named <key>.
1755 *
1756 * Returns <eslENOTFOUND> if <key> isn't found in the
1757 * index; in this case, the position of <sqfp> in the file
1758 * is unchanged.
1759 *
1760 * Returns <eslEFORMAT> if something goes wrong trying to
1761 * read the index, almost certainly indicating a format
1762 * problem in the SSI file.
1763 *
1764 * Returns <eslEOF> if, after repositioning, we fail to
1765 * load the next line or buffer from the sequence file;
1766 * this probably also indicates a format problem in the SSI
1767 * file.
1768 *
1769 * Throws: <eslEMEM> on allocation error;
1770 * <eslEINVAL> if there's no open SSI index in <sqfp>;
1771 * <eslESYS> if the <fseek()> fails.
1772 *
1773 * In all these cases, the state of <sqfp> becomes
1774 * undefined, and the caller should not use it again.
1775 */
1776 static int
sqascii_PositionByKey(ESL_SQFILE * sqfp,const char * key)1777 sqascii_PositionByKey(ESL_SQFILE *sqfp, const char *key)
1778 {
1779 uint16_t fh;
1780 off_t offset;
1781 int status;
1782
1783 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1784
1785 if (ascii->ssi == NULL) ESL_EXCEPTION(eslEINVAL,"Need an open SSI index to call esl_sqfile_PositionByKey()");
1786 if ((status = esl_ssi_FindName(ascii->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status;
1787 return esl_sqfile_Position(sqfp, offset);
1788 }
1789
1790
1791 /* Function: sqascii_PositionByNumber()
1792 * Synopsis: Use SSI to reposition by sequence number
1793 *
1794 * Purpose: Reposition <sqfp> so that the next sequence we
1795 * read will be the <which>'th sequence, where <which>
1796 * is <0..sqfp->ssi->nprimary-1>.
1797 *
1798 * <sqfp->linenumber> is reset to be relative to the start
1799 * of the record named <key>, rather than the start of the
1800 * file.
1801 *
1802 * Returns: <eslOK> on success, and the file <sqfp> is repositioned.
1803 *
1804 * Returns <eslENOTFOUND> if there is no sequence number
1805 * <which> in the index; in this case, the position of
1806 * <sqfp> in the file is unchanged.
1807 *
1808 * Returns <eslEFORMAT> if something goes wrong trying to
1809 * read the index, almost certainly indicating a format
1810 * problem in the SSI file.
1811 *
1812 * Returns <eslEOF> if, after repositioning, we fail to
1813 * load the next line or buffer from the sequence file;
1814 * this probably also indicates a format problem in the SSI
1815 * file.
1816 *
1817 * Throws: <eslEMEM> on allocation error;
1818 * <eslEINVAL> if there's no open SSI index in <sqfp>;
1819 * <eslESYS> if the <fseek()> fails.
1820 *
1821 * In all these cases, the state of <sqfp> becomes
1822 * undefined, and the caller should not use it again.
1823 */
1824 static int
sqascii_PositionByNumber(ESL_SQFILE * sqfp,int which)1825 sqascii_PositionByNumber(ESL_SQFILE *sqfp, int which)
1826 {
1827 uint16_t fh;
1828 off_t offset;
1829 int status;
1830
1831 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1832
1833 if (ascii->ssi == NULL) ESL_EXCEPTION(eslEINVAL,"Need open SSI index to call esl_sqfile_PositionByNumber()");
1834 if ((status = esl_ssi_FindNumber(ascii->ssi, which, &fh, &offset, NULL, NULL, NULL)) != eslOK) return status;
1835 return esl_sqfile_Position(sqfp, offset);
1836 }
1837
1838
1839 /* Function: sqascii_Fetch()
1840 * Synopsis: Fetch a complete sequence, using SSI indexing.
1841 *
1842 * Purpose: Fetch a sequence named (or accessioned) <key> from
1843 * the repositionable, open sequence file <sqfp>.
1844 * The open <sqfp> must have an open SSI index.
1845 * The sequence is returned in <sq>.
1846 *
1847 * Returns: <eslOK> on soccess.
1848 * <eslEINVAL> if no SSI index is present, or if <sqfp> can't
1849 * be repositioned.
1850 * <eslENOTFOUND> if <source> isn't found in the file.
1851 * <eslEFORMAT> if either the index file or the sequence file
1852 * can't be parsed, because of unexpected format issues.
1853 *
1854 * Throws: <eslEMEM> on allocation error.
1855 */
1856 static int
sqascii_Fetch(ESL_SQFILE * sqfp,const char * key,ESL_SQ * sq)1857 sqascii_Fetch(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq)
1858 {
1859 int status;
1860
1861 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1862
1863 if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename);
1864 if ((status = sqascii_PositionByKey(sqfp, key)) != eslOK) return status;
1865 if ((status = sqascii_Read(sqfp, sq)) != eslOK) return status;
1866 return eslOK;
1867 }
1868
1869 /* Function: sqascii_FetchInfo()
1870 * Synopsis: Fetch a sequence's info, using SSI indexing.
1871 *
1872 * Purpose: Fetch a sequence named (or accessioned) <key> from
1873 * the repositionable, open sequence file <sqfp>, reading
1874 * all info except the sequence (and secondary structure).
1875 * The open <sqfp> must have an open SSI index.
1876 * The sequence info is returned in <sq>.
1877 *
1878 * Returns: <eslOK> on soccess.
1879 * <eslEINVAL> if no SSI index is present, or if <sqfp> can't
1880 * be repositioned.
1881 * <eslENOTFOUND> if <source> isn't found in the file.
1882 * <eslEFORMAT> if either the index file or the sequence file
1883 * can't be parsed, because of unexpected format issues.
1884 *
1885 * Throws: <eslEMEM> on allocation error.
1886 */
1887 static int
sqascii_FetchInfo(ESL_SQFILE * sqfp,const char * key,ESL_SQ * sq)1888 sqascii_FetchInfo(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq)
1889 {
1890 int status;
1891
1892 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1893
1894 if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename);
1895 if ((status = sqascii_PositionByKey(sqfp, key)) != eslOK) return status;
1896 if ((status = sqascii_ReadInfo(sqfp, sq)) != eslOK) return status;
1897 return eslOK;
1898 }
1899
1900
1901 /* Function: sqascii_FetchSubseq()
1902 * Synopsis: Fetch a subsequence, using SSI indexing.
1903 *
1904 * Purpose: Fetch subsequence <start..end> from a sequence named (or
1905 * accessioned) <source>, in the repositionable, open sequence file <sqfp>.
1906 * The open <sqfp> must have an SSI index. Put the
1907 * subsequence in <sq>.
1908 *
1909 * As a special case, if <end> is 0, the subsequence is
1910 * fetched all the way to the end, so you don't need to
1911 * look up the sequence length <L> to fetch a suffix.
1912 *
1913 * The caller may want to rename/reaccession/reannotate the
1914 * subsequence. Upon successful return, <sq->name> is set
1915 * to <source/start-end>, and <sq->source> is set to
1916 * <source> The accession and description <sq->acc> and
1917 * <sq->desc> are set to the accession and description of
1918 * the source sequence.
1919 *
1920 * Returns: <eslOK> on success.
1921 * <eslEINVAL> if no SSI index is present, or if <sqfp> can't
1922 * be repositioned.
1923 * <eslENOTFOUND> if <source> isn't found in the file.
1924 * <eslEFORMAT> if either the index file or the sequence file
1925 * can't be parsed, because of unexpected format issues.
1926 * <eslERANGE> if the <start..end> coords don't lie entirely
1927 * within the <source> sequence.
1928 *
1929 * Throws: <eslEMEM> on allocation errors.
1930 */
1931 static int
sqascii_FetchSubseq(ESL_SQFILE * sqfp,const char * source,int64_t start,int64_t end,ESL_SQ * sq)1932 sqascii_FetchSubseq(ESL_SQFILE *sqfp, const char *source, int64_t start, int64_t end, ESL_SQ *sq)
1933 {
1934 uint16_t fh;/* SSI file handle */
1935 off_t r_off, d_off;
1936 int64_t L;
1937 int64_t actual_start;
1938 int64_t nskip;
1939 int64_t nres;
1940 int64_t n;
1941 int status;
1942
1943 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1944
1945 if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename);
1946
1947 /* Find sequence info in the index */
1948 status = esl_ssi_FindSubseq(ascii->ssi, source, start, &fh, &r_off, &d_off, &L, &actual_start);
1949 if (status == eslENOTFOUND) ESL_FAIL(status, ascii->errbuf, "Didn't find sequence %s in the index", source);
1950 else if (status == eslEFORMAT) ESL_FAIL(status, ascii->errbuf, "Failure reading SSI index; corrupt or bad format");
1951 else if (status == eslERANGE) ESL_FAIL(status, ascii->errbuf, "Requested start %" PRIi64 " isn't in the sequence %s", start, source);
1952 else if (status != eslOK) ESL_FAIL(status, ascii->errbuf, "Unexpected failure in finding subseq offset");
1953
1954 /* The special case of end=0, asking for suffix fetch */
1955 if (end == 0) end = L;
1956
1957 /* Validate coords if we can */
1958 if (start > end) ESL_FAIL(eslERANGE, ascii->errbuf, "Subsequence start %" PRIi64 " is greater than end %" PRIi64 "\n", start, end);
1959 if (L > 0 && end > L) ESL_FAIL(eslERANGE, ascii->errbuf, "Subsequence end %" PRIi64 " is greater than length %" PRIi64 "\n", end, L);
1960
1961 /* Position the file at the record header; read the header info */
1962 status = esl_sqfile_Position(sqfp, r_off);
1963 if (status == eslEOF) ESL_FAIL(status, ascii->errbuf, "Position appears to be off the end of the file");
1964 else if (status == eslEINVAL) ESL_FAIL(status, ascii->errbuf, "Sequence file is not repositionable");
1965 else if (status != eslOK) ESL_FAIL(status, ascii->errbuf, "Failure in positioning sequence file");
1966 if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status;
1967
1968 /* Position the file close to the subseq: either at the start of the line
1969 * where the subseq starts, or exactly at the residue.
1970 */
1971 if (d_off != 0)
1972 {
1973 status = esl_sqfile_Position(sqfp, d_off);
1974 if (status == eslEOF) ESL_FAIL(eslERANGE, ascii->errbuf, "Position appears to be off the end of the file");
1975 else if (status == eslEINVAL) ESL_FAIL(status, ascii->errbuf, "Sequence file is not repositionable");
1976 else if (status != eslOK) ESL_FAIL(status, ascii->errbuf, "Failure in positioning sequence file");
1977 }
1978 /* even if we didn't have a data offset, we're positioned at the
1979 * start of the sequence anyway, because we parsed the full header
1980 */
1981 nskip = start - actual_start; /* how many residues do we still need to skip to reach start */
1982 nres = end - start + 1; /* how many residues do we need to read as subseq */
1983
1984 if ((status = esl_sq_GrowTo(sq, nres)) != eslOK) return status;
1985 status = read_nres(sqfp, sq, nskip, nres, &n);
1986 if (status != eslOK || n < nres) ESL_EXCEPTION(eslEINCONCEIVABLE, "Failed to fetch subsequence residues -- corrupt coords?");
1987
1988 /* Set the coords */
1989 sq->start = start;
1990 sq->end = end;
1991 sq->C = 0;
1992 sq->W = sq->n;
1993 sq->L = (L > 0 ? L : -1);
1994 esl_sq_FormatName(sq, "%s/%" PRId64 "-%" PRId64, source, start, end);
1995 esl_sq_SetSource (sq, source);
1996 return eslOK;
1997 }
1998 /*------------- end, random sequence access with SSI -------------------*/
1999
2000
2001 /*****************************************************************
2002 * 6. Internal routines shared by parsers
2003 *****************************************************************/
2004
2005
2006 /* loadmem()
2007 *
2008 * Load the next block of data from stream into mem buffer,
2009 * either concatenating to previous buffer (if we're recording) or
2010 * overwriting (if not).
2011 *
2012 * This block is loaded at sqfp->mem + sqfp->mpos.
2013 *
2014 * Upon return:
2015 * sqfp->mem now contains up to eslREADBUFSIZE more chars
2016 * sqfp->mpos is position of first byte in newly read block
2017 * sqfp->allocm may have increased by eslREADBUFSIZE, if we concatenated
2018 * sqfp->mn is # of chars in <mem>; <mn-1> is pos of last byte in new block
2019 *
2020 * Returns <eslEOF> (and mpos == mn) if no new data can be read;
2021 * Returns <eslOK> (and mpos < mn) if new data is read.
2022 * Throws <eslEMEM> on allocation error.
2023 */
2024 static int
loadmem(ESL_SQFILE * sqfp)2025 loadmem(ESL_SQFILE *sqfp)
2026 {
2027 void *tmp;
2028 int n = 0;
2029 int status;
2030
2031 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2032
2033 if (ascii->do_buffer)
2034 {
2035 ascii->mpos = 0;
2036 ascii->mn = 0;
2037 }
2038 else if (ascii->is_recording == TRUE)
2039 {
2040 if (ascii->mem == NULL) ascii->moff = ftello(ascii->fp); /* first time init of the offset */
2041 ESL_RALLOC(ascii->mem, tmp, sizeof(char) * (ascii->allocm + eslREADBUFSIZE));
2042 ascii->allocm += eslREADBUFSIZE;
2043 n = fread(ascii->mem + ascii->mpos, sizeof(char), eslREADBUFSIZE, ascii->fp);
2044 ascii->mn += n;
2045 }
2046 else
2047 {
2048 if (ascii->mem == NULL) {
2049 ESL_ALLOC(ascii->mem, sizeof(char) * eslREADBUFSIZE);
2050 ascii->allocm = eslREADBUFSIZE;
2051 }
2052 ascii->is_recording = -1;/* no more recording is possible now */
2053 ascii->mpos = 0;
2054 ascii->moff = ftello(ascii->fp);
2055 n = fread(ascii->mem, sizeof(char), eslREADBUFSIZE, ascii->fp); /* see note [1] below */
2056 ascii->mn = n;
2057 }
2058 return (n == 0 ? eslEOF : eslOK);
2059
2060 ERROR:
2061 return status;
2062 }
2063
2064 /* [1] Be alert for a possible problem above in that fread().
2065 * Farrar had inserted an alternative case as follows:
2066 * "If we are reading from stdin, buffered read cannot be used
2067 * because if will block until EOF or the buffer is full, ie
2068 * eslREADBUFSIZE characters have been read. Usually this would
2069 * not be a problem, unless stdin is from a pipe. In that case
2070 * if the sequence is less than eslREADBUFSIZE we would block.
2071 *
2072 * NOTE: any changes to the IO stream ascii->fp, such as fseek,
2073 * might not have any affect on the file descriptor for the stream.
2074 *
2075 * if (ascii->do_stdin) {
2076 * n = read(fileno(ascii->fp), ascii->mem, eslREADBUFSIZE);
2077 * } else {
2078 * ...
2079 *
2080 * but that's a bug, because you can't mix read and fread;
2081 * the i17-stdin.pl test fails, in particular.
2082 */
2083
2084
2085
2086
2087 /* loadbuf()
2088 * Set sqfp->buf to contain next line of data, or point to next block.
2089 * This might just mean working with previously buffered memory in <sqfp->mem>
2090 * or might require reading new data from <sqfp->fp>.
2091 *
2092 * Reset sqfp->boff to be the position of the start of the block/line.
2093 * Reset sqfp->bpos to 0.
2094 * Reset sqfp->nc to the number of chars (bytes) in the new block/line.
2095 * Returns eslOK on success; eslEOF if there's no more data in the file.
2096 * (sqfp->nc == 0 is the same as eslEOF: no data in the new buffer.)
2097 * Can throw an <eslEMEM> error.
2098 */
2099 static int
loadbuf(ESL_SQFILE * sqfp)2100 loadbuf(ESL_SQFILE *sqfp)
2101 {
2102 void *tmp;
2103 char *nlp;
2104 int n;
2105 int status = eslOK;
2106
2107 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2108
2109 if (! ascii->is_linebased)
2110 {
2111 if (ascii->mpos >= ascii->mn) {
2112 if ((status = loadmem(sqfp)) == eslEMEM) return status;
2113 }
2114 ascii->buf = ascii->mem + ascii->mpos;
2115 ascii->boff = ascii->moff + ascii->mpos;
2116 ascii->balloc = 0;
2117 ascii->bpos = 0;
2118 ascii->nc = ascii->mn - ascii->mpos;
2119 ascii->mpos += ascii->nc;
2120 }
2121 else
2122 { /* Copy next line from <mem> into <buf>. Might require new load(s) into <mem>. */
2123 if (ascii->mpos >= ascii->mn) {
2124 if ((status = loadmem(sqfp)) == eslEMEM) return status;
2125 }
2126 ascii->boff = ascii->moff + ascii->mpos;
2127 ascii->nc = 0;
2128 nlp = memchr(ascii->mem + ascii->mpos, '\n', ascii->mn - ascii->mpos);
2129 while (nlp == NULL)
2130 {
2131 n = ascii->mn - ascii->mpos;
2132 while (ascii->nc + n + 1 > ascii->balloc) { /* +1: it'll hold the terminal \0 */
2133 ESL_RALLOC(ascii->buf, tmp, sizeof(char) * (ascii->balloc + eslREADBUFSIZE));
2134 ascii->balloc += eslREADBUFSIZE;
2135 }
2136 memcpy(ascii->buf + ascii->nc, ascii->mem + ascii->mpos, n);
2137 ascii->mpos += n;
2138 ascii->nc += n;
2139 status = loadmem(sqfp);
2140 if (status == eslEOF) { break; }
2141 else if (status != eslOK) return status;
2142 nlp = memchr(ascii->mem + ascii->mpos, '\n', ascii->mn - ascii->mpos);
2143 }
2144 if (status != eslEOF) {
2145 n = nlp - (ascii->mem + ascii->mpos) + 1; /* inclusive of \n */
2146 if (ascii->nc + n + 1 > ascii->balloc) {
2147 ESL_RALLOC(ascii->buf, tmp, sizeof(char) * (ascii->balloc + eslREADBUFSIZE));
2148 ascii->balloc += eslREADBUFSIZE;
2149 }
2150 memcpy(ascii->buf + ascii->nc, ascii->mem + ascii->mpos, n);
2151 ascii->mpos += n;
2152 ascii->nc += n;
2153 }
2154 ascii->bpos = 0;
2155 ascii->buf[ascii->nc] = '\0';
2156 }
2157 return (ascii->nc == 0 ? eslEOF : eslOK);
2158
2159 ERROR:
2160 return status;
2161 }
2162
2163 /* nextchar()
2164 *
2165 * Load next char from sqfp->buf into <*ret_c> and sets sqfp->bpos to
2166 * its position; usually this is c = sqfp->buf[++sqfp->bpos], but
2167 * we will refill the buffer w/ fresh fread() when needed, in which
2168 * case c = sqfp->buf[0] and sqfp->bpos = 0.
2169 *
2170 * Returns <eslOK> on success.
2171 * Return <eslEOF> if we ran out of data in <sqfp>.
2172 * May throw an <eslEMEM> error.
2173 */
2174 static int
nextchar(ESL_SQFILE * sqfp,char * ret_c)2175 nextchar(ESL_SQFILE *sqfp, char *ret_c)
2176 {
2177 int status;
2178
2179 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2180
2181 ascii->bpos++;
2182 if (ascii->nc == ascii->bpos && (status = loadbuf(sqfp)) != eslOK) return status;
2183 *ret_c = ascii->buf[ascii->bpos];
2184 return eslOK;
2185 }
2186
2187 /* seebuf()
2188 *
2189 * Examine and validate the current buffer <sqfp->buf> from its
2190 * current position <sqfp->bpos> until either the buffer ends (we run
2191 * out of characters) or the sequence data ends (we see whatever
2192 * character indicates EOD in this format) or we've seen <maxn>
2193 * residues. If <maxn> is passed as -1, parse the entire buffer,
2194 * without a residue limit.
2195 *
2196 * There are three possible outcomes:
2197 * <eslOK>: The buffer is all residues that belong to the current
2198 * seq we're parsing (or chars we can ignore), at least
2199 * up to the <maxn> residue limit (if present).
2200 * <eslEOD>: Part of the buffer may be residues, but the current sequence
2201 * ends in this buffer (before <maxn> was reached).
2202 * <eslEFORMAT>: Somewhere before we reached the end of the buffer or
2203 * the sequence record, we saw an illegal character.
2204 *
2205 * On <eslOK>:
2206 * *opt_nres is the number of residues in the buffer (up to <maxn>)
2207 * *opt_endpos is sqfp->nc (off the end of the buffer by one)
2208 * The caller will want to deal with the buffer, then load the next one.
2209 *
2210 * On <eslEOD>: same as OK, except:
2211 * *opt_endpos is where sqfp->bpos *would* be at when we saw the EOD
2212 * signal (the next '>', in FASTA files) had we been parsing residues
2213 * Therefore on EOD, the caller will want to deal with the <*opt_nres>
2214 * residues in this buffer, then reposition the buffer by
2215 * <sqfp->bpos = *opt_epos> (without reloading the buffer), so
2216 * the next read will pick up there.
2217 *
2218 * On <eslEFORMAT>:
2219 * ascii->errbuf contains informative message about the format error.
2220 *
2221 * seebuf() also handles linenumber and SSI bookkeeping in
2222 * <sqfp>. Every newline character seen increments <linenumber> (thus,
2223 * on EFORMAT return, linenumber is set to the line on which the bad
2224 * char occurred). <curbpl>,<currpl>,<prvbpl>,<prvrpl> keep track of # of bytes,
2225 * residues on the current,prev line; they keep state across calls to seebuf().
2226 * <bpl>,<rpl> are tracking whether there's a constant number of
2227 * bytes/residues per line; these are either -1 for "not set yet", 0
2228 * for "no, not constant", or a number > 0. Because of this bookkeeping, it's important
2229 * to make sure that <seebuf()> never counts the same byte twice (hence
2230 * the need for the <maxn> limit, which ReadWindow() uses.)
2231 */
2232 static int
seebuf(ESL_SQFILE * sqfp,int64_t maxn,int64_t * opt_nres,int64_t * opt_endpos)2233 seebuf(ESL_SQFILE *sqfp, int64_t maxn, int64_t *opt_nres, int64_t *opt_endpos)
2234 {
2235 int bpos;
2236 int64_t nres = 0;
2237 int64_t nres2 = 0;/* an optimization for determining lastrpl from nres, without incrementing lastrpl on every char */
2238 int sym;
2239 ESL_DSQ x;
2240 int lasteol;
2241 int status = eslOK;
2242
2243 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2244
2245 lasteol = ascii->bpos - 1;
2246 if (maxn == -1) maxn = ascii->nc; /* makes for a more efficient test. nc is a guaranteed upper bound on nres */
2247
2248 for (bpos = ascii->bpos; nres < maxn && bpos < ascii->nc; bpos++)
2249 {
2250 sym = ascii->buf[bpos];
2251 //printf ("nres: %d, bpos: %d (%d)\n", nres, bpos, sym);
2252 if (!isascii(sym)) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": non-ASCII character %c in sequence", ascii->linenumber, sym);
2253 x = sqfp->inmap[sym];
2254
2255 if (x <= 127) nres++;
2256 else if (x == eslDSQ_EOL)
2257 {
2258 if (ascii->curbpl != -1) ascii->curbpl += bpos - lasteol;
2259 if (ascii->currpl != -1) ascii->currpl += nres - nres2;
2260 nres2 += nres - nres2;
2261
2262 if (ascii->rpl != 0 && ascii->prvrpl != -1) { /* need to ignore counts on last line in record, hence cur/prv */
2263 if (ascii->rpl == -1) ascii->rpl = ascii->prvrpl; /* init */
2264 else if (ascii->prvrpl != ascii->rpl) ascii->rpl = 0; /* inval*/
2265 }
2266 if (ascii->bpl != 0 && ascii->prvbpl != -1) {
2267 if (ascii->bpl == -1) ascii->bpl = ascii->prvbpl; /* init */
2268 else if (ascii->prvbpl != ascii->bpl) ascii->bpl = 0; /* inval */
2269 }
2270
2271 ascii->prvbpl = ascii->curbpl;
2272 ascii->prvrpl = ascii->currpl;
2273 ascii->curbpl = 0;
2274 ascii->currpl = 0;
2275 lasteol = bpos;
2276 if (ascii->linenumber != -1) ascii->linenumber++;
2277 }
2278 else if (x == eslDSQ_ILLEGAL) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": illegal character %c", ascii->linenumber, sym);
2279 else if (x == eslDSQ_EOD) { status = eslEOD; break; }
2280 else if (x != eslDSQ_IGNORED) ESL_FAIL(eslEFORMAT, ascii->errbuf, "inmap corruption?");
2281 }
2282
2283 if (ascii->curbpl != -1) ascii->curbpl += bpos - lasteol - 1;
2284 if (ascii->currpl != -1) ascii->currpl += nres - nres2;
2285 if (opt_nres != NULL) *opt_nres = nres;
2286 if (opt_endpos != NULL) *opt_endpos = bpos;
2287 return status;
2288 }
2289
2290 /* addbuf()
2291 * Add <nres> residues from the current buffer <sqfp->buf> to <sq>.
2292 * This is designed to work when we're constructing a complete
2293 * sequence (add the whole buffer); when we're adding a suffix
2294 * of the buffer (<sqfp->bpos> is skipped ahead already);
2295 * or when we're adding a prefix of the buffer (terminating a subseq
2296 * or window load).
2297 *
2298 * The caller must know that there are at least <nres> residues in
2299 * this buffer, and that all the characters are valid in the
2300 * format and alphabet, via a previous call to <seebuf()>.
2301 *
2302 * The caller also must have already allocated <sq> to hold at least
2303 * <nres> more residues.
2304 *
2305 * On input:
2306 * sqfp->buf[] contains an fread() buffer
2307 * sqfp->bpos is set to where we're going to start parsing residues
2308 * sqfp->nc is the length of <buf>
2309 *
2310 * On return:
2311 * sqfp->buf[] still contains the same buffer (no new freads here)
2312 * sqfp->bpos is set after the last residue we parsed
2313 * sq->seq/dsq now holds <nres> new residues
2314 * sq->n is incremented by <nres>
2315 */
2316 static void
addbuf(ESL_SQFILE * sqfp,ESL_SQ * sq,int64_t nres)2317 addbuf(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nres)
2318 {
2319 ESL_DSQ x;
2320 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2321
2322 if (sq->dsq != NULL)
2323 {
2324 while (nres) {
2325 x = sq->abc->inmap[(int) ascii->buf[ascii->bpos++]];
2326 if (x <= 127) { nres--; sq->dsq[++sq->n] = x; }
2327 } /* we skipped IGNORED, EOL. EOD, ILLEGAL don't occur; seebuf() already checked */
2328 }
2329 else
2330 {
2331 while (nres) {
2332 x = sqfp->inmap[(int) ascii->buf[ascii->bpos++]];
2333 if (x <= 127) { nres--; sq->seq[sq->n++] = x; }
2334 }
2335 }
2336 }
2337
2338 /* skipbuf()
2339 * Like addbuf(), but we skip <nskip> residues instead of
2340 * reading them.
2341 */
2342 static void
skipbuf(ESL_SQFILE * sqfp,int64_t nskip)2343 skipbuf(ESL_SQFILE *sqfp, int64_t nskip)
2344 {
2345 ESL_DSQ x;
2346 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2347
2348 while (nskip) {
2349 x = sqfp->inmap[(int) ascii->buf[ascii->bpos++]];
2350 if (x <= 127) nskip--;/* skip IGNORED, EOL. */
2351 }
2352 }
2353
2354
2355 /* skip_whitespace()
2356 * Like skipbuf(), but instead of skipping a fixed number of
2357 * residues, skip forward until one of three conditions is met:
2358 *
2359 * (1) end of the sequence record (a character indicating
2360 * the beginning of a new sequence); set ascii->bpos
2361 * to the beginning of the new record, and return eslEOD;
2362 * (2) a non-whitespace character in the current sequence is
2363 * reached that does not indicate the end of a sequence
2364 * record; set ascii->bpos to that character's position,
2365 * and return eslOK;
2366 * (3) end of file; return eslEOF.
2367 *
2368 */
2369 static int
skip_whitespace(ESL_SQFILE * sqfp)2370 skip_whitespace(ESL_SQFILE *sqfp)
2371 {
2372 int status;
2373 int c;
2374 ESL_DSQ x;
2375 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2376
2377 if (ascii->nc == 0)
2378 return eslEOF;
2379
2380 c = (int) ascii->buf[ascii->bpos];
2381 x = sqfp->inmap[c];
2382
2383 while ( isspace(c) ) {
2384
2385 ascii->bpos++;
2386
2387 if (ascii->bpos == ascii->nc)
2388 if ((status = loadbuf(sqfp)) == eslEOF)
2389 return eslEOF;
2390
2391 c = (int) ascii->buf[ascii->bpos];
2392 x = sqfp->inmap[c];
2393 }
2394 if (x == eslDSQ_EOD)
2395 return eslEOD;
2396
2397 return eslOK;
2398 }
2399
2400
2401
2402 /* read_nres()
2403 * Read the next <nres> residues from <sqfp> after skipping <nskip> residues, then stop.
2404 *
2405 * Returns <eslOK> and <0 < *ret_actual_nres <= nres> if it succeeded, and
2406 * there's more residues in the current seq record.
2407 * Returns <eslEOD> and <*ret_actual_nres == 0> if no more residues are
2408 * seen in the sequence record.
2409 *
2410 * Even on <eslEOD>, the <dsq/seq> is appropriately terminated here,
2411 * and <sq->n> is left the way it was (no new residues added - but there
2412 * may have been saved context C from a previous window).
2413 *
2414 * Returns <eslEFORMAT> on any parsing problem, and <ascii->errbuf> is set.
2415 *
2416 * On <eslOK>, sqfp->bpos is positioned on the next character past the last residue we store;
2417 * on <eslEOD>, sqfp->bpos is positioned for reading the next sequence.
2418 *
2419 * FetchSubseq() uses this with <nskip>, <nres>, and expects an
2420 * <eslOK> with <*opt_actual_nres = nres>. On <EOD>, or if fewer than
2421 * <nres> residues are obtained, the coords must've been screwed up,
2422 * because we didn't read the whole subseq we asked for.
2423 *
2424 * ReadWindow() on forward strand uses this with <nskip=0>, <nres=W>.
2425 * The last window might normally return <eslEOD> with
2426 * <*ret_actual_nres == 0>, and now <sqfp->bpos> is positioned at the
2427 * start of the next sequence on <EOD>, and at the next residue on
2428 * <OK>.
2429 *
2430 * ReadWindow() in reverse complement acts like a subseq fetch.
2431 *
2432 */
2433 static int
read_nres(ESL_SQFILE * sqfp,ESL_SQ * sq,int64_t nskip,int64_t nres,int64_t * opt_actual_nres)2434 read_nres(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nskip, int64_t nres, int64_t *opt_actual_nres)
2435 {
2436 int64_t n;
2437 int64_t epos;
2438 int64_t actual_nres = 0;
2439 int status = eslOK;
2440
2441 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2442 status = seebuf(sqfp, nskip+nres, &n, &epos);
2443 while (status == eslOK && nskip - n > 0) {
2444 nskip -= n;
2445 if ((status = loadbuf(sqfp)) == eslEOF) break;
2446 status = seebuf(sqfp, nskip+nres, &n, &epos);
2447 }
2448
2449 if (status == eslEOF) {
2450 if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF before end of seq record");
2451 if (nskip > 0) ESL_EXCEPTION(eslECORRUPT, "premature EOD while trying to skip residues");
2452 n = 0;
2453 } else if (status == eslEOD) {
2454 if (n < nskip) ESL_EXCEPTION(eslECORRUPT, "premature EOD while trying to skip residues");
2455 } else if (status != eslOK)
2456 return status;
2457
2458 skipbuf(sqfp, nskip);
2459 n -= nskip;
2460
2461 while (status == eslOK && nres - n > 0)
2462 {
2463 addbuf(sqfp, sq, n);
2464 actual_nres += n;
2465 nres -= n;
2466 if ((status = loadbuf(sqfp)) == eslEOF) break;
2467 status = seebuf(sqfp, nres, &n, &epos);
2468 }
2469
2470
2471 if (status == eslEOF) {
2472 if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF before end of seq record");
2473 n = 0;
2474 } else if (status == eslEFORMAT) {
2475 return status;
2476 }
2477
2478 n = ESL_MIN(nres, n);
2479 addbuf(sqfp, sq, n); /* bpos now at last residue + 1 if OK/EOD, 0 if EOF */
2480 actual_nres += n;
2481
2482 if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
2483 else sq->seq[sq->n] = '\0';
2484
2485 if (status == eslEOD) {
2486 ascii->bpos = epos;
2487 }
2488
2489 if (opt_actual_nres != NULL) *opt_actual_nres = actual_nres;
2490 return (actual_nres == 0 ? eslEOD : eslOK);
2491 }
2492 /*--------------- end, buffer-based parsers --------------------*/
2493
2494
2495 /*****************************************************************
2496 *# 7. Internal routines for EMBL format (including UniProt, TrEMBL)
2497 *****************************************************************/
2498 /* EMBL and UniProt protein sequence database format.
2499 * See: http://us.expasy.org/sprot/userman.html
2500 * and: http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3
2501 * We use the same parser for both formats, so we have to be
2502 * careful to only parse the conserved intersection of these two
2503 * very similar formats.
2504 */
2505 static void
config_embl(ESL_SQFILE * sqfp)2506 config_embl(ESL_SQFILE *sqfp)
2507 {
2508 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2509
2510 ascii->is_linebased = TRUE;
2511 ascii->eof_is_ok = FALSE;/* records end with // */
2512 ascii->parse_header = &header_embl;
2513 ascii->skip_header = &skip_embl;
2514 ascii->parse_end = &end_embl;
2515 }
2516
2517 static void
inmap_embl(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)2518 inmap_embl(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
2519 {
2520 int x;
2521
2522 if (abc_inmap != NULL) {
2523 for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
2524 sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format
2525 } else {
2526 for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2527 for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
2528 for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
2529 }
2530 for (x = '0'; x <= '9'; x++)
2531 sqfp->inmap[x] = eslDSQ_IGNORED; /* EMBL DNA sequence format puts coordinates after each line */
2532 sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */
2533 sqfp->inmap[' '] = eslDSQ_IGNORED;
2534 sqfp->inmap['\t'] = eslDSQ_IGNORED;
2535 sqfp->inmap['\n'] = eslDSQ_IGNORED;
2536 sqfp->inmap['\r'] = eslDSQ_IGNORED; /* DOS eol compatibility */
2537 sqfp->inmap['/'] = eslDSQ_EOD;
2538 }
2539
2540 /* header_embl()
2541 *
2542 * See: http://us.expasy.org/sprot/userman.html
2543 * And: http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3
2544 * Our parser must work on the highest common denominator of EMBL DNA
2545 * and UniProt protein sequence files.
2546 *
2547 * sqfp->buf is the first (ID) line of the entry, or a blank line before
2548 * it (in which case we'll scan forwards skipping blank lines to find
2549 * the ID line).
2550 *
2551 * On success, returns <eslOK> and:
2552 * sq->name contains sequence name (and may have been reallocated, changing sq->nalloc)
2553 * sq->acc contains seq accession (and may have been reallocated, changing sq->aalloc)
2554 * sq->desc contains description line (and may have been reallocated, changing sq->dalloc)
2555 * sq->roff has been set to the record offset
2556 * sq->doff has been set to the data offset (start of sequence line)
2557 * sqfp->buf is the first seq line.
2558 *
2559 * If no more seqs are found in the file, returns <eslEOF>.
2560 * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2561 *
2562 * May also throw <eslEMEM> on allocation errors.
2563 */
2564 static int
header_embl(ESL_SQFILE * sqfp,ESL_SQ * sq)2565 header_embl(ESL_SQFILE *sqfp, ESL_SQ *sq)
2566 {
2567 char *s;
2568 char *tok;
2569 int status;
2570
2571 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2572
2573 /* Find first line:
2574 * "Each entry must begin with an identification line (ID)..."
2575 * "The two-character line-type code that begins each line is always
2576 * followed by three blanks..."
2577 */
2578 if (ascii->nc == 0) return eslEOF;
2579 while (esl_str_IsBlank(ascii->buf)) {
2580 if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */
2581 else if (status != eslOK) return status; /* abnormal */
2582 }
2583
2584 /* ID line is defined as:
2585 * ID ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH.
2586 * We're only after the ENTRY_NAME.
2587 * Examples:
2588 * ID SNRPA_DROME STANDARD; PRT; 216 AA.
2589 * ID SNRPA_DROME Reviewed; 216 AA.
2590 * ID X06347; SV 1; linear; mRNA; STD; HUM; 1209 BP.
2591 */
2592 if (strncmp(ascii->buf, "ID ", 5) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find ID line", ascii->linenumber);
2593
2594 s = ascii->buf+5;
2595 if ((status = esl_strtok(&s, " ;", &tok)) != eslOK)
2596 ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse name on ID line", ascii->linenumber);
2597 if ((status = esl_sq_SetName(sq, tok)) != eslOK) return status;
2598 sq->roff = ascii->boff;/* record the offset of the ID line */
2599
2600 /* Look for SQ line; parsing optional info as we go.
2601 */
2602 do {
2603 if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find SQ line", ascii->linenumber);
2604
2605 /* "The format of the AC line is:
2606 * AC AC_number_1;[ AC_number_2;]...[ AC_number_N;]
2607 * Researchers who wish to cite entries in their publications
2608 * should always cite the first accession number. This is
2609 * commonly referred to as the 'primary accession
2610 * number'."
2611 *
2612 * Examples:
2613 * AC P43332; Q9W4D7;
2614 * AC X06347;
2615 *
2616 * Note that Easel only stores primary accessions.
2617 * Because there can be more than one accession line, we check to
2618 * see if the accession is already set before storing a line.
2619 */
2620 if (strncmp(ascii->buf, "AC ", 5) == 0 && sq->acc[0] == '\0')
2621 {
2622 s = ascii->buf+5;
2623 if ((status = esl_strtok(&s, ";", &tok)) != eslOK)
2624 ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse accession on AC line", ascii->linenumber);
2625 if ((status = esl_sq_SetAccession(sq, tok)) != eslOK) return status;
2626 }
2627
2628 /* "The format of the DE line is:
2629 * DE Description.
2630 * ...In cases where more than one DE line is required, the text is
2631 * only divided between words and only the last DE line is
2632 * terminated by a period."
2633 *
2634 * Examples:
2635 * DE U1 small nuclear ribonucleoprotein A (U1 snRNP protein A) (U1-A) (Sex
2636 * DE determination protein snf).
2637 *
2638 * DE Human mRNA for U1 small nuclear RNP-specific A protein
2639 *
2640 * DE RecName: Full=U1 small nuclear ribonucleoprotein A;
2641 * DE Short=U1 snRNP protein A;
2642 * DE Short=U1-A;
2643 * DE AltName: Full=Sex determination protein snf;
2644 *
2645 * We'll make no attempt to parse the structured UniProt description header,
2646 * for the moment.
2647 */
2648 if (strncmp(ascii->buf, "DE ", 5) == 0)
2649 {
2650 s = ascii->buf+5;
2651 esl_strchop(s, ascii->nc-5);
2652 if ((status = esl_sq_AppendDesc(sq, s)) != eslOK)
2653 ESL_FAIL(status, ascii->errbuf, "Line %" PRId64 ": failed to parse description on DE line", ascii->linenumber);
2654 }
2655
2656 /* UniProt: "The format of the SQ line is:
2657 * SQ SEQUENCE XXXX AA; XXXXX MW; XXXXXXXXXXXXXXXX CRC64;"
2658 * EMBL: "The SQ (SeQuence header) line marks the beginning of
2659 * the sequence data and Gives a summary of its content.
2660 * An example is:
2661 * SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;"
2662 *
2663 * We don't parse this line; we just look for it as the last line
2664 * before the sequence starts.
2665 */
2666 } while (strncmp(ascii->buf, "SQ ", 5) != 0);
2667
2668 if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2669 sq->hoff = ascii->boff - 1;
2670 sq->doff = ascii->boff;
2671 return eslOK;
2672 }
2673
2674 /* skip_embl()
2675 *
2676 * Skip past the EMBL header and position to start of the sequence line.
2677 *
2678 * On success, returns <eslOK> and:
2679 * sq->roff has been set to the record offset
2680 * sq->doff has been set to the data offset (start of sequence line)
2681 * sqfp->buf is the first seq line.
2682 *
2683 * If no more seqs are found in the file, returns <eslEOF>.
2684 * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2685 *
2686 * May also throw <eslEMEM> on allocation errors.
2687 */
2688 static int
skip_embl(ESL_SQFILE * sqfp,ESL_SQ * sq)2689 skip_embl(ESL_SQFILE *sqfp, ESL_SQ *sq)
2690 {
2691 int status;
2692
2693 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2694
2695 /* Find first line:
2696 * "Each entry must begin with an identification line (ID)..."
2697 * "The two-character line-type code that begins each line is always
2698 * followed by three blanks..."
2699 */
2700 if (ascii->nc == 0) return eslEOF;
2701 while (esl_str_IsBlank(ascii->buf)) {
2702 if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */
2703 else if (status != eslOK) return status; /* abnormal */
2704 }
2705
2706 /* ID line */
2707 if (strncmp(ascii->buf, "ID ", 5) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find ID line", ascii->linenumber);
2708
2709 sq->roff = ascii->boff;/* record the offset of the ID line */
2710
2711 /* zero out the name, accession and description */
2712 sq->name[0] = '\0';
2713 sq->acc[0] = '\0';
2714 sq->desc[0] = '\0';
2715
2716 /* Look for SQ line; parsing optional info as we go. */
2717 do {
2718 if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find SQ line", ascii->linenumber);
2719 } while (strncmp(ascii->buf, "SQ ", 5) != 0);
2720
2721 if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2722 sq->hoff = ascii->boff - 1;
2723 sq->doff = ascii->boff;
2724 return eslOK;
2725 }
2726
2727 static int
end_embl(ESL_SQFILE * sqfp,ESL_SQ * sq)2728 end_embl(ESL_SQFILE *sqfp, ESL_SQ *sq)
2729 {
2730 int status;
2731
2732 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2733
2734 if (strncmp(ascii->buf, "//", 2) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
2735 sq->eoff = ascii->boff + ascii->nc - 1;
2736 status = loadbuf(sqfp);
2737 if (status == eslEOF) return eslOK; /* ok, actually. */
2738 else if (status == eslOK) return eslOK;
2739 else return status;
2740 }
2741
2742 /*---------------------- EMBL format ---------------------------------*/
2743
2744
2745
2746 /*****************************************************************
2747 *# 8. Internal routines for GenBank format
2748 *****************************************************************/
2749 /* NCBI GenBank sequence database format.
2750 * See GenBank release notes; for example,
2751 * ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt
2752 */
2753
2754 static void
config_genbank(ESL_SQFILE * sqfp)2755 config_genbank(ESL_SQFILE *sqfp)
2756 {
2757 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2758
2759 ascii->is_linebased = TRUE;
2760 ascii->eof_is_ok = FALSE;/* records end with // */
2761 ascii->parse_header = &header_genbank;
2762 ascii->skip_header = &skip_genbank;
2763 ascii->parse_end = &end_genbank;
2764 }
2765
2766 static void
inmap_genbank(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)2767 inmap_genbank(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
2768 {
2769 int x;
2770
2771 if (abc_inmap != NULL) {
2772 for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
2773 sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format
2774 } else {
2775 for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2776 for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
2777 for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
2778 }
2779 for (x = '0'; x <= '9'; x++)
2780 sqfp->inmap[x] = eslDSQ_IGNORED;
2781 sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */
2782 sqfp->inmap[' '] = eslDSQ_IGNORED;
2783 sqfp->inmap['\t'] = eslDSQ_IGNORED;
2784 sqfp->inmap['\n'] = eslDSQ_IGNORED;
2785 sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */
2786 sqfp->inmap['/'] = eslDSQ_EOD;
2787 }
2788
2789 /* header_genbank()
2790 *
2791 * sqfp->buf is the first (LOCUS) line of the entry, or a line before
2792 * it (in which case we'll scan forwards to find the LOCUS line - even
2793 * skipping non-blank lines, because there are sometimes headers at
2794 * the start of GenBank files).
2795 *
2796 * On success, returns <eslOK> and:
2797 * sq->name contains sequence name (and may have been reallocated, changing sq->nalloc)
2798 * sq->acc contains seq accession (and may have been reallocated, changing sq->aalloc)
2799 * sq->desc contains description line (and may have been reallocated, changing sq->dalloc)
2800 * sq->roff has been set to the record offset
2801 * sq->doff has been set to the data offset (start of sequence line)
2802 * sqfp->buf is the first seq line.
2803 *
2804 * If no more seqs are found in the file, returns <eslEOF>.
2805 * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2806 *
2807 * May also throw <eslEMEM> on allocation errors.
2808 */
2809 static int
header_genbank(ESL_SQFILE * sqfp,ESL_SQ * sq)2810 header_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq)
2811 {
2812 char *s;
2813 char *tok;
2814 int status;
2815
2816 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2817
2818 /* Find LOCUS line, allowing for ignoration of a file header. */
2819 if (ascii->nc == 0) return eslEOF;
2820 while (strncmp(ascii->buf, "LOCUS ", 8) != 0) {
2821 if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */
2822 else if (status != eslOK) return status; /* abnormal */
2823 }
2824
2825 s = ascii->buf+12;
2826 if ((status = esl_strtok(&s, " ", &tok)) != eslOK)
2827 ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse name on LOCUS line", ascii->linenumber);
2828 if ((status = esl_sq_SetName(sq, tok)) != eslOK) return status;
2829 sq->roff = ascii->boff;/* record the disk offset to the LOCUS line */
2830
2831 /* Look for ORIGIN line, parsing optional info as we go. */
2832 do {
2833 if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find ORIGIN line");
2834
2835 /* Optional VERSION line is parsed as "accession". */
2836 if (strncmp(ascii->buf, "VERSION ", 10) == 0)
2837 {
2838 s = ascii->buf+12;
2839 if ((status = esl_strtok(&s, " \t\n", &tok)) != eslOK)
2840 ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse VERSION line", ascii->linenumber);
2841 if ((status = esl_sq_SetAccession(sq, tok)) != eslOK) return status;
2842 }
2843
2844 /* Optional DEFINITION Line is parsed as "description". */
2845 if (strncmp(ascii->buf, "DEFINITION ", 11) == 0)
2846 {
2847 s = ascii->buf+12;
2848 esl_strchop(s, ascii->nc-12);
2849 if ((status = esl_sq_AppendDesc(sq, s)) != eslOK)
2850 ESL_FAIL(status, ascii->errbuf, "Line %" PRId64 ": failed to parse desc on DEFINITION line", ascii->linenumber);
2851 }
2852 } while (strncmp(ascii->buf, "ORIGIN", 6) != 0);
2853
2854 if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2855 sq->hoff = ascii->boff - 1;
2856 sq->doff = ascii->boff;
2857 return eslOK;
2858 }
2859
2860 /* skip_genbank()
2861 *
2862 * Skip past the GenBank header and position to start of the sequence line.
2863 *
2864 * On success, returns <eslOK> and:
2865 * sq->roff has been set to the record offset
2866 * sq->doff has been set to the data offset (start of sequence line)
2867 * sqfp->buf is the first seq line.
2868 *
2869 * If no more seqs are found in the file, returns <eslEOF>.
2870 * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2871 *
2872 * May also throw <eslEMEM> on allocation errors.
2873 */
2874 static int
skip_genbank(ESL_SQFILE * sqfp,ESL_SQ * sq)2875 skip_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq)
2876 {
2877 int status;
2878
2879 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2880
2881 /* Find LOCUS line, allowing for ignoration of a file header. */
2882 if (ascii->nc == 0) return eslEOF;
2883 while (strncmp(ascii->buf, "LOCUS ", 8) != 0) {
2884 if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */
2885 else if (status != eslOK) return status; /* abnormal */
2886 }
2887
2888 sq->roff = ascii->boff;/* record the disk offset to the LOCUS line */
2889
2890 /* zero out the name, accession and description */
2891 sq->name[0] = '\0';
2892 sq->acc[0] = '\0';
2893 sq->desc[0] = '\0';
2894
2895 /* Look for ORIGIN line, parsing optional info as we go. */
2896 do {
2897 if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find ORIGIN line");
2898 } while (strncmp(ascii->buf, "ORIGIN", 6) != 0);
2899
2900 if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2901 sq->hoff = ascii->boff - 1;
2902 sq->doff = ascii->boff;
2903 return eslOK;
2904 }
2905
2906 static int
end_genbank(ESL_SQFILE * sqfp,ESL_SQ * sq)2907 end_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq)
2908 {
2909 int status;
2910
2911 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2912
2913 if (strncmp(ascii->buf, "//", 2) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
2914 sq->eoff = ascii->boff + ascii->nc - 1;
2915 status = loadbuf(sqfp);
2916 if (status == eslEOF) return eslOK; /* ok, actually; we'll detect EOF on next sq read */
2917 else if (status == eslOK) return eslOK;
2918 else return status;
2919 }
2920 /*----------------- end GenBank format -------------------------------*/
2921
2922
2923
2924 /*****************************************************************
2925 *# 9. Internal routines for FASTA format
2926 *****************************************************************/
2927
2928 static void
config_fasta(ESL_SQFILE * sqfp)2929 config_fasta(ESL_SQFILE *sqfp)
2930 {
2931 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2932
2933 ascii->is_linebased = FALSE;
2934 ascii->eof_is_ok = TRUE;
2935 ascii->parse_header = &header_fasta;
2936 ascii->skip_header = &skip_fasta;
2937 ascii->parse_end = &end_fasta;
2938 }
2939
2940 static void
inmap_fasta(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)2941 inmap_fasta(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
2942 {
2943 int x;
2944
2945 if (abc_inmap != NULL) {
2946 for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
2947 sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format
2948 } else {
2949 for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2950 for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
2951 for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
2952 }
2953 sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */
2954 sqfp->inmap[' '] = eslDSQ_IGNORED;
2955 sqfp->inmap['\t'] = eslDSQ_IGNORED;
2956 sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */
2957 sqfp->inmap['\n'] = eslDSQ_EOL;
2958 sqfp->inmap['>'] = eslDSQ_EOD;
2959 /* \n is special - fasta reader detects it as an eol */
2960 }
2961
2962
2963 /* header_fasta()
2964 *
2965 * sqfp->buf[sqfp->bpos] is sitting at the start of a FASTA record, or
2966 * at a space before it (in which case we'll advance, skipping whitespace,
2967 * until a > is reached).
2968 * Parse the header line, storing name and description in <sq>.
2969 *
2970 * On success, returns <eslOK> and:
2971 * sq->name contains sequence name (and may have been reallocated, changing sq->nalloc)
2972 * sq->desc contains description line (and may have been reallocated, changing sq->dalloc)
2973 * sq->roff has been set to the record offset
2974 * sq->doff has been set to the data offset (start of sequence line)
2975 * sqfp->buf[sqfp->bpos] is sitting at the start of the seq line.
2976 * sqfp->currpl,curbpl set to 0, to start bookkeeping data line lengths
2977 *
2978 * If no more seqs are found in the file, returns <eslEOF>.
2979 * On parse failure, return <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2980 *
2981 * May also throw <eslEMEM> on allocation errors.
2982 */
2983 static int
header_fasta(ESL_SQFILE * sqfp,ESL_SQ * sq)2984 header_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq)
2985 {
2986 char c;
2987 int status = eslOK;
2988 void *tmp;
2989 int pos;
2990
2991 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2992
2993 /* make sure there are characters in the buffer */
2994 if (ascii->nc == ascii->bpos && (status = loadbuf(sqfp)) != eslOK) return status;
2995
2996 c = ascii->buf[ascii->bpos];
2997 while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n) */
2998
2999 if (status == eslEOF) return eslEOF;
3000
3001 if (status == eslOK && c == '>') { /* accept the > */
3002 sq->roff = ascii->boff + ascii->bpos; /* store SSI record offset */
3003 status = nextchar(sqfp, &c);
3004 } else if (c != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": unexpected char %c; expected FASTA to start with >", ascii->linenumber, c);
3005
3006 while (status == eslOK && (c == '\t' || c == ' ')) status = nextchar(sqfp, &c); /* skip space */
3007
3008 /* Store the name (space delimited) */
3009 pos = 0;
3010 while (status == eslOK && ! isspace(c))
3011 {
3012 sq->name[pos++] = c;
3013 if (pos == sq->nalloc-1) { ESL_RALLOC(sq->name, tmp, sq->nalloc*2); sq->nalloc*=2; }
3014 status = nextchar(sqfp, &c);
3015 }
3016 if (pos == 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": no FASTA name found", ascii->linenumber);
3017 sq->name[pos] = '\0';
3018
3019 while (status == eslOK && (c == '\t' || c == ' ')) status = nextchar(sqfp, &c); /* skip space */
3020
3021 /* Store the description (end-of-line delimited) */
3022 /* Patched to deal with NCBI NR desclines: delimit by ctrl-A (0x01) too. [SRE:H1/82] */
3023 pos = 0;
3024 while (status == eslOK && c != '\n' && c != '\r' && c != 1)
3025 {
3026 sq->desc[pos++] = c;
3027 if (pos == sq->dalloc-1) { ESL_RALLOC(sq->desc, tmp, sq->dalloc*2); sq->dalloc*= 2; }
3028 status = nextchar(sqfp, &c);
3029 }
3030 sq->desc[pos] = '\0';
3031
3032 /* Because of the NCBI NR patch, c might be0x01 ctrl-A now; skip to eol.
3033 * (TODO: I'm worried about the efficiency of this nextchar() stuff. Revisit.)
3034 */
3035 while (status == eslOK && c != '\n' && c != '\r')
3036 status = nextchar(sqfp, &c);
3037 sq->hoff = ascii->boff + ascii->bpos;
3038
3039 while (status == eslOK && (c == '\n' || c == '\r')) status = nextchar(sqfp, &c); /* skip past eol (DOS \r\n, MAC \r, UNIX \n */
3040 if (status != eslOK && status != eslEOF) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected failure in parsing FASTA name/description line");
3041 /* Edge case: if the last sequence in the file is L=0, no residues, we are EOF now, not OK; but we'll return OK because we parsed the header line */
3042
3043 sq->doff = ascii->boff + ascii->bpos;
3044 ascii->prvrpl = ascii->prvbpl = -1;
3045 ascii->currpl = ascii->curbpl = 0;
3046 ascii->linenumber++;
3047 return eslOK;
3048
3049 ERROR:
3050 return status;/* eslEMEM, from failed realloc */
3051 }
3052
3053 /* skip_fasta()
3054 *
3055 * Skip past the fasta header and position to start of the sequence line.
3056 *
3057 * On success, returns <eslOK> and:
3058 * sq->roff has been set to the record offset
3059 * sq->doff has been set to the data offset (start of sequence line)
3060 * sqfp->buf[sqfp->bpos] is sitting at the start of the seq line.
3061 * sqfp->currpl,curbpl set to 0, to start bookkeeping data line lengths
3062 *
3063 * If no more seqs are found in the file, returns <eslEOF>.
3064 * On parse failure, return <eslEFORMAT>, leaves as mesg in ascii->errbuf.
3065 *
3066 * May also throw <eslEMEM> on allocation errors.
3067 */
3068 static int
skip_fasta(ESL_SQFILE * sqfp,ESL_SQ * sq)3069 skip_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq)
3070 {
3071 char c;
3072 int status = eslOK;
3073
3074 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3075
3076 c = ascii->buf[ascii->bpos];
3077 while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n) */
3078
3079 if (status == eslEOF) return eslEOF;
3080 if (status != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected parsing error %d", status);
3081 if (c != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": unexpected char %c; expecting '>'", ascii->linenumber, c);
3082
3083 sq->roff = ascii->boff + ascii->bpos; /* store SSI record offset */
3084
3085 /* zero out the name, accession and description */
3086 sq->name[0] = '\0';
3087 sq->acc[0] = '\0';
3088 sq->desc[0] = '\0';
3089
3090 status = nextchar(sqfp, &c);
3091
3092 /* skip to end of line */
3093 while (status == eslOK && c != '\n' && c != '\r') status = nextchar(sqfp, &c);
3094 sq->doff = ascii->boff + ascii->bpos;
3095
3096 /* skip past end of line */
3097 while (status == eslOK && (c == '\n' || c == '\r')) status = nextchar(sqfp, &c);
3098
3099 if (status != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF in parsing FASTA name/description line");
3100 sq->doff = ascii->boff + ascii->bpos;
3101
3102 ascii->linenumber++;
3103 return eslOK;
3104 }
3105
3106
3107 static int
end_fasta(ESL_SQFILE * sqfp,ESL_SQ * sq)3108 end_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq)
3109 {
3110 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3111
3112 if (ascii->bpos < ascii->nc) {
3113 if (ascii->buf[ascii->bpos] != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Whoops, FASTA reader is corrupted");
3114 sq->eoff = ascii->boff + ascii->bpos - 1; /* this puts eoff at the last \n */
3115 } /* else, EOF, and we don't have to do anything. */
3116 return eslOK;
3117 }
3118
3119
3120 /* Function: esl_sqascii_WriteFasta()
3121 * Synopsis: Write a sequence in FASTA foramt
3122 *
3123 * Purpose: Write sequence <sq> in FASTA format to the open stream <fp>.
3124 *
3125 * If <save_offsets> is TRUE, then store record, data, and end
3126 * offsets in <sq>; this ability is used by unit tests.
3127 *
3128 * Returns: <eslOK> on success.
3129 *
3130 * Throws: <eslEMEM> on allocation failure.
3131 * <eslEWRITE> on system write error.
3132 */
3133 int
esl_sqascii_WriteFasta(FILE * fp,ESL_SQ * sq,int save_offsets)3134 esl_sqascii_WriteFasta(FILE *fp, ESL_SQ *sq, int save_offsets)
3135 {
3136 char buf[61];
3137 int64_t pos;
3138
3139 if (save_offsets) sq->roff = ftello(fp);
3140 if (fprintf(fp, ">%s", sq->name) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3141 if (sq->acc[0] != 0 && fprintf(fp, " %s", sq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3142 if (sq->desc[0] != 0 && fprintf(fp, " %s", sq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3143 if (save_offsets) sq->hoff = ftello(fp);
3144 if (fputc('\n', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3145
3146 buf[60] = '\0';
3147 if (save_offsets) sq->doff = ftello(fp);
3148 for (pos = 0; pos < sq->n; pos += 60)
3149 {
3150 if (sq->dsq != NULL) esl_abc_TextizeN(sq->abc, sq->dsq+pos+1, 60, buf);
3151 else strncpy(buf, sq->seq+pos, 60);
3152 if (fprintf(fp, "%s\n", buf) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3153 }
3154 if (save_offsets) sq->eoff = ftello(fp) - 1;
3155 return eslOK;
3156 }
3157 /*------------------- end of FASTA i/o ---------------------------*/
3158
3159 /*****************************************************************
3160 *# 10. Internal routines for daemon format
3161 *****************************************************************/
3162
3163 /* Special case FASTA format where each sequence is terminated with "//".
3164 *
3165 * The use case is where the sequences are being read from a pipe and a
3166 * way is needed to signal the end of the sequence so it can be processed.
3167 * The next sequence might not be in the pipe, so the usual '>' is not
3168 * present to signal the end of the sequence. Also, an EOF is not
3169 * an option, since the daemon might run continuously.
3170 */
3171
3172 static void
config_daemon(ESL_SQFILE * sqfp)3173 config_daemon(ESL_SQFILE *sqfp)
3174 {
3175 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3176
3177 ascii->is_linebased = FALSE;
3178 ascii->eof_is_ok = FALSE;
3179 ascii->parse_header = &header_fasta;
3180 ascii->skip_header = &skip_fasta;
3181 ascii->parse_end = &end_daemon;
3182 }
3183
3184 static void
inmap_daemon(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)3185 inmap_daemon(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
3186 {
3187 int x;
3188
3189 if (abc_inmap != NULL) {
3190 for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
3191 sqfp->inmap['-'] = eslDSQ_ILLEGAL; // don't let the abc_inmap map the gap char; this is an ungapped file format
3192 } else {
3193 for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
3194 for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
3195 for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
3196 }
3197 sqfp->inmap['*'] = '*'; /* accept * as a nonresidue/stop codon character */
3198 sqfp->inmap[' '] = eslDSQ_IGNORED;
3199 sqfp->inmap['\t'] = eslDSQ_IGNORED;
3200 sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */
3201 sqfp->inmap['\n'] = eslDSQ_EOL;
3202 sqfp->inmap['/'] = eslDSQ_EOD;
3203 /* \n is special - fasta reader detects it as an eol */
3204 }
3205
3206
3207 /* end_daemon()
3208 *
3209 * Special case FASTA format where each sequence is terminated with "//".
3210 *
3211 * The use case is were the sequences are being read from a pipe and a
3212 * way is needed to signal the end of the sequence so it can be processed.
3213 */
3214 static int
end_daemon(ESL_SQFILE * sqfp,ESL_SQ * sq)3215 end_daemon(ESL_SQFILE *sqfp, ESL_SQ *sq)
3216 {
3217 char c;
3218
3219 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3220
3221 if (ascii->nc < 3) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Whoops, daemon input stream is corrupted");
3222
3223 c = ascii->buf[ascii->bpos++];
3224 if (c != '/') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
3225
3226 c = ascii->buf[ascii->bpos++];
3227 if (c != '/') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
3228
3229 /* skip to end of line */
3230 while (c != '\n' && c != '\r' && ascii->bpos < ascii->nc) c = ascii->buf[ascii->bpos++];
3231
3232 /* skip past end of line */
3233 while ((c == '\n' || c == '\r') && ascii->bpos < ascii->nc) c = ascii->buf[ascii->bpos++];
3234
3235 return eslOK;
3236 }
3237
3238
3239 /* esl_sqascii_Parse()
3240 *
3241 * Parse a sequence already read into a buffer.
3242 */
3243 int
esl_sqascii_Parse(char * buf,int size,ESL_SQ * sq,int format)3244 esl_sqascii_Parse(char *buf, int size, ESL_SQ *sq, int format)
3245 {
3246 int status;
3247 int64_t epos;
3248 int64_t n;
3249
3250 ESL_SQFILE sqfp;
3251 ESL_SQASCII_DATA *ascii = &sqfp.data.ascii;
3252
3253 /* fill in a dummy esl_sqfile structure used to parse buf */
3254 ascii->fp = NULL;
3255 ascii->do_gzip = FALSE;
3256 ascii->do_stdin = FALSE;
3257 ascii->do_buffer = TRUE;
3258
3259 ascii->mem = buf;
3260 ascii->allocm = 0;
3261 ascii->mn = size;
3262 ascii->mpos = 0;
3263 ascii->moff = -1;
3264 ascii->is_recording = FALSE;
3265
3266 ascii->buf = NULL;
3267 ascii->boff = 0;
3268 ascii->balloc = 0;
3269 ascii->nc = 0;
3270 ascii->bpos = 0;
3271 ascii->L = 0;
3272 ascii->linenumber = 1;
3273
3274 ascii->afp = NULL;
3275 ascii->msa = NULL;
3276 ascii->idx = -1;
3277
3278 ascii->ssifile = NULL;
3279 ascii->rpl = -1;/* -1 = not set yet */
3280 ascii->bpl = -1;/* (ditto) */
3281 ascii->prvrpl = -1;/* (ditto) */
3282 ascii->prvbpl = -1;/* (ditto) */
3283 ascii->currpl = -1;
3284 ascii->curbpl = -1;
3285 ascii->ssi = NULL;
3286
3287 /* Configure the <sqfp>'s parser and inmaps for this format. */
3288 switch (format) {
3289 case eslSQFILE_EMBL:
3290 case eslSQFILE_UNIPROT:
3291 config_embl(&sqfp);
3292 inmap_embl(&sqfp, NULL);
3293 break;
3294 case eslSQFILE_GENBANK:
3295 case eslSQFILE_DDBJ:
3296 config_genbank(&sqfp);
3297 inmap_genbank(&sqfp, NULL);
3298 break;
3299 case eslSQFILE_FASTA:
3300 config_fasta(&sqfp);
3301 inmap_fasta(&sqfp, NULL);
3302 break;
3303 case eslSQFILE_DAEMON:
3304 config_daemon(&sqfp);
3305 inmap_daemon(&sqfp, NULL);
3306 break;
3307 default:
3308 return eslEFORMAT;
3309 }
3310
3311 /* Main case: read next seq from sqfp's stream */
3312 if ((status = ascii->parse_header(&sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
3313
3314 do {
3315 if ((status = seebuf(&sqfp, -1, &n, &epos)) == eslEFORMAT) return status;
3316 if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM;
3317 addbuf(&sqfp, sq, n);
3318 ascii->L += n;
3319 sq->eoff = ascii->boff + epos - 1;
3320 if (status == eslEOD) break;
3321 } while ((status = loadbuf(&sqfp)) == eslOK);
3322
3323 if (status == eslEOF)
3324 {
3325 if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
3326 if ((status = ascii->parse_end(&sqfp, sq)) != eslOK) return status;
3327 }
3328 else if (status == eslEOD)
3329 {
3330 ascii->bpos = epos;
3331 if ((status = ascii->parse_end(&sqfp, sq)) != eslOK) return status;
3332 }
3333 else if (status != eslOK) return status;
3334
3335 if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
3336 else sq->seq[sq->n] = '\0';
3337 sq->start = 1;
3338 sq->end = sq->n;
3339 sq->C = 0;
3340 sq->W = sq->n;
3341 sq->L = sq->n;
3342
3343 if (ascii->balloc > 0) free(ascii->buf);
3344
3345 return eslOK;
3346 }
3347 /*-------------------- end of daemon ----------------------------*/
3348
3349 /*****************************************************************
3350 *# 11. Internal routines for HMMPGMD format
3351 *****************************************************************/
3352
3353 static int
fileheader_hmmpgmd(ESL_SQFILE * sqfp)3354 fileheader_hmmpgmd(ESL_SQFILE *sqfp)
3355 {
3356 ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3357 char c;
3358 int status = eslOK;
3359
3360 /* We've just loaded first buffer, after an Open. First char should be the # of the hmmpgmd file,
3361 * but let's tolerate leading whitespace anyway
3362 */
3363 c = ascii->buf[ascii->bpos];
3364 while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n, \r) */
3365 if (status == eslEOF) return eslEOF;
3366
3367 if (c != '#') ESL_FAIL(eslEFORMAT, ascii->errbuf, "hmmpgmd file expected to start with #");
3368
3369 /* skip first line; remainder of file is FASTA format */
3370 while (status == eslOK && (c != '\n' && c != '\r')) status = nextchar(sqfp, &c);
3371 if (status == eslEOF) return eslEOF;
3372
3373 /* next character read should be the '>' of the first FASTA record. We're properly positioned at "start of file". */
3374 return eslOK;
3375 }
3376 /*-------------------- end of HMMPGMD ---------------------------*/
3377
3378
3379