1 /* Unaligned ascii sequence file i/o.
2  *
3  * Contents:
4  *    1. An <ESL_SQFILE> object, in text mode.
5  *    2. An <ESL_SQFILE> object, in digital mode.
6  *    3. Miscellaneous routines.
7  *    4. Sequence reading (sequential).
8  *    5. Sequence/subsequence fetching, random access
9  *    6. Internal routines shared by parsers.
10  *    7. Internal routines for EMBL format (including UniProt, TrEMBL)
11  *    8. Internal routines for GenBank format
12  *    9. Internal routines for FASTA format
13  *   10. Internal routines for daemon format
14  *   11. Internal routines for HMMPGMD format
15  *
16  * This module shares remote evolutionary homology with Don Gilbert's
17  * seminal, public domain ReadSeq package, though the last common
18  * ancestor was circa 1991 and no recognizable vestiges are likely to
19  * remain. Thanks Don!
20  *
21  */
22 #include "esl_config.h"
23 
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <ctype.h>
28 #include <unistd.h>
29 
30 #include "easel.h"
31 #include "esl_alphabet.h"
32 #include "esl_msa.h"
33 #include "esl_msafile.h"
34 #include "esl_sqio.h"
35 #include "esl_sq.h"
36 #include "esl_ssi.h"
37 
38 /* format specific routines */
39 static int   sqascii_GuessFileFormat(ESL_SQFILE *sqfp, int *ret_fmt);
40 static int   sqascii_Position       (ESL_SQFILE *sqfp, off_t offset);
41 static void  sqascii_Close          (ESL_SQFILE *sqfp);
42 static int   sqascii_SetDigital     (ESL_SQFILE *sqfp, const ESL_ALPHABET *abc);
43 static int   sqascii_GuessAlphabet  (ESL_SQFILE *sqfp, int *ret_type);
44 static int   sqascii_Read           (ESL_SQFILE *sqfp, ESL_SQ *sq);
45 static int   sqascii_ReadInfo       (ESL_SQFILE *sqfp, ESL_SQ *sq);
46 static int   sqascii_ReadSequence   (ESL_SQFILE *sqfp, ESL_SQ *sq);
47 static int   sqascii_ReadWindow     (ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq);
48 static int   sqascii_ReadBlock      (ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target);
49 static int   sqascii_Echo           (ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp);
50 
51 static int   sqascii_IsRewindable   (const ESL_SQFILE *sqfp);
52 static const char *sqascii_GetError (const ESL_SQFILE *sqfp);
53 
54 static int   sqascii_OpenSSI         (ESL_SQFILE *sqfp, const char *ssifile_hint);
55 static int   sqascii_PositionByKey   (ESL_SQFILE *sqfp, const char *key);
56 static int   sqascii_PositionByNumber(ESL_SQFILE *sqfp, int which);
57 static int   sqascii_Fetch           (ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq);
58 static int   sqascii_FetchInfo       (ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq);
59 static int   sqascii_FetchSubseq     (ESL_SQFILE *sqfp, const char *source, int64_t start, int64_t end, ESL_SQ *sq);
60 
61 /* Internal routines shared by parsers. */
62 static int  loadmem  (ESL_SQFILE *sqfp);
63 static int  loadbuf  (ESL_SQFILE *sqfp);
64 static int  nextchar (ESL_SQFILE *sqfp, char *ret_c);
65 static int  seebuf   (ESL_SQFILE *sqfp, int64_t maxn, int64_t *opt_nres, int64_t *opt_endpos);
66 static void addbuf   (ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nres);
67 static void skipbuf  (ESL_SQFILE *sqfp, int64_t nskip);
68 static int  read_nres(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nskip, int64_t nres, int64_t *opt_actual_nres);
69 static int  skip_whitespace(ESL_SQFILE *sqfp);
70 
71 /* EMBL format; also UniProt, TrEMBL */
72 static void config_embl(ESL_SQFILE *sqfp);
73 static void inmap_embl (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
74 static int  header_embl(ESL_SQFILE *sqfp, ESL_SQ *sq);
75 static int  skip_embl  (ESL_SQFILE *sqfp, ESL_SQ *sq);
76 static int  end_embl   (ESL_SQFILE *sqfp, ESL_SQ *sq);
77 
78 /* GenBank format; also DDBJ */
79 static void config_genbank(ESL_SQFILE *sqfp);
80 static void inmap_genbank (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
81 static int  header_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq);
82 static int  skip_genbank  (ESL_SQFILE *sqfp, ESL_SQ *sq);
83 static int  end_genbank   (ESL_SQFILE *sqfp, ESL_SQ *sq);
84 
85 /* FASTA format */
86 static void config_fasta(ESL_SQFILE *sqfp);
87 static void inmap_fasta (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
88 static int  header_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq);
89 static int  skip_fasta  (ESL_SQFILE *sqfp, ESL_SQ *sq);
90 static int  end_fasta   (ESL_SQFILE *sqfp, ESL_SQ *sq);
91 
92 /* daemon format */
93 static void config_daemon(ESL_SQFILE *sqfp);
94 static void inmap_daemon (ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap);
95 static int  end_daemon   (ESL_SQFILE *sqfp, ESL_SQ *sq);
96 
97 /* HMMPGMD format */
98 static int  fileheader_hmmpgmd(ESL_SQFILE *sqfp);
99 
100 
101 /*****************************************************************
102  *# 1. An <ESL_SQFILE> object, in text mode.
103  *****************************************************************/
104 
105 /* Function:  esl_sqascii_Open()
106  * Synopsis:  Open a sequence file for reading.
107  *
108  * Purpose:   Open a sequence file <filename> for reading.
109  *            The opened <ESL_SQFILE> is returned through <ret_sqfp>.
110  *
111  *            The format of the file is asserted to be <format> (for
112  *            example, <eslSQFILE_FASTA>). If <format> is
113  *            <eslSQFILE_UNKNOWN> then the routine attempts to
114  *            autodetect the file format.
115  *
116  *            There are two special cases for <filename>. If
117  *            <filename> is "-", the sequence data are read from a
118  *            <STDIN> pipe. If <filename> ends in ".gz", the file is
119  *            assumed to be compressed with <gzip>, and it is opened
120  *            by a pipe from <gzip -dc>. Reading gzip files only works
121  *            on POSIX-compliant systems that have pipes
122  *            (specifically, the POSIX.2 popen() call).
123  *
124  * Returns:   <eslOK> on success, and <*ret_sqfp> points to a new
125  *            open <ESL_SQFILE>. Caller deallocates this object with
126  *            <esl_sqfile_Close()>.
127  *
128  *            Returns <eslENOTFOUND> if <filename> can't be found or
129  *            opened.  Returns <eslEFORMAT> if the file is empty, or
130  *            if autodetection is attempted and the format can't be
131  *            determined.  On any error condition, <*ret_sqfp> is
132  *            returned NULL.
133  *
134  * Throws:    <eslEMEM> on allocation failure.
135  */
136 int
esl_sqascii_Open(char * filename,int format,ESL_SQFILE * sqfp)137 esl_sqascii_Open(char *filename, int format, ESL_SQFILE *sqfp)
138 {
139   int         status;/* return status from an ESL call */
140   int         n;
141 
142   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
143 
144   /* before we go any further, make sure we can handle the format */
145   if (format == eslSQFILE_NCBI) return eslENOTFOUND;
146 
147   /* Default initializations */
148   ascii->fp           = NULL;
149   ascii->do_gzip      = FALSE;
150   ascii->do_stdin     = FALSE;
151   ascii->do_buffer    = FALSE;
152 
153   ascii->mem          = NULL;
154   ascii->allocm       = 0;
155   ascii->mn           = 0;
156   ascii->mpos         = 0;
157   ascii->moff         = -1;
158   ascii->is_recording = FALSE;
159 
160   ascii->buf          = NULL;
161   ascii->boff         = 0;
162   ascii->balloc       = 0;
163   ascii->nc           = 0;
164   ascii->bpos         = 0;
165   ascii->L            = 0;
166   ascii->linenumber   = 1;
167 
168   ascii->bookmark_offset  = 0;
169   ascii->bookmark_linenum = 0;
170 
171   ascii->is_linebased = FALSE;
172   ascii->eof_is_ok    = FALSE;
173   ascii->parse_header = NULL;
174   ascii->skip_header  = NULL;
175   ascii->parse_end    = NULL;
176 
177   ascii->afp        = NULL;
178   ascii->msa        = NULL;
179   ascii->idx        = -1;
180 
181   ascii->ssifile    = NULL;
182   ascii->rpl        = -1; /* -1 = not set yet */
183   ascii->bpl        = -1; /* (ditto) */
184   ascii->prvrpl     = -1; /* (ditto) */
185   ascii->prvbpl     = -1; /* (ditto) */
186   ascii->currpl     = -1;
187   ascii->curbpl     = -1;
188   ascii->ssi        = NULL;
189 
190   /* MSA formats are handled entirely by msafile module -
191    * let it  handle stdin, .gz, etc
192    */
193   if (! esl_sqio_IsAlignment(format))
194   {
195     if (strcmp(filename, "-") == 0) /* stdin special case */
196     {
197       ascii->fp       = stdin;
198       ascii->do_stdin = TRUE;
199     }
200     else
201     { /* Check the current working directory first. */
202       if ((ascii->fp = fopen(filename, "r")) == NULL) {
203         status = eslENOTFOUND;
204         goto ERROR;
205       }
206     }
207 
208       /* Deal with the .gz special case: to popen(), "success" only means
209        * it found and executed gzip -dc.  If gzip -dc doesn't find our
210        * file, popen() still blithely returns success, so we have to be
211        * sure the file exists. That's why we fopen()'ed it above, only to
212        * close it and popen() it here.
213        */
214 #ifdef HAVE_POPEN
215       n = strlen(filename);
216       if (n > 3 && strcmp(filename+n-3, ".gz") == 0)
217       {
218         char *cmd;
219         fclose(ascii->fp);
220         ESL_ALLOC(cmd, sizeof(char) * (n+1+strlen("gzip -dc ")));
221         sprintf(cmd, "gzip -dc %s", filename);
222         ascii->fp = popen(cmd, "r");
223         if (ascii->fp == NULL) { status = eslENOTFOUND; goto ERROR; }
224         ascii->do_gzip  = TRUE;
225         free(cmd);
226       }
227 #endif /*HAVE_POPEN*/
228 
229       /* If we don't know the format yet, try to autodetect it now. */
230       if (format == eslSQFILE_UNKNOWN)
231       {
232          status = sqascii_GuessFileFormat(sqfp, &format);
233          if      (status == eslOK)      sqfp->format = format;
234          else if (status != eslEFORMAT) goto ERROR; /* <format> might still be eslSQFILE_UNKNOWN, for MSA files  */
235       }
236 
237       /* If the format is still unknown, it may be an MSA file.  The
238        * msafile module is capable of autodetecting format even in a .gz
239        * or stdin pipe, but the stuff above has already read from these
240        * nonrewindable sources, trying to guess an unaligned format.  We
241        * could open a second .gz pipe, but that's ugly; and in any case,
242        * we can't rewind stdin. Eventually, this will get resolved, by
243        * having sqio open an ESL_BUFFER, then doing an
244        * esl_msafile_OpenBuffer() if we need to hand control to the
245        * msafile module. For now, sqio is already documented to be
246        * unable to autodetect MSA file formats in stdin or .gz pipes,
247        * so leave it that way.
248        */
249       if (format == eslSQFILE_UNKNOWN && (ascii->do_gzip || ascii->do_stdin))
250       { status = eslEFORMAT; goto ERROR; }
251     }
252 
253   /* If format is definitely an MSA, open it through the msafile interface.
254    * Or, if format is still unknown, try to open the file as an MSA file,
255    * using msafile autodetection.
256    */
257   if (format == eslSQFILE_UNKNOWN || esl_sqio_IsAlignment(format))
258     {
259       status = esl_msafile_Open(NULL, filename, NULL, format, NULL, &(ascii->afp));
260       if (status != eslOK) { status = eslEFORMAT; goto ERROR; } /* This was our last attempt. Failure to open == failure to detect format */
261       sqfp->format = format = ascii->afp->format;
262     }
263   if (format == eslSQFILE_UNKNOWN) { status = eslEFORMAT; goto ERROR; }
264 
265 
266   /* Configure the <sqfp>'s parser and inmaps for this format. */
267   if (!esl_sqio_IsAlignment(format))
268     {
269       switch (format) {
270       case eslSQFILE_EMBL:     config_embl(sqfp);    inmap_embl(sqfp,    NULL);   break;
271       case eslSQFILE_UNIPROT:  config_embl(sqfp);    inmap_embl(sqfp,    NULL);   break;
272       case eslSQFILE_GENBANK:  config_genbank(sqfp); inmap_genbank(sqfp, NULL);   break;
273       case eslSQFILE_DDBJ:     config_genbank(sqfp); inmap_genbank(sqfp, NULL);   break;
274       case eslSQFILE_FASTA:    config_fasta(sqfp);   inmap_fasta(sqfp,   NULL);   break;
275       case eslSQFILE_DAEMON:   config_daemon(sqfp);  inmap_daemon(sqfp,  NULL);   break;
276       case eslSQFILE_HMMPGMD:  config_fasta(sqfp);   inmap_fasta(sqfp,   NULL);   break;
277       default:status = eslEFORMAT; goto ERROR;
278       }
279 
280       /* Preload the first line or chunk of file. */
281       status = loadbuf(sqfp);
282       if      (status == eslEOF) { status = eslEFORMAT; goto ERROR; }
283       else if (status != eslOK)  { goto ERROR; }
284 
285       /* hmmpgmd is a special case: we need to skip first line before parsing it.
286        * generalize that a little: this could be a section for parsing a file header,
287        * and leaving the buf positioned at the first char of the first record
288        * (just as expected if there's no file header)
289        */
290       switch (format) {
291       case eslSQFILE_HMMPGMD:   status = fileheader_hmmpgmd(sqfp); break;
292       default:                  status = eslOK;                    break;
293       }
294       if (status != eslOK) goto ERROR;
295     }
296   else
297     {
298       ascii->is_linebased = TRUE;
299       ascii->eof_is_ok    = FALSE; /* no-op for msa's */
300       ascii->parse_header = NULL;  /* no-op for msa's */
301       ascii->skip_header  = NULL;  /* no-op for msa's */
302       ascii->parse_end    = NULL;  /* no-op for msa's */
303     }
304 
305   /* initialize the function pointers for the ascii routines */
306   sqfp->position          = &sqascii_Position;
307   sqfp->close             = &sqascii_Close;
308 
309   sqfp->set_digital       = &sqascii_SetDigital;
310   sqfp->guess_alphabet    = &sqascii_GuessAlphabet;
311 
312   sqfp->is_rewindable     = &sqascii_IsRewindable;
313 
314   sqfp->read              = &sqascii_Read;
315   sqfp->read_info         = &sqascii_ReadInfo;
316   sqfp->read_seq          = &sqascii_ReadSequence;
317   sqfp->read_window       = &sqascii_ReadWindow;
318   sqfp->echo              = &sqascii_Echo;
319 
320   sqfp->read_block        = &sqascii_ReadBlock;
321 
322   sqfp->open_ssi          = &sqascii_OpenSSI;
323   sqfp->pos_by_key        = &sqascii_PositionByKey;
324   sqfp->pos_by_number     = &sqascii_PositionByNumber;
325 
326   sqfp->fetch             = &sqascii_Fetch;
327   sqfp->fetch_info        = &sqascii_FetchInfo;
328   sqfp->fetch_subseq      = &sqascii_FetchSubseq;
329   sqfp->get_error         = &sqascii_GetError;
330 
331   return eslOK;
332 
333  ERROR:
334   sqascii_Close(sqfp);
335   return status;
336 }
337 
338 
339 /* Function:  sqascii_GuessFileFormat()
340  * Synopsis:  Guess the format of an open <ESL_SQFILE>.
341  *
342  * Purpose:   Try to guess the sequence file format of <sqfp>, and
343  *            return the format code in <*ret_fmt>.
344  *
345  *            First we attempt to guess based on the <filename>'s
346  *            suffix. <*.fa> is assumed to be in FASTA format; <*.gb>
347  *            is assumed to be in GenBank format.
348  *
349  *            If that fails, we attempt to guess based on peeking at
350  *            the first nonblank line of <filename>. If the line
351  *            starts with $>$, we assume FASTA format; if the line
352  *            starts with <ID>, we assume EMBL format; if the line
353  *            starts with <LOCUS> or it contains the string <Genetic
354  *            Sequence Data Bank> we assume GenBank format.
355  *
356  *            If that fails too, return an <eslEFORMAT> error, and
357  *            <*ret_fmt> is set to <eslSQFILE_UNKNOWN>.
358  *
359  * Returns:   <eslOK> on success, and <*ret_fmt> contains
360  *            a valid sequence file format code, such as
361  *            <eslSQFILE_FASTA>.
362  *
363  *            Returns <eslEFORMAT> if we opened <filename> but it
364  *            contains no nonblank lines, or if we peeked at the first
365  *            nonblank line and still couldn't guess the format;
366  *            <*ret_fmt> is then <eslSQFILE_UNKNOWN>.
367  *
368  * Throws:    <eslEMEM> on allocation failure.
369  */
370 static int
sqascii_GuessFileFormat(ESL_SQFILE * sqfp,int * ret_fmt)371 sqascii_GuessFileFormat(ESL_SQFILE *sqfp, int *ret_fmt)
372 {
373   int   n         = strlen(sqfp->filename);
374   const char *sfx = NULL;
375   int   is_gzip   = FALSE;
376   int   nsfx;
377   int   status;
378 
379   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
380 
381   /* On any premature exit, *ret_fmt is eslSQFILE_UNKNOWN */
382   *ret_fmt = eslSQFILE_UNKNOWN;
383 
384   /* Is <filename> gzip'ed? Look at suffix. */
385   if (n > 3 && strcmp(sqfp->filename+n-3, ".gz") == 0) is_gzip = TRUE;
386 
387   /* Locate the suffix that might indicate format (ignore .gz) */
388   for (nsfx = 1, sfx = sqfp->filename + n - 1 - (is_gzip ? 3 : 0);
389        sfx != sqfp->filename && *sfx != '.';
390        sfx--)
391     nsfx++;
392 
393   /* now sfx points either to filename (we didn't find a suffix) or to the . of the suffix,
394    * and nsfx is the suffix length inclusive of the .
395    */
396 
397   /* Attempt to guess file format based on file name suffix. */
398   if      (nsfx && strcmp(sfx, ".fa") == 0)  { *ret_fmt = eslSQFILE_FASTA;      return eslOK; }
399   else if (nsfx && strcmp(sfx, ".gb") == 0)  { *ret_fmt = eslSQFILE_GENBANK;    return eslOK; }
400 
401   /* If that didn't work, we'll have a peek at the stream;
402    * turn recording on, and set for line based input.
403    */
404   if (ascii->is_recording == -1) ESL_EXCEPTION(eslEINVAL, "sq file already too advanced");
405   ascii->is_recording = TRUE;
406   ascii->is_linebased = TRUE;
407   loadbuf(sqfp);/* now ascii->buf is a line of the file */
408 
409   /* get first nonblank line */
410   while (esl_str_IsBlank(ascii->buf)) {
411     status = loadbuf(sqfp);
412     if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, ascii->errbuf, "No data found in file");
413     else if (status != eslOK)  goto ERROR;
414   }
415 
416   /* formats that can be determined from the first line: */
417   if      (*(ascii->buf) == '>')                                     *ret_fmt = eslSQFILE_FASTA;
418   else if (strncmp(ascii->buf, "ID   ", 5)    == 0)                  *ret_fmt = eslSQFILE_EMBL;
419   else if (strncmp(ascii->buf, "LOCUS   ", 8) == 0)                  *ret_fmt = eslSQFILE_GENBANK;
420   else if (strstr(ascii->buf, "Genetic Sequence Data Bank") != NULL) *ret_fmt = eslSQFILE_GENBANK;
421 
422   /* reset the sqfp */
423   ascii->mpos         = 0;
424   ascii->is_recording = FALSE;
425   ascii->is_linebased = FALSE;
426   free(ascii->buf);
427   ascii->buf    = NULL;
428   ascii->balloc = 0;
429   return (*ret_fmt == eslSQFILE_UNKNOWN) ? eslEFORMAT : eslOK;
430 
431  ERROR:
432   ascii->mpos         = 0;
433   ascii->is_recording = FALSE;
434   ascii->is_linebased = FALSE;
435   if (ascii->buf != NULL) { free(ascii->buf); ascii->balloc = 0; }
436   return status;
437 }
438 
439 /* Function:  sqascii_Position()
440  * Synopsis:  Reposition an open sequence file to an offset.
441   *
442  * Purpose:   Reposition an open <sqfp> to offset <offset>.
443  *            <offset> would usually be the first byte of a
444  *            desired sequence record.
445  *
446  *            Only normal sequence files can be positioned to a
447  *            nonzero offset. If <sqfp> corresponds to a standard
448  *            input stream or gzip -dc stream, it may not be
449  *            repositioned. If <sqfp> corresponds to a multiple
450  *            sequence alignment file, the only legal <offset>
451  *            is 0, to rewind the file to the beginning and
452  *            be able to read the entire thing again.
453  *
454  *            After <esl_sqfile_Position()> is called on a nonzero
455  *            <offset>, <sqfp->linenumber> and other bookkeeping
456  *            information is unknown. If caller knows it, it should
457  *            set it explicitly.
458  *
459  *            See the SSI module for manipulating offsets and indices.
460  *
461  * Returns:   <eslOK>     on success;
462  *            <eslEOF>    if no data can be read from this position.
463  *
464  * Throws:    <eslESYS> if the fseeko() or fread() call fails.
465  *            <eslEMEM> on (re-)allocation failure.
466  *            <eslEINVAL> if the <sqfp> is not positionable.
467  *            <eslENOTFOUND> if in trying to rewind an alignment file
468  *              by closing and reopening it, the open fails.
469  *            On errors, the state of <sqfp> is indeterminate, and
470  *            it should not be used again.
471  */
472 static int
sqascii_Position(ESL_SQFILE * sqfp,off_t offset)473 sqascii_Position(ESL_SQFILE *sqfp, off_t offset)
474 {
475   int status;
476 
477   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
478 
479   if (ascii->do_stdin)                  ESL_EXCEPTION(eslEINVAL, "can't Position() in standard input");
480   if (ascii->do_gzip)                   ESL_EXCEPTION(eslEINVAL, "can't Position() in a gzipped file");
481   if (offset < 0)                       ESL_EXCEPTION(eslEINVAL, "bad offset");
482   if (offset > 0 && ascii->afp != NULL) ESL_EXCEPTION(eslEINVAL, "can't use esl_sqfile_Position() w/ nonzero offset on MSA file");
483 
484   if (esl_sqio_IsAlignment(sqfp->format))
485     {/* msa file: close and reopen. maybe sometime we'll have esl_msafile_Rewind() */
486         /* we have already verified that offset==0 for MSA file */
487       esl_msafile_Close(ascii->afp);
488       if (ascii->msa != NULL) esl_msa_Destroy(ascii->msa);
489       ascii->afp = NULL;
490       ascii->msa = NULL;
491       ascii->idx = 0;
492 
493       /* we know we successfully opened it the first time, so a
494          failure to reopen is an exception, not a user-reportable
495          normal error. ENOTFOUND is the only normal error;
496          EFORMAT error can't occur because we know the format and
497          don't use autodetection.
498        */
499       status = esl_msafile_Open(NULL, sqfp->filename, NULL, sqfp->format, NULL, &(ascii->afp));
500       if      (status == eslENOTFOUND) ESL_EXCEPTION(eslENOTFOUND, "failed to reopen alignment file");
501       else if (status != eslOK)        return status;
502     }
503   else/* normal case: unaligned sequence file */
504     {
505       if (fseeko(ascii->fp, offset, SEEK_SET) != 0) ESL_EXCEPTION(eslESYS, "fseeko() failed");
506 
507       ascii->currpl     = -1;
508       ascii->curbpl     = -1;
509       ascii->prvrpl     = -1;
510       ascii->prvbpl     = -1;
511       ascii->linenumber = (offset == 0) ? 1 : -1; /* -1 is "unknown" */
512       ascii->L          = -1;
513       ascii->mpos       = ascii->mn;/* this forces loadbuf to load new data */
514       if ((status = loadbuf(sqfp)) != eslOK) return status;
515     }
516   return eslOK;
517 }
518 
519 
520 
521 /* Function:  sqascii_Close()
522  * Synopsis:  Close a sequence file.
523  *
524  * Purpose:   Closes an open <sqfp>.
525  *
526  * Returns:   (void).
527  */
528 static void
sqascii_Close(ESL_SQFILE * sqfp)529 sqascii_Close(ESL_SQFILE *sqfp)
530 {
531   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
532 
533 #ifdef HAVE_POPEN
534   if (ascii->do_gzip)          pclose(ascii->fp);
535   else
536 #endif
537   if (! ascii->do_stdin && ascii->fp != NULL) fclose(ascii->fp);
538 
539   if (ascii->ssifile  != NULL) free(ascii->ssifile);
540   if (ascii->mem      != NULL) free(ascii->mem);
541   if (ascii->balloc   > 0)     free(ascii->buf);
542   if (ascii->ssi      != NULL) esl_ssi_Close(ascii->ssi);
543 
544   if (ascii->afp      != NULL) esl_msafile_Close(ascii->afp);
545   if (ascii->msa      != NULL) esl_msa_Destroy(ascii->msa);
546 
547   ascii->do_gzip  = FALSE;
548   ascii->do_stdin = FALSE;
549 
550   ascii->fp       = NULL;
551 
552   ascii->ssifile  = NULL;
553   ascii->mem      = NULL;
554 
555   ascii->balloc   = 0;
556   ascii->buf      = NULL;
557 
558   ascii->ssi      = NULL;
559 
560   ascii->afp      = NULL;
561   ascii->msa      = NULL;
562 
563   return;
564 }
565 /*------------------- ESL_SQFILE open/close -----------------------*/
566 
567 
568 /*****************************************************************
569  *# 2. An <ESL_SQFILE> object, in digital mode [with <alphabet>]
570  *****************************************************************/
571 
572 /* Function:  sqascii_SetDigital()
573  * Synopsis:  Set an open <ESL_SQFILE> to read in digital mode.
574  *
575  * Purpose:   Given an <ESL_SQFILE> that's already been opened,
576  *            configure it to expect subsequent input to conform
577  *            to the digital alphabet <abc>.
578  *
579  *            Calling <esl_sqfile_Open(); esl_sqfile_SetDigital()> is
580  *            equivalent to <esl_sqfile_OpenDigital()>. The two-step
581  *            version is useful when you need a
582  *            <esl_sqfile_GuessAlphabet()> call in between, guessing
583  *            the file's alphabet in text mode before you set it to
584  *            digital mode.
585  *
586  * Returns:   <eslOK> on success.
587  */
588 static int
sqascii_SetDigital(ESL_SQFILE * sqfp,const ESL_ALPHABET * abc)589 sqascii_SetDigital(ESL_SQFILE *sqfp, const ESL_ALPHABET *abc)
590 {
591   int status = eslOK;
592 
593   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
594 
595   if (!esl_sqio_IsAlignment(sqfp->format))
596     {
597       switch (sqfp->format) {
598       case eslSQFILE_EMBL:       inmap_embl(sqfp,    abc->inmap); break;
599       case eslSQFILE_UNIPROT:    inmap_embl(sqfp,    abc->inmap); break;
600       case eslSQFILE_GENBANK:    inmap_genbank(sqfp, abc->inmap); break;
601       case eslSQFILE_DDBJ:       inmap_genbank(sqfp, abc->inmap); break;
602       case eslSQFILE_FASTA:      inmap_fasta(sqfp,   abc->inmap); break;
603       case eslSQFILE_DAEMON:     inmap_daemon(sqfp,  abc->inmap); break;
604 
605       default:                   status = eslEFORMAT;             break;
606       }
607     }
608   else
609     esl_msafile_SetDigital(ascii->afp, abc);
610 
611   return status;
612 }
613 
614 /* Function:  sqascii_GuessAlphabet()
615  * Synopsis:  Guess the alphabet of an open <ESL_SQFILE>.
616  *
617  * Purpose:   After opening <sqfp>, attempt to guess what alphabet
618  *            its sequences are in, by inspecting the first sequence
619  *            in the file, and return this alphabet type in <*ret_type>.
620  *
621  * Returns:   <eslOK> on success, and <*ret_type> is set to <eslDNA>,
622  *            <eslRNA>, or <eslAMINO>.
623  *
624  *            Returns <eslENOALPHABET> and sets <*ret_type> to
625  *            <eslUNKNOWN> if the first sequence (or alignment)
626  *            in the file contains no more than ten residues total,
627  *            or if its alphabet cannot be guessed (i.e. it contains
628  *            IUPAC degeneracy codes, but no amino acid specific
629  *            residues).
630  *
631  *            Returns <eslEFORMAT> if a parse error is encountered in
632  *            trying to read the sequence file. <ascii->errbuf> is set
633  *            to a useful error message if this occurs,
634  *            <sqfp->linenumber> is the line on which the error
635  *            occurred, and <*ret_type> is set to <eslUNKNOWN>.
636  *
637  *            Returns <eslENODATA> and sets <*ret_type> to <eslUNKNOWN>
638  *            if the file appears to be empty.
639  *
640  * Throws:    <eslEMEM> on allocation error;
641  *            <eslEINCONCEIVABLE> on unimaginable internal errors.
642  */
643 static int
sqascii_GuessAlphabet(ESL_SQFILE * sqfp,int * ret_type)644 sqascii_GuessAlphabet(ESL_SQFILE *sqfp, int *ret_type)
645 {
646   ESL_SQ *sq = NULL;
647   int     status;
648 
649   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
650 
651   /* Special case: for MSA files, hand this off to msafile_GuessAlphabet. */
652   if (esl_sqio_IsAlignment(sqfp->format)) return esl_msafile_GuessAlphabet(ascii->afp, ret_type);
653 
654   /* set the sqfp to record; we'll rewind afterwards and use the recording */
655   ascii->is_recording = TRUE;
656 
657   if ((sq = esl_sq_Create()) == NULL) { status = eslEMEM; goto ERROR; }
658 
659   status = sqascii_ReadWindow(sqfp, 0, 4000, sq);
660   if      (status == eslEOF) { status = eslENODATA; goto ERROR; }
661   else if (status != eslOK)  goto ERROR;
662 
663   if ((status = esl_sq_GuessAlphabet(sq, ret_type)) != eslOK) goto ERROR;
664 
665   /* reset the sqfp, so it uses the recording next */
666   ascii->mpos         = 0;
667   ascii->linenumber   = 1;
668   ascii->is_recording = FALSE;
669   if ((status = loadbuf(sqfp)) != eslOK) ESL_EXCEPTION(status, "buffer load failed, but shouldn't have");
670   esl_sq_Destroy(sq);
671   return eslOK;
672 
673  ERROR:
674   esl_sq_Destroy(sq);
675   *ret_type      = eslUNKNOWN;
676   return status;
677 }
678 /*-------------- end, digital mode ESL_SQFILE -------------------*/
679 
680 
681 
682 
683 /*****************************************************************
684  *# 3. Miscellaneous routines
685  *****************************************************************/
686 
687 /* Function:  sqascii_IsRewindable()
688  * Synopsis:  Return <TRUE> if <sqfp> can be rewound.
689  *
690  * Purpose:   Returns <TRUE> if <sqfp> can be rewound (positioned
691  *            to an offset of zero), in order to read it a second
692  *            time.
693  */
694 static int
sqascii_IsRewindable(const ESL_SQFILE * sqfp)695 sqascii_IsRewindable(const ESL_SQFILE *sqfp)
696 {
697   if (sqfp->data.ascii.do_gzip  == TRUE) return FALSE;
698   if (sqfp->data.ascii.do_stdin == TRUE) return FALSE;
699   return TRUE;
700 }
701 
702 /* Function:  sqascii_GetError()
703  * Synopsis:  Return <TRUE> if <sqfp> can be rewound.
704  *
705  * Purpose:   Returns <TRUE> if <sqfp> can be rewound (positioned
706  *            to an offset of zero), in order to read it a second
707  *            time.
708  */
709 static const char *
sqascii_GetError(const ESL_SQFILE * sqfp)710 sqascii_GetError(const ESL_SQFILE *sqfp)
711 {
712   return sqfp->data.ascii.errbuf;
713 }
714 
715 
716 /*****************************************************************
717  *# 4. Sequence reading (sequential)
718  *****************************************************************/
719 
720 /* Function:  sqascii_Read()
721  * Synopsis:  Read the next sequence from a file.
722  *
723  * Purpose:   Reads the next sequence from open sequence file <sqfp> into
724  *            <sq>. Caller provides an allocated and initialized <s>, which
725  *            will be internally reallocated if its space is insufficient.
726  *
727  * Returns:   <eslOK> on success; the new sequence is stored in <s>.
728  *
729  *            Returns <eslEOF> when there is no sequence left in the
730  *            file (including first attempt to read an empty file).
731  *
732  *            Returns <eslEFORMAT> if there's a problem with the format,
733  *            such as an illegal character; the line number that the parse
734  *            error occurs on is in <sqfp->linenumber>, and an informative
735  *            error message is placed in <ascii->errbuf>.
736  *
737  * Throws:    <eslEMEM> on allocation failure;
738  *            <eslEINCONCEIVABLE> on internal error.
739  */
740 static int
sqascii_Read(ESL_SQFILE * sqfp,ESL_SQ * sq)741 sqascii_Read(ESL_SQFILE *sqfp, ESL_SQ *sq)
742 {
743   int     status;
744   int64_t epos;
745   int64_t n;
746 
747   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
748 
749   if (esl_sqio_IsAlignment(sqfp->format))
750   {
751       ESL_SQ *tmpsq = NULL;
752       if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
753       { /* we need to load a new alignment? */
754         esl_msa_Destroy(ascii->msa);
755         status = esl_msafile_Read(ascii->afp, &(ascii->msa));
756         if (status == eslEFORMAT)
757         { /* oops, a parse error; upload the error info from afp to sqfp */
758            ascii->linenumber = ascii->afp->linenumber;
759            strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
760            return eslEFORMAT;
761         }
762         if (status != eslOK) return status;
763         ascii->idx = 0;
764       }
765 
766       /* grab next seq from alignment */
767       /* this is inefficient; it goes via a temporarily allocated copy of the sequence */
768       if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) return status;
769       esl_sq_GrowTo(sq, tmpsq->n);
770       esl_sq_Copy(tmpsq, sq);
771       esl_sq_Destroy(tmpsq);
772       ascii->idx++;
773 
774       sq->start = 1;
775       sq->end   = sq->n;
776       sq->C     = 0;
777       sq->W     = sq->n;
778       sq->L     = sq->n;
779       return eslOK;
780     }
781 
782   /* Main case: read next seq from sqfp's stream */
783   if (ascii->nc == 0) return eslEOF;
784   if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EMEM, EOF, EFORMAT */
785 
786   do {
787     if ((status = seebuf(sqfp, -1, &n, &epos)) == eslEFORMAT) return status;
788     if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM;
789     addbuf(sqfp, sq, n);
790     ascii->L   += n;
791     sq->eoff   = ascii->boff + epos - 1;
792     if (status == eslEOD)     break;
793   } while ((status = loadbuf(sqfp)) == eslOK);
794 
795   if      (status == eslEOF)
796     {
797       if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
798       if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
799     }
800   else if (status == eslEOD)
801     {
802       ascii->bpos = epos;
803       if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
804     }
805   else if (status != eslOK) return status;
806 
807   if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
808   else                 sq->seq[sq->n] = '\0';
809   sq->start = 1;
810   sq->end   = sq->n;
811   sq->C     = 0;
812   sq->W     = sq->n;
813   sq->L     = sq->n;
814   return eslOK;
815 }
816 
817 
818 /* Function:  sqascii_ReadInfo()
819  * Synopsis:  Read sequence info, but not the sequence itself.
820  *
821  * Purpose:   Read the next sequence from open sequence file <sqfp>,
822  *            but don't store the sequence (or secondary structure).
823  *            Upon successful return, <s> holds all the available
824  *            information about the sequence -- its name, accession,
825  *            description, and overall length <sq->L>.
826  *
827  *            This is useful for indexing sequence files, where
828  *            individual sequences might be ginormous, and we'd rather
829  *            avoid reading complete seqs into memory.
830  *
831  * Returns:   <eslOK> on success.
832  *
833  * Throws:    <eslEMEM> on allocation error.
834  */
835 static int
sqascii_ReadInfo(ESL_SQFILE * sqfp,ESL_SQ * sq)836 sqascii_ReadInfo(ESL_SQFILE *sqfp, ESL_SQ *sq)
837 {
838   int     status;
839   int64_t epos;
840   int64_t n;
841 
842   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
843 
844   if (esl_sqio_IsAlignment(sqfp->format))
845     {
846       ESL_SQ *tmpsq = NULL;
847       if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
848       { /* we need to load a new alignment? */
849         esl_msa_Destroy(ascii->msa);
850         status = esl_msafile_Read(ascii->afp, &(ascii->msa));
851         if (status == eslEFORMAT)
852         { /* oops, a parse error; upload the error info from afp to sqfp */
853           ascii->linenumber = ascii->afp->linenumber;
854           strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
855           return eslEFORMAT;
856         }
857         if (status != eslOK) return status;
858         ascii->idx = 0;
859       }
860 
861       /* grab next seq from alignment */
862       /* this is inefficient; it goes via a temporarily allocated copy of the sequence */
863       if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) return status;
864       if (tmpsq->dsq != NULL) tmpsq->dsq[1] = eslDSQ_SENTINEL;
865       else                    tmpsq->seq[0] = '\0';
866       esl_sq_Copy(tmpsq, sq);
867       esl_sq_Destroy(tmpsq);
868       ascii->idx++;
869 
870       if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
871       else                 sq->seq[0] = '\0';
872       if (sq->ss  != NULL) { free(sq->ss); sq->ss = NULL; }
873 
874       sq->n     = 0;
875       sq->start = 0;
876       sq->end   = 0;
877       sq->C     = 0;
878       sq->W     = 0;
879       return eslOK;
880     }
881 
882   if (ascii->nc == 0) return eslEOF;
883   if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
884 
885   ascii->L       = 0;
886   do {
887     status = seebuf(sqfp, -1, &n, &epos);
888     ascii->L += n;
889     sq->eoff = ascii->boff + epos - 1;
890     if (status == eslEFORMAT) return status;
891     if (status == eslEOD)     break;
892   } while ((status = loadbuf(sqfp)) == eslOK);
893 
894   if      (status == eslEOF)
895     {
896       if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
897     }
898   else if (status == eslEOD)
899     {
900       ascii->bpos = epos;
901       if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
902     }
903   else if (status != eslOK) return status;
904   sq->L = ascii->L;
905 
906   /* Set coord system for an info-only ESL_SQ  */
907   if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
908   else                 sq->seq[0] = '\0';
909   if (sq->ss  != NULL) { free(sq->ss); sq->ss = NULL; }
910   sq->n     = 0;
911   sq->start = 0;
912   sq->end   = 0;
913   sq->C     = 0;
914   sq->W     = 0;
915   return eslOK;
916 }
917 
918 
919 /* Function:  sqascii_ReadSequence()
920  * Synopsis:  Read the next sequence from a file.
921  *
922  * Purpose:   Reads the next sequence from open sequence file <sqfp> into
923  *            <sq>. Caller provides an allocated and initialized <s>, which
924  *            will be internally reallocated if its space is insufficient.
925  *
926  * Returns:   <eslOK> on success; the new sequence is stored in <s>.
927  *
928  *            Returns <eslEOF> when there is no sequence left in the
929  *            file (including first attempt to read an empty file).
930  *
931  *            Returns <eslEFORMAT> if there's a problem with the format,
932  *            such as an illegal character; the line number that the parse
933  *            error occurs on is in <sqfp->linenumber>, and an informative
934  *            error message is placed in <ascii->errbuf>.
935  *
936  * Throws:    <eslEMEM> on allocation failure;
937  *            <eslEINCONCEIVABLE> on internal error.
938  */
939 static int
sqascii_ReadSequence(ESL_SQFILE * sqfp,ESL_SQ * sq)940 sqascii_ReadSequence(ESL_SQFILE *sqfp, ESL_SQ *sq)
941 {
942   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
943   int64_t epos;
944   int64_t n;
945   int     status;
946 
947   if (esl_sqio_IsAlignment(sqfp->format))
948     {
949       ESL_SQ *tmpsq = NULL;
950       if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
951       { /* we need to load a new alignment? */
952         esl_msa_Destroy(ascii->msa);
953         status = esl_msafile_Read(ascii->afp, &(ascii->msa));
954         if (status == eslEFORMAT)
955         { /* oops, a parse error; upload the error info from afp to sqfp */
956           ascii->linenumber = ascii->afp->linenumber;
957           strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
958           return eslEFORMAT;
959         }
960         if (status != eslOK) return status;
961         ascii->idx = 0;
962       }
963 
964       /* grab next seq from alignment */
965       /* this is inefficient; it goes via a temporarily allocated copy of the sequence */
966       status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq);  // eslEMEM | eslEOD
967       if (status != eslOK) return status;
968 
969       esl_sq_GrowTo(sq, tmpsq->n);
970       esl_sq_Copy(tmpsq, sq);
971       esl_sq_Destroy(tmpsq);
972       ascii->idx++;
973 
974       sq->start = 1;
975       sq->end   = sq->n;
976       sq->C     = 0;
977       sq->W     = sq->n;
978       sq->L     = sq->n;
979       return eslOK;
980     }
981 
982   /* Main case: read next seq from sqfp's stream */
983   if (ascii->nc == 0) return eslEOF;
984   if ((status = ascii->skip_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
985 
986   do {
987     if ((status = seebuf(sqfp, -1, &n, &epos)) == eslEFORMAT) return status;
988     if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM;
989     addbuf(sqfp, sq, n);
990     ascii->L   += n;
991     sq->eoff   = ascii->boff + epos - 1;
992     if (status == eslEOD)     break;
993   } while ((status = loadbuf(sqfp)) == eslOK);
994 
995   if      (status == eslEOF)
996     {
997       if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
998       if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
999     }
1000   else if (status == eslEOD)
1001     {
1002       ascii->bpos = epos;
1003       if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
1004     }
1005   else if (status != eslOK) return status;
1006 
1007   if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
1008   else                 sq->seq[sq->n] = '\0';
1009   sq->start = 1;
1010   sq->end   = sq->n;
1011   sq->C     = 0;
1012   sq->W     = sq->n;
1013   sq->L     = sq->n;
1014   return eslOK;
1015 }
1016 
1017 
1018 /* Function:  sqascii_ReadWindow()
1019  * Synopsis:  Read next window of sequence.
1020  *
1021  * Purpose:   Read a next window of <W> residues from open file <sqfp>,
1022  *            keeping <C> residues from the previous window as
1023  *            context, and keeping previous annotation in the <sq>
1024  *            as before.
1025  *
1026  *            If this is the first window of a new sequence record,
1027  *            <C> is ignored (there's no previous context yet), and
1028  *            the annotation fields of the <sq> (name, accession, and
1029  *            description) are initialized by reading the sequence
1030  *            record's header. This is the only time the annotation
1031  *            fields are initialized.
1032  *
1033  *            On return, <sq->dsq[]> contains the window and its
1034  *            context; residues <1..sq->C> are the previous context,
1035  *            and residues <sq->C+1..sq->n> are the new window.  The
1036  *            start and end coordinates of the whole <dsq[1..n]>
1037  *            (including context) in the original source sequence are
1038  *            <sq->start..sq->end>. (Or, for text mode sequences,
1039  *            <sq->seq[0..sq->C-1,sq->C..sq->n-1]>, while <start> and
1040  *            <end> coords are still <1..L>.)
1041  *
1042  *            When a sequence record is completed and no more data
1043  *            remain, <eslEOD> is returned, with an ``info'' <sq>
1044  *            structure (containing the annotation and the total
1045  *            sequence length <L>, but no sequence). (The total
1046  *            sequence length <L> is unknown in <sq> until this
1047  *            <eslEOD> return.)
1048  *
1049  *            The caller may then do one of two things before calling
1050  *            <esl_sq_ReadWindow()> again; it can reset the sequence
1051  *            with <esl_sq_Reuse()> to continue reading the next
1052  *            sequence in the file, or it can set a negative <W> as a
1053  *            signal to read windows from the reverse complement
1054  *            (Crick) strand. Reverse complement reading only works
1055  *            for nucleic acid sequence.
1056  *
1057  *            If you read the reverse complement strand, you must read
1058  *            the whole thing, calling <esl_sqio_ReadWindow()> with
1059  *            negative <W> windows until <eslEOD> is returned again
1060  *            with an empty (info-only) <sq> structure. When that
1061  *            <EOD> is reached, the <sqfp> is repositioned at the
1062  *            start of the next sequence record; the caller should now
1063  *            <Reuse()> the <sq>, and the next <esl_sqio_ReadWindow()>
1064  *            call must have a positive <W>, corresponding to starting
1065  *            to read the Watson strand of the next sequence.
1066  *
1067  *            Note that the <ReadWindow()> interface is designed for
1068  *            an idiom of sequential reading of complete sequences in
1069  *            overlapping windows, possibly on both strands; if you
1070  *            want more freedom to move around in the sequence
1071  *            grabbing windows in another order, you can use the
1072  *            <FetchSubseq()> interface.
1073  *
1074  *            Reading the reverse complement strand requires file
1075  *            repositioning, so it will not work on non-repositionable
1076  *            streams like gzipped files or a stdin pipe. Moreover,
1077  *            for reverse complement input to be efficient, the
1078  *            sequence file should have consistent line lengths,
1079  *            suitable for SSI's fast subsequence indexing.
1080  *
1081  * Returns:   <eslOK> on success; <sq> now contains next window of
1082  *            sequence, with at least 1 new residue. The number
1083  *            of new residues is <sq->W>; <sq->C> residues are
1084  *            saved from the previous window. Caller may now
1085  *            process residues <sq->dsq[sq->C+1]..sq->dsq[sq->n]>.
1086  *
1087  *            <eslEOD> if no new residues were read for this sequence
1088  *            and strand, and <sq> now contains an empty info-only
1089  *            structure (annotation and <L> are valid). Before calling
1090  *            <esl_sqio_ReadWindow()> again, caller will either want
1091  *            to make <W> negative (to start reading the Crick strand
1092  *            of the current sequence), or it will want to reset the
1093  *            <sq> (with <esl_sq_Reuse()>) to go on the next sequence.
1094  *
1095  *            <eslEOF> if we've already returned <eslEOD> before to
1096  *            signal the end of the previous seq record, and moreover,
1097  *            there's no more sequence records in the file.
1098  *
1099  *            <eslEINVAL> if an invalid residue is found in the
1100  *            sequence, or if you attempt to take the reverse
1101  *            complement of a sequence that can't be reverse
1102  *            complemented.
1103  *
1104  * Throws:    <eslESYNTAX> if you try to read a reverse window before
1105  *            you've read forward strand.
1106  *
1107  *            <eslECORRUPT> if something goes awry internally in the
1108  *            coordinate system.
1109  *
1110  *            <eslEMEM> on allocation error.
1111  */
1112 static int
sqascii_ReadWindow(ESL_SQFILE * sqfp,int C,int W,ESL_SQ * sq)1113 sqascii_ReadWindow(ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq)
1114 {
1115   int     actual_start;
1116   int64_t nres;
1117   int64_t line;
1118   off_t   offset;
1119   int     status;
1120   ESL_SQ *tmpsq = NULL;
1121 
1122   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1123 
1124   if (esl_sqio_IsAlignment(sqfp->format))
1125   {
1126     /* special: if we're initializing a revcomp window read, back ascii->idx up one */
1127     if (W < 0 && sq->start == 0) ascii->idx--;
1128 
1129     if (ascii->msa == NULL || ascii->idx >= ascii->msa->nseq)
1130     { /* need new alignment? */
1131       esl_msa_Destroy(ascii->msa);
1132       status = esl_msafile_Read(ascii->afp, &(ascii->msa));
1133       if (status == eslEFORMAT)
1134       { /* oops, a parse error; upload the error info from afp to sqfp */
1135         ascii->linenumber = ascii->afp->linenumber;
1136         strcpy(ascii->errbuf, ascii->afp->errmsg); /* errbufs same size! */
1137         return eslEFORMAT;
1138       }
1139       else if (status != eslOK) goto ERROR;
1140       ascii->idx = 0;
1141     }
1142 
1143     /* grab appropriate seq from alignment into tmpsq */
1144     if ((status = esl_sq_FetchFromMSA(ascii->msa, ascii->idx, &tmpsq)) != eslOK) goto ERROR;
1145 
1146     /*by default, tmpsq is an ascii sequence, convert it to digital if that's what sq is*/
1147     if (sq->seq == NULL &&
1148 	(( status = esl_sq_Digitize(sq->abc, tmpsq)) != eslOK))
1149       goto ERROR;
1150 
1151     /* Figure out tmpsq coords we'll put in sq */
1152     if (W > 0)
1153     {/* forward strand */
1154        sq->C     = ESL_MIN(sq->n, C);
1155        sq->start = sq->end - sq->C + 1;
1156        sq->end   = ESL_MIN(tmpsq->L, sq->end + W);
1157        sq->n     = sq->end - sq->start + 1;
1158        sq->W     = sq->n - sq->C;
1159     }
1160     else
1161     {/* reverse strand */
1162        if (sq->L == -1) ESL_XEXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand");
1163 
1164        sq->C     = ESL_MIN(sq->n, sq->end + C - 1);
1165        sq->end   = (sq->start == 0 ? sq->L : sq->end + sq->C - 1);
1166        sq->start = ESL_MAX(1, sq->end + W - sq->C - 1);
1167        sq->n     = sq->end - sq->start + 1;
1168        sq->W     = sq->n - sq->C;
1169     }
1170 
1171     if (sq->W == 0)/* no new sequence? that's the EOD case */
1172     {
1173        sq->start      = 0;
1174        sq->end        = 0;
1175        sq->C          = 0;
1176        sq->W          = 0;
1177        sq->n          = 0;
1178        sq->L          = tmpsq->L;
1179        if      (sq->dsq) sq->dsq[1] = eslDSQ_SENTINEL;
1180        else if (sq->seq) sq->seq[0] = '\0';
1181 
1182        ascii->idx++;
1183        esl_sq_Destroy(tmpsq);
1184        return eslEOD;
1185     }
1186 
1187     /* Copy the sequence frag.  */
1188     if (tmpsq->ss != NULL && sq->ss == NULL) ESL_ALLOC(sq->ss, sizeof(char) * (sq->salloc)); /* this *must* be for salloc  */
1189     esl_sq_GrowTo(sq, sq->n);
1190     if (tmpsq->seq != NULL)
1191     {/* text mode */
1192        memcpy(sq->seq, tmpsq->seq + sq->start - 1, sizeof(char) * sq->n);
1193        sq->seq[sq->n] = '\0';
1194        if (tmpsq->ss != NULL) {
1195          memcpy(sq->ss, tmpsq->ss + sq->start - 1, sizeof(char) * sq->n);
1196          sq->ss[sq->n] = '\0';
1197        }
1198     }
1199     else
1200     {
1201      memcpy(sq->dsq + 1, tmpsq->dsq + sq->start, sizeof(ESL_DSQ) * sq->n);
1202      sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
1203      if (tmpsq->ss != NULL) {
1204        memcpy(sq->ss + 1, tmpsq->ss + sq->start, sizeof(char) * sq->n);
1205        sq->ss[sq->n+1] = '\0';
1206      }
1207     }
1208     if (W < 0 && (status = esl_sq_ReverseComplement(sq)) != eslOK)
1209       ESL_XFAIL(eslEINVAL, ascii->errbuf, "Can't reverse complement that sequence window");
1210 
1211     /* Copy annotation */
1212     if ((status = esl_sq_SetName     (sq, tmpsq->name))   != eslOK) goto ERROR;
1213     if ((status = esl_sq_SetSource   (sq, tmpsq->name))   != eslOK) goto ERROR;
1214     if ((status = esl_sq_SetAccession(sq, tmpsq->acc))    != eslOK) goto ERROR;
1215     if ((status = esl_sq_SetDesc     (sq, tmpsq->desc))   != eslOK) goto ERROR;
1216     sq->roff = -1;
1217     sq->doff = -1;
1218     sq->eoff = -1;
1219     sq->hoff = -1;
1220 
1221     esl_sq_Destroy(tmpsq);
1222     return eslOK;
1223   }
1224 
1225   /* Now for the normal case: we're reading a normal unaligned seq file, not an alignment. */
1226   /* Negative W indicates reverse complement direction */
1227   if (W < 0)
1228   {
1229     if (sq->L == -1) ESL_EXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand");
1230 
1231     if (sq->end == 1 || sq->L == 0)
1232       { /* last end == 1 means last window was the final one on reverse strand,
1233          * so we're EOD; jump back to last forward position.
1234          *
1235          * Also check for the unusual case of sq->L == 0, a completely empty sequence:
1236          * in that case, immediately return eslEOD.
1237          */
1238         if (ascii->bookmark_offset > 0)
1239           {
1240             if (esl_sqfile_Position(sqfp, ascii->bookmark_offset) != eslOK)
1241               ESL_EXCEPTION(eslECORRUPT, "Failed to reposition seq file at last forward bookmark");
1242             ascii->linenumber = ascii->bookmark_linenum;
1243           }
1244         else
1245           ascii->nc = 0; /* signals EOF */
1246 
1247         ascii->bookmark_offset  = 0;
1248         ascii->bookmark_linenum = 0;
1249 
1250         sq->start      = 0;
1251         sq->end        = 0;
1252         sq->C          = 0;
1253         sq->W          = 0;
1254         sq->n          = 0;
1255         /* sq->L stays as it is */
1256         if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
1257         else                 sq->seq[0] = '\0';
1258         return eslEOD;
1259       }
1260 
1261     /* If sq->start == 0, we haven't read any reverse windows yet;
1262      * init reading from sq->L
1263      */
1264     W = -W;
1265     if (sq->start == 0)
1266       {
1267         sq->start        = ESL_MAX(1, (sq->L - W + 1));
1268         sq->end          = sq->L;
1269         sq->C            = 0;
1270         sq->W            = sq->end - sq->start + 1;
1271         ascii->curbpl     = -1;
1272         ascii->currpl     = -1;
1273         ascii->prvbpl     = -1;
1274         ascii->prvrpl     = -1;
1275         ascii->linenumber = -1;
1276         ascii->L          = -1;
1277       }
1278     else
1279       { /* Else, we're continuing to next window; prv was <end>..<start> */
1280         sq->C     = ESL_MIN(C, sq->L - sq->end + 1);  /* based on prev window's end */
1281         sq->end   = sq->end + sq->C - 1;                /* also based on prev end     */
1282         sq->start = ESL_MAX(1, (sq->end - W - sq->C + 1));
1283         sq->W     = sq->end - sq->start + 1 - sq->C;
1284       }
1285 
1286     /* Now position for a subseq fetch of <start..end> on fwd strand, using SSI offset calc  */
1287     ESL_DASSERT1(( sq->doff != 0 ));
1288     if (ascii->bpl == 0 || ascii->rpl == 0)      /* no help; brute force resolution. */
1289       {
1290         offset       = sq->doff;
1291         actual_start = 1;
1292       }
1293     else if (ascii->bpl == ascii->rpl+1)         /* residue resolution */
1294       {
1295         line = (sq->start-1) / ascii->rpl; /* data line #0.. that <end> is on */
1296         offset       = sq->doff + line * ascii->bpl + (sq->start-1)%ascii->rpl;
1297         actual_start = sq->start;
1298       }
1299     else /* line resolution */
1300       {
1301         line         = (sq->start-1) / ascii->rpl; /* data line #0.. that <end> is on */
1302         offset       = sq->doff + line * ascii->bpl;
1303         actual_start = 1 + line * ascii->rpl;
1304       }
1305 
1306     if (esl_sqfile_Position(sqfp, offset) != eslOK)
1307       ESL_EXCEPTION(eslECORRUPT, "Failed to reposition seq file for reverse window read");
1308 
1309     /* grab the subseq and rev comp it */
1310     if ((status = esl_sq_GrowTo(sq, sq->C+sq->W)) != eslOK) return status;
1311     sq->n = 0;
1312     status = read_nres(sqfp, sq, (sq->start - actual_start), (sq->end - sq->start + 1), &nres);
1313 
1314     if (status != eslOK || nres < (sq->end - sq->start + 1))
1315       ESL_EXCEPTION(eslECORRUPT, "Failed to extract %d..%d", sq->start, sq->end);
1316 
1317     status = esl_sq_ReverseComplement(sq);
1318     if      (status    == eslEINVAL) ESL_FAIL(eslEINVAL, ascii->errbuf, "can't reverse complement that seq - it's not DNA/RNA");
1319     else if (status    != eslOK)     return status;
1320 
1321     return eslOK;
1322   }
1323 
1324   /* Else, we're reading the forward strand */
1325   else
1326   { /* sq->start == 0 means we haven't read any windows on this sequence yet...
1327    * it's a new record, and we need to initialize with the header and
1328    * the first window. This is the only case that we're allowed to return
1329    * EOF from.
1330    */
1331     if (sq->start == 0)
1332     {
1333       if (ascii->nc == 0) return eslEOF;
1334       if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
1335       sq->start     = 1;
1336       sq->C         = 0;/* no context in first window                   */
1337       sq->L         = -1;/* won't be known 'til EOD.                     */
1338       ascii->L       = 0;/* init to 0, so we can count residues as we go */
1339       esl_sq_SetSource(sq, sq->name);
1340       /* the <ascii->buf> is now positioned at the start of seq data */
1341       /* ascii->linenumber is ok where it is */
1342       /* the header_*() routines initialized rpl,bpl bookkeeping at start of seq line,
1343        * and also sq->doff,roff.
1344        */
1345     }
1346     else
1347     { /* else we're reading a window other than first; slide context over. */
1348       sq->C = ESL_MIN(C, sq->n);
1349 
1350       /* if the case where the window is smaller than the context and the
1351        * context is not full, it is not necessary to move the context part
1352        * of the sequence that has been read in.
1353        */
1354       if (sq->C >= C) {
1355          /* now handle the case where the context is full */
1356          if (sq->seq != NULL) memmove(sq->seq,   sq->seq + sq->n - sq->C,     sq->C);
1357          else                 memmove(sq->dsq+1, sq->dsq + sq->n - sq->C + 1, sq->C);
1358          sq->start = ascii->L - sq->C + 1;
1359          sq->n = C;
1360       }
1361     }
1362 
1363     if ((status = esl_sq_GrowTo(sq, C+W)) != eslOK)                return status; /* EMEM    */
1364     status = read_nres(sqfp, sq, 0, W, &nres);
1365     ascii->L += nres;
1366 
1367     if (status == eslEOD)
1368     { /* Forward strand is done. 0 residues were read. Return eslEOD and an empty (info) <sq>. */
1369       if ((status = ascii->parse_end(sqfp, sq)) != eslOK) return status;
1370 
1371       sq->start      = 0;
1372       sq->end        = 0;
1373       sq->C          = 0;
1374       sq->W          = 0;
1375       sq->L          = ascii->L;
1376       sq->n          = 0;
1377 
1378       if (ascii->nc > 0) {
1379         ascii->bookmark_offset  = ascii->boff+ascii->bpos; /* remember where the next seq starts. */
1380         //ascii->bookmark_linenum = ascii->bookmark_linenum;
1381       } else {
1382         ascii->bookmark_offset  = 0;                     /* signals for EOF, no more seqs        */
1383         ascii->bookmark_linenum = 0;
1384       }
1385 
1386       if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; /* erase the saved context */
1387       else                 sq->seq[0] = '\0';
1388       return eslEOD;
1389     }
1390     else if (status == eslOK)
1391     { /* Forward strand is still in progress. <= W residues were read. Return eslOK. */
1392       sq->end        = sq->start + sq->C + nres - 1;
1393       sq->W          = nres;
1394       return eslOK;
1395     }
1396     else return status;/* EFORMAT,EMEM */
1397   }
1398   /*NOTREACHED*/
1399   return eslOK;
1400 
1401  ERROR:
1402   if (tmpsq != NULL) esl_sq_Destroy(tmpsq);
1403   return status;
1404 }
1405 
1406 /* Function:  sqascii_ReadBlock()
1407  * Synopsis:  Read the next block of sequences from a file.
1408  *
1409  * Purpose:   Reads a block of sequences from open sequence file <sqfp> into
1410  *            <sqBlock>.
1411  *
1412  *            In the case that <long_target> is false, the sequences are
1413  *            expected to be protein - individual sequences won't be long
1414  *            so read them in one-whole-sequence at a time. If <max_sequences> is set
1415  *            to a number > 0 read <max_sequences> sequences, up to at most
1416  *            MAX_RESIDUE_COUNT residues.
1417  *
1418  *            If <long_target> is true, the sequences are expected to be DNA.
1419  *            Because sequences in a DNA database can exceed MAX_RESIDUE_COUNT,
1420  *            this function uses ReadWindow to read chunks of sequence no
1421  *            larger than <max_residues>, and must allow for the possibility that a
1422  *            request will be made to continue reading a partly-read
1423  *            sequence. This case also respects the <max_sequences> limit.
1424  *
1425  * Returns:   <eslOK> on success; the new sequence is stored in <sqBlock>.
1426  *
1427  *            Returns <eslEOF> when there is no sequence left in the
1428  *            file (including first attempt to read an empty file).
1429  *
1430  *            Returns <eslEFORMAT> if there's a problem with the format,
1431  *            such as an illegal character; the line number that the parse
1432  *            error occurs on is in <sqfp->linenumber>, and an informative
1433  *            error message is placed in <ascii->errbuf>.
1434  *
1435  * Throws:    <eslEMEM> on allocation failure;
1436  *            <eslEINCONCEIVABLE> on internal error.
1437  */
1438 static int
sqascii_ReadBlock(ESL_SQFILE * sqfp,ESL_SQ_BLOCK * sqBlock,int max_residues,int max_sequences,int long_target)1439 sqascii_ReadBlock(ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target)
1440 {
1441   int     i = 0;
1442   int     size = 0;
1443   int     status = eslOK;
1444   ESL_SQ *tmpsq = NULL;
1445 
1446   sqBlock->count = 0;
1447   if (max_sequences < 1 || max_sequences > sqBlock->listSize)
1448     max_sequences = sqBlock->listSize;
1449 
1450 
1451   if ( !long_target  )
1452   {  /* in these cases, an individual sequence won't ever be really long,
1453       so just read in a sequence at a time  */
1454 
1455     for (i = 0; i < max_sequences && size < MAX_RESIDUE_COUNT; ++i)
1456     {
1457       status = sqascii_Read(sqfp, sqBlock->list + i);
1458 
1459       if (status != eslOK) break;
1460       size += sqBlock->list[i].n;
1461       ++sqBlock->count;
1462     }
1463   }
1464   else
1465   { /* DNA, not an alignment.  Might be really long sequences */
1466 
1467     if (max_residues < 1)
1468       max_residues = MAX_RESIDUE_COUNT;
1469 
1470     tmpsq = esl_sq_CreateDigital(sqBlock->list->abc);
1471     //if complete flag is set to FALSE, then the prior block must have ended with a window that was a possibly
1472     //incomplete part of it's full sequence. Read another overlapping window.
1473     if (! sqBlock->complete )
1474     {
1475       //overloading C as indicator of how big C should be for this window reading action
1476       status = sqascii_ReadWindow(sqfp, sqBlock->list->C, max_residues, sqBlock->list);
1477       if (status == eslOK)
1478       {
1479         sqBlock->count = i = 1;
1480         size = sqBlock->list->n - sqBlock->list->C;
1481         sqBlock->list->L = sqfp->data.ascii.L;
1482         if (size == max_residues)
1483         { // Filled the block with a single very long window.
1484 
1485           sqBlock->complete = FALSE; // default value, unless overridden below
1486           status = skip_whitespace(sqfp);
1487           if ( status != eslOK ) { // either EOD or end of buffer (EOF) was reached before the next character was seen
1488             sqBlock->complete = TRUE;
1489             status = eslOK;
1490           }
1491 
1492           if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1493           return status;
1494         }
1495         else
1496         {
1497           // Burn off EOD (see notes for similar entry ~25 lines below), then go fetch the next sequence
1498           esl_sq_Reuse(tmpsq);
1499           tmpsq->start =  sqBlock->list->start ;
1500           tmpsq->C = 0;
1501           status = sqascii_ReadWindow(sqfp, 0, max_residues, tmpsq);
1502           if (status != eslEOD) {
1503             if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1504             return status; //surprising
1505           }
1506           //sqBlock->list->L = tmpsq->L;
1507         }
1508       }
1509       else if (status == eslEOD)
1510       { // turns out there isn't any more of the sequence to read, after all
1511       }
1512       else
1513       {
1514          if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1515          return status;
1516        }
1517     } // otherwise, just start at the beginning
1518 
1519 
1520     for (  ; i < max_sequences && size < max_residues; ++i) {
1521       /* restricted request_size is used to ensure that all blocks are pretty close to the
1522        * same size. Without it, we may either naively keep asking for max_residue windows,
1523        * which can result in a window with ~2*max_residues ... or we can end up with absurdly
1524        * short fragments at the end of blocks
1525        */
1526       int request_size = ESL_MAX(max_residues-size, max_residues * .05);
1527 
1528       esl_sq_Reuse(tmpsq);
1529       esl_sq_Reuse(sqBlock->list + i);
1530 
1531       status = sqascii_ReadWindow(sqfp, 0, request_size , tmpsq);
1532       esl_sq_Copy(tmpsq, sqBlock->list +i);
1533       if (status != eslOK && status != eslEOD){
1534         break;
1535         } /* end of sequences (eslEOF), or we read an empty seq (eslEOD) or error (other)  */
1536       size += sqBlock->list[i].n - sqBlock->list[i].C;
1537       sqBlock->list[i].L = sqfp->data.ascii.L;
1538       ++(sqBlock->count);
1539 
1540       if (size >= max_residues) {
1541         // a full window worth of sequence has been read; did we reach the end of the final sequence in the block?
1542         sqBlock->complete = FALSE; // default value, unless overridden below
1543 
1544         status = skip_whitespace(sqfp);
1545         if ( status != eslOK ) { // either EOD or end of buffer (EOF) was reached before the next character was seen
1546           sqBlock->complete = TRUE;
1547           status = eslOK;
1548         }
1549 
1550         if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1551         return status;
1552       } else if(status == eslEOD) {
1553         /* We've read an empty sequence of length 0, rare, but
1554          * possible, and we need to be able to handle it
1555          * gracefully. Ensure L is 0, set status to eslOK and move
1556          * on, we've already incremented sqBlock->count by 1
1557          * above. This means our block may contain zero-length
1558          * sequences when we return (that is, we still add these
1559          * seqs onto the block instead of skipping them altogether).
1560          */
1561         sqBlock->list[i].L = 0; /* actually, this should already be 0... */
1562         status = eslOK;
1563       } else {
1564         /* Sequence finished, but haven't yet reached max_residues. Need to burn off the EOD value
1565            that will be returned by the next ReadWindow call. Can just use a tmp sq, after setting
1566            a couple values ReadWindow needs to see for correct processing.
1567         */
1568         esl_sq_Reuse(tmpsq);
1569         tmpsq->start =  sqBlock->list[i].start ;
1570         tmpsq->C = 0;
1571         status = sqascii_ReadWindow(sqfp, 0, max_residues, tmpsq);
1572 
1573         if (status != eslEOD) {
1574           if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1575           return status; //surprising
1576         }
1577         //sqBlock->list[i].L = tmpsq->L;
1578         status = eslOK;
1579       }
1580     }
1581   }
1582 
1583   /* EOF will be returned only in the case were no sequences were read */
1584   if (status == eslEOF && i > 0) status = eslOK;
1585 
1586   sqBlock->complete = TRUE;
1587 
1588   if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1589 
1590   return status;
1591 }
1592 
1593 /* Function:  sqascii_Echo()
1594  * Synopsis:  Echo a sequence's record onto output stream.
1595  *
1596  * Purpose:   Given a complete <sq> that we have read by some means
1597  *            from an open <sqfp>; echo that sequence's record
1598  *            onto the output stream <ofp>.
1599  *
1600  *            This allows records to be regurgitated exactly as they
1601  *            appear, rather than writing the subset of information
1602  *            stored in an <ESL_SQ>. <esl-sfetch> in the miniapps uses
1603  *            this, for example.
1604  *
1605  *            Because this relies on repositioning the <sqfp>, it
1606  *            cannot be called on non-positionable streams (stdin or
1607  *            gzipped files). Because it relies on the sequence lying
1608  *            in a contiguous sequence of bytes in the file, it cannot
1609  *            be called on a sequence in a multiple alignment file.
1610  *            Trying to do so throws an <eslEINVAL> exception.
1611  *
1612  * Returns:   <eslOK> on success.
1613  *
1614  * Throws:    <eslEINVAL>   if <sqfp> isn't a repositionable sequence file.
1615  *            <eslECORRUPT> if we run out of data, probably from bad offsets
1616  *            <eslEMEM>     on allocation failure.
1617  *            <eslESYS>     on system call failures.
1618  *
1619  *
1620  */
1621 static int
sqascii_Echo(ESL_SQFILE * sqfp,const ESL_SQ * sq,FILE * ofp)1622 sqascii_Echo(ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp)
1623 {
1624   int     status;
1625   int64_t save_linenumber;
1626   int     save_currpl;
1627   int     save_curbpl;
1628   int     save_prvrpl;
1629   int     save_prvbpl;
1630   int64_t save_L;
1631   int     n;
1632   int     nwritten;
1633 
1634   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1635 
1636   if (ascii->do_stdin)                    ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from standard input");
1637   if (ascii->do_gzip)                     ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from a gzipped file");
1638   if (esl_sqio_IsAlignment(sqfp->format)) ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from an alignment file");
1639   if (sq->roff == -1 || sq->eoff == -1)   ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence without disk offset info");
1640 
1641   save_linenumber = ascii->linenumber;
1642   save_currpl     = ascii->currpl;
1643   save_curbpl     = ascii->curbpl;
1644   save_prvrpl     = ascii->prvrpl;
1645   save_prvbpl     = ascii->prvbpl;
1646   save_L          = ascii->L;
1647 
1648   status = esl_sqfile_Position(sqfp, sq->roff);
1649   if      (status == eslEOF) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?");
1650   else if (status != eslOK)  return status;
1651 
1652   while (ascii->boff + ascii->nc <= sq->eoff)
1653     {
1654       if (fwrite(ascii->buf, sizeof(char), ascii->nc, ofp) != ascii->nc) ESL_EXCEPTION(eslESYS, "fwrite() failed");
1655       if (loadbuf(sqfp) != eslOK)  ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?");
1656     }
1657   n =  sq->eoff - ascii->boff + 1;
1658   nwritten = fwrite(ascii->buf, sizeof(char), n, ofp);
1659   if (nwritten != n) ESL_EXCEPTION(eslESYS, "fwrite() failed");
1660 
1661   status = esl_sqfile_Position(sqfp, sq->roff);
1662   if      (status == eslEOF) ESL_EXCEPTION(eslECORRUPT, "repositioning failed; bad offset?");
1663   else if (status != eslOK)  return status;
1664 
1665   ascii->linenumber = save_linenumber;
1666   ascii->currpl     = save_currpl;
1667   ascii->curbpl     = save_curbpl;
1668   ascii->prvrpl     = save_prvrpl;
1669   ascii->prvbpl     = save_prvbpl;
1670   ascii->L          = save_L;
1671   return eslOK;
1672 }
1673 /*------------------ end, sequential sequence input -------------*/
1674 
1675 
1676 /*****************************************************************
1677  *# 5. Sequence/subsequence fetching, random access [with <ssi>]
1678  *****************************************************************/
1679 
1680 /* Function:  sqascii_OpenSSI()
1681  * Synopsis:  Opens an SSI index associated with a sequence file.
1682  *
1683  * Purpose:   Opens an SSI index file associated with the already open
1684  *            sequence file <sqfp>. If successful, the necessary
1685  *            information about the open SSI file is stored internally
1686  *            in <sqfp>.
1687  *
1688  *            The SSI index file name is determined in one of two
1689  *            ways, depending on whether a non-<NULL> <ssifile_hint>
1690  *            is provided.
1691  *
1692  *            If <ssifile_hint> is <NULL>, the default for
1693  *            constructing the SSI filename from the sequence
1694  *            filename, by using exactly the same path (if any) for
1695  *            the sequence filename, and appending the suffix <.ssi>.
1696  *            For example, the SSI index for <foo> is <foo.ssi>, for
1697  *            <./foo.fa> is <./foo.fa.ssi>, and for
1698  *            </my/path/to/foo.1.fa> is </my/path/to/foo.1.fa.ssi>.
1699  *
1700  *            If <ssifile_hint> is <non-NULL>, this exact fully
1701  *            qualified path is used as the SSI file name.
1702  *
1703  * Returns:   <eslOK> on success, and <sqfp->ssi> is now internally
1704  *            valid.
1705  *
1706  *            <eslENOTFOUND> if no SSI index file is found;
1707  *            <eslEFORMAT> if it's found, but appears to be in incorrect format;
1708  *            <eslERANGE> if the SSI file uses 64-bit offsets but we're on
1709  *            a system that doesn't support 64-bit file offsets.
1710  *
1711  * Throws:    <eslEINVAL> if the open sequence file <sqfp> doesn't
1712  *            correspond to a normal sequence flatfile -- we can't
1713  *            random access in .gz compressed files, standard input,
1714  *            or multiple alignment files that we're reading
1715  *            sequentially.
1716  *
1717  *            Throws <eslEMEM> on allocation error.
1718  */
1719 static int
sqascii_OpenSSI(ESL_SQFILE * sqfp,const char * ssifile_hint)1720 sqascii_OpenSSI(ESL_SQFILE *sqfp, const char *ssifile_hint)
1721 {
1722   int status;
1723 
1724   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1725 
1726   if (ascii->do_gzip)     ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for a .gz compressed seq file");
1727   if (ascii->do_stdin)    ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for standard input");
1728   if (ascii->afp != NULL) ESL_EXCEPTION(eslEINVAL, "can't open an SSI index for sequential input from an MSA");
1729 
1730   if (ssifile_hint == NULL) {
1731     if ((status = esl_strdup(sqfp->filename, -1, &(ascii->ssifile)))           != eslOK) return status;
1732     if ((status = esl_strcat(&(ascii->ssifile), -1, ".ssi", 4))                != eslOK) return status;
1733   } else {
1734     if ((status = esl_strdup(ssifile_hint, -1, &(ascii->ssifile)))             != eslOK) return status;
1735   }
1736 
1737   return esl_ssi_Open(ascii->ssifile, &(ascii->ssi));
1738 }
1739 
1740 
1741 
1742 /* Function:  sqascii_PositionByKey()
1743  * Synopsis:  Use SSI to reposition seq file to a particular sequence.
1744  *
1745  * Purpose:   Reposition <sqfp> so that the next sequence we read will
1746  *            be the one named (or accessioned) <key>.
1747  *
1748  *            <sqfp->linenumber> is reset to be relative to the start
1749  *            of the record named <key>, rather than the start of the
1750  *            file.
1751  *
1752  * Returns:   <eslOK> on success, and the file <sqfp> is repositioned
1753  *            so that the next <esl_sqio_Read()> call will read the
1754  *            sequence named <key>.
1755  *
1756  *            Returns <eslENOTFOUND> if <key> isn't found in the
1757  *            index; in this case, the position of <sqfp> in the file
1758  *            is unchanged.
1759  *
1760  *            Returns <eslEFORMAT> if something goes wrong trying to
1761  *            read the index, almost certainly indicating a format
1762  *            problem in the SSI file.
1763  *
1764  *            Returns <eslEOF> if, after repositioning, we fail to
1765  *            load the next line or buffer from the sequence file;
1766  *            this probably also indicates a format problem in the SSI
1767  *            file.
1768  *
1769  * Throws:    <eslEMEM>   on allocation error;
1770  *            <eslEINVAL> if there's no open SSI index in <sqfp>;
1771  *            <eslESYS>   if the <fseek()> fails.
1772  *
1773  *            In all these cases, the state of <sqfp> becomes
1774  *            undefined, and the caller should not use it again.
1775  */
1776 static int
sqascii_PositionByKey(ESL_SQFILE * sqfp,const char * key)1777 sqascii_PositionByKey(ESL_SQFILE *sqfp, const char *key)
1778 {
1779   uint16_t fh;
1780   off_t    offset;
1781   int      status;
1782 
1783   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1784 
1785   if (ascii->ssi == NULL)                          ESL_EXCEPTION(eslEINVAL,"Need an open SSI index to call esl_sqfile_PositionByKey()");
1786   if ((status = esl_ssi_FindName(ascii->ssi, key, &fh, &offset, NULL, NULL)) != eslOK) return status;
1787   return esl_sqfile_Position(sqfp, offset);
1788 }
1789 
1790 
1791 /* Function:  sqascii_PositionByNumber()
1792  * Synopsis:  Use SSI to reposition by sequence number
1793  *
1794  * Purpose:   Reposition <sqfp> so that the next sequence we
1795  *            read will be the <which>'th sequence, where <which>
1796  *            is <0..sqfp->ssi->nprimary-1>.
1797  *
1798  *            <sqfp->linenumber> is reset to be relative to the start
1799  *            of the record named <key>, rather than the start of the
1800  *            file.
1801  *
1802  * Returns:   <eslOK> on success, and the file <sqfp> is repositioned.
1803  *
1804  *            Returns <eslENOTFOUND> if there is no sequence number
1805  *            <which> in the index; in this case, the position of
1806  *            <sqfp> in the file is unchanged.
1807  *
1808  *            Returns <eslEFORMAT> if something goes wrong trying to
1809  *            read the index, almost certainly indicating a format
1810  *            problem in the SSI file.
1811  *
1812  *            Returns <eslEOF> if, after repositioning, we fail to
1813  *            load the next line or buffer from the sequence file;
1814  *            this probably also indicates a format problem in the SSI
1815  *            file.
1816  *
1817  * Throws:    <eslEMEM>   on allocation error;
1818  *            <eslEINVAL> if there's no open SSI index in <sqfp>;
1819  *            <eslESYS>   if the <fseek()> fails.
1820  *
1821  *            In all these cases, the state of <sqfp> becomes
1822  *            undefined, and the caller should not use it again.
1823  */
1824 static int
sqascii_PositionByNumber(ESL_SQFILE * sqfp,int which)1825 sqascii_PositionByNumber(ESL_SQFILE *sqfp, int which)
1826 {
1827   uint16_t fh;
1828   off_t    offset;
1829   int      status;
1830 
1831   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1832 
1833   if (ascii->ssi == NULL)                          ESL_EXCEPTION(eslEINVAL,"Need open SSI index to call esl_sqfile_PositionByNumber()");
1834   if ((status = esl_ssi_FindNumber(ascii->ssi, which, &fh, &offset, NULL, NULL, NULL)) != eslOK) return status;
1835   return esl_sqfile_Position(sqfp, offset);
1836 }
1837 
1838 
1839 /* Function:  sqascii_Fetch()
1840  * Synopsis:  Fetch a complete sequence, using SSI indexing.
1841  *
1842  * Purpose:   Fetch a sequence named (or accessioned) <key> from
1843  *            the repositionable, open sequence file <sqfp>.
1844  *            The open <sqfp> must have an open SSI index.
1845  *            The sequence is returned in <sq>.
1846  *
1847  * Returns:   <eslOK> on soccess.
1848  *            <eslEINVAL> if no SSI index is present, or if <sqfp> can't
1849  *            be repositioned.
1850  *            <eslENOTFOUND> if <source> isn't found in the file.
1851  *            <eslEFORMAT> if either the index file or the sequence file
1852  *            can't be parsed, because of unexpected format issues.
1853  *
1854  * Throws:    <eslEMEM> on allocation error.
1855  */
1856 static int
sqascii_Fetch(ESL_SQFILE * sqfp,const char * key,ESL_SQ * sq)1857 sqascii_Fetch(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq)
1858 {
1859   int status;
1860 
1861   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1862 
1863   if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename);
1864   if ((status = sqascii_PositionByKey(sqfp, key)) != eslOK) return status;
1865   if ((status = sqascii_Read(sqfp, sq))           != eslOK) return status;
1866   return eslOK;
1867 }
1868 
1869 /* Function:  sqascii_FetchInfo()
1870  * Synopsis:  Fetch a sequence's info, using SSI indexing.
1871  *
1872  * Purpose:   Fetch a sequence named (or accessioned) <key> from
1873  *            the repositionable, open sequence file <sqfp>, reading
1874  *            all info except the sequence (and secondary structure).
1875  *            The open <sqfp> must have an open SSI index.
1876  *            The sequence info is returned in <sq>.
1877  *
1878  * Returns:   <eslOK> on soccess.
1879  *            <eslEINVAL> if no SSI index is present, or if <sqfp> can't
1880  *            be repositioned.
1881  *            <eslENOTFOUND> if <source> isn't found in the file.
1882  *            <eslEFORMAT> if either the index file or the sequence file
1883  *            can't be parsed, because of unexpected format issues.
1884  *
1885  * Throws:    <eslEMEM> on allocation error.
1886  */
1887 static int
sqascii_FetchInfo(ESL_SQFILE * sqfp,const char * key,ESL_SQ * sq)1888 sqascii_FetchInfo(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq)
1889 {
1890   int status;
1891 
1892   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1893 
1894   if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename);
1895   if ((status = sqascii_PositionByKey(sqfp, key)) != eslOK) return status;
1896   if ((status = sqascii_ReadInfo(sqfp, sq))         != eslOK) return status;
1897   return eslOK;
1898 }
1899 
1900 
1901 /* Function:  sqascii_FetchSubseq()
1902  * Synopsis:  Fetch a subsequence, using SSI indexing.
1903  *
1904  * Purpose:   Fetch subsequence <start..end> from a sequence named (or
1905  *            accessioned) <source>, in the repositionable, open sequence file <sqfp>.
1906  *            The open <sqfp> must have an SSI index. Put the
1907  *            subsequence in <sq>.
1908  *
1909  *            As a special case, if <end> is 0, the subsequence is
1910  *            fetched all the way to the end, so you don't need to
1911  *            look up the sequence length <L> to fetch a suffix.
1912  *
1913  *            The caller may want to rename/reaccession/reannotate the
1914  *            subsequence.  Upon successful return, <sq->name> is set
1915  *            to <source/start-end>, and <sq->source> is set to
1916  *            <source> The accession and description <sq->acc> and
1917  *            <sq->desc> are set to the accession and description of
1918  *            the source sequence.
1919  *
1920  * Returns:   <eslOK> on success.
1921  *            <eslEINVAL> if no SSI index is present, or if <sqfp> can't
1922  *            be repositioned.
1923  *            <eslENOTFOUND> if <source> isn't found in the file.
1924  *            <eslEFORMAT> if either the index file or the sequence file
1925  *            can't be parsed, because of unexpected format issues.
1926  *            <eslERANGE> if the <start..end> coords don't lie entirely
1927  *            within the <source> sequence.
1928  *
1929  * Throws:    <eslEMEM> on allocation errors.
1930  */
1931 static int
sqascii_FetchSubseq(ESL_SQFILE * sqfp,const char * source,int64_t start,int64_t end,ESL_SQ * sq)1932 sqascii_FetchSubseq(ESL_SQFILE *sqfp, const char *source, int64_t start, int64_t end, ESL_SQ *sq)
1933 {
1934   uint16_t fh;/* SSI file handle */
1935   off_t    r_off, d_off;
1936   int64_t  L;
1937   int64_t  actual_start;
1938   int64_t  nskip;
1939   int64_t  nres;
1940   int64_t  n;
1941   int      status;
1942 
1943   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
1944 
1945   if (ascii->ssi == NULL) ESL_FAIL(eslEINVAL, ascii->errbuf, "No SSI index for %s; can't fetch subsequences", sqfp->filename);
1946 
1947   /* Find sequence info in the index */
1948   status = esl_ssi_FindSubseq(ascii->ssi, source, start, &fh, &r_off, &d_off, &L, &actual_start);
1949   if      (status == eslENOTFOUND) ESL_FAIL(status, ascii->errbuf, "Didn't find sequence %s in the index", source);
1950   else if (status == eslEFORMAT)   ESL_FAIL(status, ascii->errbuf, "Failure reading SSI index; corrupt or bad format");
1951   else if (status == eslERANGE)    ESL_FAIL(status, ascii->errbuf, "Requested start %" PRIi64 " isn't in the sequence %s", start, source);
1952   else if (status != eslOK)        ESL_FAIL(status, ascii->errbuf, "Unexpected failure in finding subseq offset");
1953 
1954   /* The special case of end=0, asking for suffix fetch */
1955   if (end == 0) end = L;
1956 
1957   /* Validate coords if we can */
1958   if (start > end)       ESL_FAIL(eslERANGE, ascii->errbuf, "Subsequence start %" PRIi64 " is greater than end %" PRIi64 "\n", start, end);
1959   if (L > 0 && end > L)  ESL_FAIL(eslERANGE, ascii->errbuf, "Subsequence end %" PRIi64 " is greater than length %" PRIi64 "\n", end, L);
1960 
1961   /* Position the file at the record header; read the header info */
1962   status = esl_sqfile_Position(sqfp, r_off);
1963   if      (status == eslEOF)    ESL_FAIL(status, ascii->errbuf, "Position appears to be off the end of the file");
1964   else if (status == eslEINVAL) ESL_FAIL(status, ascii->errbuf, "Sequence file is not repositionable");
1965   else if (status != eslOK)     ESL_FAIL(status, ascii->errbuf, "Failure in positioning sequence file");
1966   if ((status = ascii->parse_header(sqfp, sq)) != eslOK) return status;
1967 
1968   /* Position the file close to the subseq: either at the start of the line
1969    * where the subseq starts, or exactly at the residue.
1970    */
1971   if (d_off != 0)
1972     {
1973       status = esl_sqfile_Position(sqfp, d_off);
1974       if      (status == eslEOF)    ESL_FAIL(eslERANGE, ascii->errbuf, "Position appears to be off the end of the file");
1975       else if (status == eslEINVAL) ESL_FAIL(status,    ascii->errbuf, "Sequence file is not repositionable");
1976       else if (status != eslOK)     ESL_FAIL(status,    ascii->errbuf, "Failure in positioning sequence file");
1977     }
1978   /* even if we didn't have a data offset, we're positioned at the
1979    * start of the sequence anyway, because we parsed the full header
1980    */
1981   nskip = start - actual_start; /* how many residues do we still need to skip to reach start       */
1982   nres  = end - start + 1;   /* how many residues do we need to read as subseq                  */
1983 
1984   if ((status = esl_sq_GrowTo(sq, nres)) != eslOK) return status;
1985   status = read_nres(sqfp, sq, nskip, nres, &n);
1986   if (status != eslOK || n < nres) ESL_EXCEPTION(eslEINCONCEIVABLE, "Failed to fetch subsequence residues -- corrupt coords?");
1987 
1988   /* Set the coords */
1989   sq->start = start;
1990   sq->end   = end;
1991   sq->C     = 0;
1992   sq->W     = sq->n;
1993   sq->L     = (L > 0 ? L : -1);
1994   esl_sq_FormatName(sq, "%s/%" PRId64 "-%" PRId64, source, start, end);
1995   esl_sq_SetSource (sq, source);
1996   return eslOK;
1997 }
1998 /*------------- end, random sequence access with SSI -------------------*/
1999 
2000 
2001 /*****************************************************************
2002  * 6. Internal routines shared by parsers
2003  *****************************************************************/
2004 
2005 
2006 /* loadmem()
2007  *
2008  * Load the next block of data from stream into mem buffer,
2009  * either concatenating to previous buffer (if we're recording) or
2010  * overwriting (if not).
2011  *
2012  * This block is loaded at sqfp->mem + sqfp->mpos.
2013  *
2014  * Upon return:
2015  * sqfp->mem     now contains up to eslREADBUFSIZE more chars
2016  * sqfp->mpos    is position of first byte in newly read block
2017  * sqfp->allocm  may have increased by eslREADBUFSIZE, if we concatenated
2018  * sqfp->mn      is # of chars in <mem>; <mn-1> is pos of last byte in new block
2019  *
2020  * Returns <eslEOF> (and mpos == mn) if no new data can be read;
2021  * Returns <eslOK>  (and mpos < mn) if new data is read.
2022  * Throws <eslEMEM> on allocation error.
2023  */
2024 static int
loadmem(ESL_SQFILE * sqfp)2025 loadmem(ESL_SQFILE *sqfp)
2026 {
2027   void *tmp;
2028   int   n = 0;
2029   int   status;
2030 
2031   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2032 
2033   if (ascii->do_buffer)
2034   {
2035       ascii->mpos = 0;
2036       ascii->mn   = 0;
2037   }
2038   else if (ascii->is_recording == TRUE)
2039   {
2040       if (ascii->mem == NULL) ascii->moff = ftello(ascii->fp);        /* first time init of the offset */
2041       ESL_RALLOC(ascii->mem, tmp, sizeof(char) * (ascii->allocm + eslREADBUFSIZE));
2042       ascii->allocm += eslREADBUFSIZE;
2043       n = fread(ascii->mem + ascii->mpos, sizeof(char), eslREADBUFSIZE, ascii->fp);
2044       ascii->mn += n;
2045   }
2046   else
2047   {
2048       if (ascii->mem == NULL) {
2049         ESL_ALLOC(ascii->mem, sizeof(char) * eslREADBUFSIZE);
2050         ascii->allocm = eslREADBUFSIZE;
2051       }
2052       ascii->is_recording = -1;/* no more recording is possible now */
2053       ascii->mpos = 0;
2054       ascii->moff = ftello(ascii->fp);
2055       n = fread(ascii->mem, sizeof(char), eslREADBUFSIZE, ascii->fp); /* see note [1] below */
2056       ascii->mn   = n;
2057   }
2058   return (n == 0 ? eslEOF : eslOK);
2059 
2060  ERROR:
2061   return status;
2062 }
2063 
2064 /* [1] Be alert for a possible problem above in that fread().
2065  *     Farrar had inserted an alternative case as follows:
2066  *     "If we are reading from stdin, buffered read cannot be used
2067  *      because if will block until EOF or the buffer is full, ie
2068  *      eslREADBUFSIZE characters have been read.  Usually this would
2069  *      not be a problem, unless stdin is from a pipe.  In that case
2070  *      if the sequence is less than eslREADBUFSIZE we would block.
2071  *
2072  *      NOTE:  any changes to the IO stream ascii->fp, such as fseek,
2073  *      might not have any affect on the file descriptor for the stream.
2074  *
2075  *   if (ascii->do_stdin) {
2076  *     n = read(fileno(ascii->fp), ascii->mem, eslREADBUFSIZE);
2077  *   } else {
2078  *   ...
2079  *
2080  * but that's a bug, because you can't mix read and fread;
2081  * the i17-stdin.pl test fails, in particular.
2082  */
2083 
2084 
2085 
2086 
2087 /* loadbuf()
2088  * Set sqfp->buf to contain next line of data, or point to next block.
2089  * This might just mean working with previously buffered memory in <sqfp->mem>
2090  * or might require reading new data from <sqfp->fp>.
2091  *
2092  * Reset sqfp->boff to be the position of the start of the block/line.
2093  * Reset sqfp->bpos to 0.
2094  * Reset sqfp->nc to the number of chars (bytes) in the new block/line.
2095  * Returns eslOK on success; eslEOF if there's no more data in the file.
2096  * (sqfp->nc == 0 is the same as eslEOF: no data in the new buffer.)
2097  * Can throw an <eslEMEM> error.
2098  */
2099 static int
loadbuf(ESL_SQFILE * sqfp)2100 loadbuf(ESL_SQFILE *sqfp)
2101 {
2102   void *tmp;
2103   char *nlp;
2104   int   n;
2105   int   status = eslOK;
2106 
2107   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2108 
2109   if (! ascii->is_linebased)
2110   {
2111       if (ascii->mpos >= ascii->mn) {
2112         if ((status = loadmem(sqfp)) == eslEMEM) return status;
2113       }
2114       ascii->buf    = ascii->mem  + ascii->mpos;
2115       ascii->boff   = ascii->moff + ascii->mpos;
2116       ascii->balloc = 0;
2117       ascii->bpos   = 0;
2118       ascii->nc     = ascii->mn - ascii->mpos;
2119       ascii->mpos  += ascii->nc;
2120   }
2121   else
2122   { /* Copy next line from <mem> into <buf>. Might require new load(s) into <mem>. */
2123       if (ascii->mpos >= ascii->mn) {
2124         if ((status = loadmem(sqfp)) == eslEMEM) return status;
2125       }
2126       ascii->boff = ascii->moff + ascii->mpos;
2127       ascii->nc   = 0;
2128       nlp        = memchr(ascii->mem + ascii->mpos, '\n', ascii->mn - ascii->mpos);
2129       while (nlp == NULL)
2130       {
2131         n = ascii->mn - ascii->mpos;
2132         while (ascii->nc + n + 1 > ascii->balloc) { /* +1: it'll hold the terminal \0 */
2133           ESL_RALLOC(ascii->buf, tmp, sizeof(char) * (ascii->balloc + eslREADBUFSIZE));
2134           ascii->balloc += eslREADBUFSIZE;
2135         }
2136         memcpy(ascii->buf + ascii->nc, ascii->mem + ascii->mpos, n);
2137         ascii->mpos += n;
2138         ascii->nc   += n;
2139         status = loadmem(sqfp);
2140         if      (status == eslEOF) { break; }
2141         else if (status != eslOK)  return status;
2142         nlp = memchr(ascii->mem + ascii->mpos, '\n', ascii->mn - ascii->mpos);
2143       }
2144       if (status != eslEOF) {
2145         n = nlp - (ascii->mem + ascii->mpos) + 1; /* inclusive of \n */
2146         if (ascii->nc + n + 1 > ascii->balloc) {
2147           ESL_RALLOC(ascii->buf, tmp, sizeof(char) * (ascii->balloc + eslREADBUFSIZE));
2148           ascii->balloc += eslREADBUFSIZE;
2149         }
2150         memcpy(ascii->buf + ascii->nc, ascii->mem + ascii->mpos, n);
2151         ascii->mpos += n;
2152         ascii->nc   += n;
2153       }
2154       ascii->bpos  = 0;
2155       ascii->buf[ascii->nc] = '\0';
2156   }
2157   return (ascii->nc == 0 ? eslEOF : eslOK);
2158 
2159 ERROR:
2160   return status;
2161 }
2162 
2163 /* nextchar()
2164  *
2165  * Load next char from sqfp->buf into <*ret_c> and sets sqfp->bpos to
2166  * its position; usually this is c = sqfp->buf[++sqfp->bpos], but
2167  * we will refill the buffer w/ fresh fread() when needed, in which
2168  * case c =  sqfp->buf[0] and sqfp->bpos = 0.
2169  *
2170  * Returns <eslOK> on success.
2171  * Return  <eslEOF> if we ran out of data in <sqfp>.
2172  * May throw an <eslEMEM> error.
2173  */
2174 static int
nextchar(ESL_SQFILE * sqfp,char * ret_c)2175 nextchar(ESL_SQFILE *sqfp, char *ret_c)
2176 {
2177   int status;
2178 
2179   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2180 
2181   ascii->bpos++;
2182   if (ascii->nc == ascii->bpos && (status = loadbuf(sqfp)) != eslOK) return status;
2183   *ret_c = ascii->buf[ascii->bpos];
2184   return eslOK;
2185 }
2186 
2187 /* seebuf()
2188  *
2189  * Examine and validate the current buffer <sqfp->buf> from its
2190  * current position <sqfp->bpos> until either the buffer ends (we run
2191  * out of characters) or the sequence data ends (we see whatever
2192  * character indicates EOD in this format) or we've seen <maxn>
2193  * residues. If <maxn> is passed as -1, parse the entire buffer,
2194  * without a residue limit.
2195  *
2196  * There are three possible outcomes:
2197  *   <eslOK>:      The buffer is all residues that belong to the current
2198  *                 seq we're parsing (or chars we can ignore), at least
2199  *                 up to the <maxn> residue limit (if present).
2200  *   <eslEOD>:     Part of the buffer may be residues, but the current sequence
2201  *                 ends in this buffer (before <maxn> was reached).
2202  *   <eslEFORMAT>: Somewhere before we reached the end of the buffer or
2203  *                 the sequence record, we saw an illegal character.
2204  *
2205  * On <eslOK>:
2206  *    *opt_nres    is the number of residues in the buffer (up to <maxn>)
2207  *    *opt_endpos  is sqfp->nc (off the end of the buffer by one)
2208  *    The caller will want to deal with the buffer, then load the next one.
2209  *
2210  * On <eslEOD>: same as OK, except:
2211  *    *opt_endpos  is where sqfp->bpos *would* be at when we saw the EOD
2212  *                 signal (the next '>', in FASTA files) had we been parsing residues
2213  *    Therefore on EOD, the caller will want to deal with the <*opt_nres>
2214  *    residues in this buffer, then reposition the buffer by
2215  *    <sqfp->bpos = *opt_epos> (without reloading the buffer), so
2216  *    the next read will pick up there.
2217  *
2218  * On <eslEFORMAT>:
2219  *    ascii->errbuf  contains informative message about the format error.
2220  *
2221  * seebuf() also handles linenumber and SSI bookkeeping in
2222  * <sqfp>. Every newline character seen increments <linenumber> (thus,
2223  * on EFORMAT return, linenumber is set to the line on which the bad
2224  * char occurred). <curbpl>,<currpl>,<prvbpl>,<prvrpl> keep track of # of bytes,
2225  * residues on the current,prev line; they keep state across calls to seebuf().
2226  * <bpl>,<rpl> are tracking whether there's a constant number of
2227  * bytes/residues per line; these are either -1 for "not set yet", 0
2228  * for "no, not constant", or a number > 0. Because of this bookkeeping, it's important
2229  * to make sure that <seebuf()> never counts the same byte twice (hence
2230  * the need for the <maxn> limit, which ReadWindow() uses.)
2231  */
2232 static int
seebuf(ESL_SQFILE * sqfp,int64_t maxn,int64_t * opt_nres,int64_t * opt_endpos)2233 seebuf(ESL_SQFILE *sqfp, int64_t maxn, int64_t *opt_nres, int64_t *opt_endpos)
2234 {
2235   int     bpos;
2236   int64_t nres  = 0;
2237   int64_t nres2 = 0;/* an optimization for determining lastrpl from nres, without incrementing lastrpl on every char */
2238   int     sym;
2239   ESL_DSQ x;
2240   int     lasteol;
2241   int     status  = eslOK;
2242 
2243   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2244 
2245   lasteol = ascii->bpos - 1;
2246   if (maxn == -1) maxn = ascii->nc; /* makes for a more efficient test. nc is a guaranteed upper bound on nres */
2247 
2248   for (bpos = ascii->bpos; nres < maxn && bpos < ascii->nc; bpos++)
2249   {
2250       sym = ascii->buf[bpos];
2251       //printf ("nres: %d, bpos: %d  (%d)\n", nres, bpos, sym);
2252       if (!isascii(sym)) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": non-ASCII character %c in sequence", ascii->linenumber, sym);
2253       x   = sqfp->inmap[sym];
2254 
2255       if      (x <= 127) nres++;
2256       else if (x == eslDSQ_EOL)
2257       {
2258          if (ascii->curbpl != -1) ascii->curbpl += bpos - lasteol;
2259          if (ascii->currpl != -1) ascii->currpl += nres - nres2;
2260          nres2        += nres - nres2;
2261 
2262          if (ascii->rpl != 0 && ascii->prvrpl != -1) { /* need to ignore counts on last line in record, hence cur/prv */
2263            if      (ascii->rpl    == -1)        ascii->rpl = ascii->prvrpl; /* init */
2264            else if (ascii->prvrpl != ascii->rpl) ascii->rpl = 0;           /* inval*/
2265          }
2266          if (ascii->bpl != 0 && ascii->prvbpl != -1) {
2267            if      (ascii->bpl    == -1)        ascii->bpl = ascii->prvbpl; /* init  */
2268            else if (ascii->prvbpl != ascii->bpl) ascii->bpl = 0;            /* inval */
2269          }
2270 
2271          ascii->prvbpl  = ascii->curbpl;
2272          ascii->prvrpl  = ascii->currpl;
2273          ascii->curbpl  = 0;
2274          ascii->currpl  = 0;
2275          lasteol       = bpos;
2276          if (ascii->linenumber != -1) ascii->linenumber++;
2277     }
2278     else if (x == eslDSQ_ILLEGAL) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": illegal character %c", ascii->linenumber, sym);
2279     else if (x == eslDSQ_EOD)     { status = eslEOD; break; }
2280     else if (x != eslDSQ_IGNORED) ESL_FAIL(eslEFORMAT, ascii->errbuf, "inmap corruption?");
2281   }
2282 
2283   if (ascii->curbpl != -1) ascii->curbpl += bpos - lasteol - 1;
2284   if (ascii->currpl != -1) ascii->currpl += nres - nres2;
2285   if (opt_nres   != NULL) *opt_nres   = nres;
2286   if (opt_endpos != NULL) *opt_endpos = bpos;
2287   return status;
2288 }
2289 
2290 /* addbuf()
2291  * Add <nres> residues from the current buffer <sqfp->buf> to <sq>.
2292  * This is designed to work when we're constructing a complete
2293  * sequence (add the whole buffer); when we're adding a suffix
2294  * of the buffer (<sqfp->bpos> is skipped ahead already);
2295  * or when we're adding a prefix of the buffer (terminating a subseq
2296  * or window load).
2297  *
2298  * The caller must know that there are at least <nres> residues in
2299  * this buffer, and that all the characters are valid in the
2300  * format and alphabet, via a previous call to <seebuf()>.
2301  *
2302  * The caller also must have already allocated <sq> to hold at least
2303  * <nres> more residues.
2304  *
2305  * On input:
2306  *   sqfp->buf[]  contains an fread() buffer
2307  *   sqfp->bpos   is set to where we're going to start parsing residues
2308  *   sqfp->nc     is the length of <buf>
2309  *
2310  * On return:
2311  *   sqfp->buf[]  still contains the same buffer (no new freads here)
2312  *   sqfp->bpos   is set after the last residue we parsed
2313  *   sq->seq/dsq  now holds <nres> new residues
2314  *   sq->n        is incremented by <nres>
2315  */
2316 static void
addbuf(ESL_SQFILE * sqfp,ESL_SQ * sq,int64_t nres)2317 addbuf(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nres)
2318 {
2319   ESL_DSQ x;
2320   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2321 
2322   if (sq->dsq != NULL)
2323     {
2324       while (nres) {
2325         x  = sq->abc->inmap[(int) ascii->buf[ascii->bpos++]];
2326         if (x <= 127) { nres--; sq->dsq[++sq->n] = x; }
2327       } /* we skipped IGNORED, EOL. EOD, ILLEGAL don't occur; seebuf() already checked  */
2328     }
2329   else
2330     {
2331       while (nres) {
2332         x   = sqfp->inmap[(int) ascii->buf[ascii->bpos++]];
2333         if (x <= 127) { nres--; sq->seq[sq->n++] = x; }
2334       }
2335     }
2336 }
2337 
2338 /* skipbuf()
2339  * Like addbuf(), but we skip <nskip> residues instead of
2340  * reading them.
2341  */
2342 static void
skipbuf(ESL_SQFILE * sqfp,int64_t nskip)2343 skipbuf(ESL_SQFILE *sqfp, int64_t nskip)
2344 {
2345   ESL_DSQ x;
2346   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2347 
2348   while (nskip) {
2349     x  = sqfp->inmap[(int) ascii->buf[ascii->bpos++]];
2350     if (x <= 127) nskip--;/* skip IGNORED, EOL. */
2351   }
2352 }
2353 
2354 
2355 /* skip_whitespace()
2356  * Like skipbuf(), but instead of skipping a fixed number of
2357  * residues, skip forward until one of three conditions is met:
2358  *
2359  * (1) end of the sequence record (a character indicating
2360  *     the beginning of a new sequence); set ascii->bpos
2361  *     to the beginning of the new record, and return eslEOD;
2362  * (2) a non-whitespace character in the current sequence is
2363  *     reached that does not indicate the end of a sequence
2364  *     record; set ascii->bpos to that character's position,
2365  *     and return eslOK;
2366  * (3) end of file;  return eslEOF.
2367  *
2368  */
2369 static int
skip_whitespace(ESL_SQFILE * sqfp)2370 skip_whitespace(ESL_SQFILE *sqfp)
2371 {
2372   int status;
2373   int c;
2374   ESL_DSQ x;
2375   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2376 
2377   if (ascii->nc == 0)
2378     return eslEOF;
2379 
2380   c = (int) ascii->buf[ascii->bpos];
2381   x  = sqfp->inmap[c];
2382 
2383   while ( isspace(c) ) {
2384 
2385     ascii->bpos++;
2386 
2387     if (ascii->bpos == ascii->nc)
2388       if ((status = loadbuf(sqfp)) == eslEOF)
2389         return eslEOF;
2390 
2391     c = (int) ascii->buf[ascii->bpos];
2392     x  = sqfp->inmap[c];
2393   }
2394   if (x == eslDSQ_EOD)
2395     return eslEOD;
2396 
2397   return eslOK;
2398 }
2399 
2400 
2401 
2402 /* read_nres()
2403  * Read the next <nres> residues from <sqfp> after skipping <nskip> residues, then stop.
2404  *
2405  * Returns <eslOK> and <0 < *ret_actual_nres <= nres> if it succeeded, and
2406  *                 there's more residues in the current seq record.
2407  * Returns <eslEOD> and <*ret_actual_nres == 0> if no more residues are
2408  *                 seen in the sequence record.
2409  *
2410  * Even on <eslEOD>, the <dsq/seq> is appropriately terminated here,
2411  * and <sq->n> is left the way it was (no new residues added - but there
2412  * may have been saved context C from a previous window).
2413  *
2414  * Returns <eslEFORMAT> on any parsing problem, and <ascii->errbuf> is set.
2415  *
2416  * On <eslOK>, sqfp->bpos is positioned on the next character past the last residue we store;
2417  * on <eslEOD>, sqfp->bpos is positioned for reading the next sequence.
2418  *
2419  * FetchSubseq() uses this with <nskip>, <nres>, and expects an
2420  * <eslOK> with <*opt_actual_nres = nres>. On <EOD>, or if fewer than
2421  * <nres> residues are obtained, the coords must've been screwed up,
2422  * because we didn't read the whole subseq we asked for.
2423  *
2424  * ReadWindow() on forward strand uses this with <nskip=0>, <nres=W>.
2425  * The last window might normally return <eslEOD> with
2426  * <*ret_actual_nres == 0>, and now <sqfp->bpos> is positioned at the
2427  * start of the next sequence on <EOD>, and at the next residue on
2428  * <OK>.
2429  *
2430  * ReadWindow() in reverse complement acts like a subseq fetch.
2431  *
2432  */
2433 static int
read_nres(ESL_SQFILE * sqfp,ESL_SQ * sq,int64_t nskip,int64_t nres,int64_t * opt_actual_nres)2434 read_nres(ESL_SQFILE *sqfp, ESL_SQ *sq, int64_t nskip, int64_t nres, int64_t *opt_actual_nres)
2435 {
2436   int64_t n;
2437   int64_t epos;
2438   int64_t actual_nres = 0;
2439   int     status      = eslOK;
2440 
2441   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2442   status = seebuf(sqfp, nskip+nres, &n, &epos);
2443   while (status == eslOK && nskip - n > 0) {
2444     nskip   -= n;
2445     if ((status = loadbuf(sqfp)) == eslEOF) break;
2446     status = seebuf(sqfp, nskip+nres, &n, &epos);
2447   }
2448 
2449   if         (status == eslEOF) {
2450     if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF before end of seq record");
2451     if (nskip > 0)         ESL_EXCEPTION(eslECORRUPT, "premature EOD while trying to skip residues");
2452     n = 0;
2453   } else if  (status == eslEOD) {
2454     if (n < nskip)         ESL_EXCEPTION(eslECORRUPT, "premature EOD while trying to skip residues");
2455   } else if  (status != eslOK)
2456     return status;
2457 
2458   skipbuf(sqfp, nskip);
2459   n -= nskip;
2460 
2461   while (status == eslOK && nres - n > 0)
2462     {
2463       addbuf(sqfp, sq, n);
2464       actual_nres += n;
2465       nres        -= n;
2466       if ((status = loadbuf(sqfp)) == eslEOF) break;
2467       status = seebuf(sqfp, nres, &n, &epos);
2468     }
2469 
2470 
2471   if        (status == eslEOF) {
2472     if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF before end of seq record");
2473     n = 0;
2474   } else if  (status == eslEFORMAT) {
2475     return status;
2476   }
2477 
2478   n = ESL_MIN(nres, n);
2479   addbuf(sqfp, sq, n);   /* bpos now at last residue + 1 if OK/EOD, 0 if EOF  */
2480   actual_nres += n;
2481 
2482   if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
2483   else                 sq->seq[sq->n]   = '\0';
2484 
2485   if (status == eslEOD) {
2486     ascii->bpos = epos;
2487   }
2488 
2489   if (opt_actual_nres != NULL) *opt_actual_nres = actual_nres;
2490   return (actual_nres == 0 ? eslEOD : eslOK);
2491 }
2492 /*--------------- end, buffer-based parsers --------------------*/
2493 
2494 
2495 /*****************************************************************
2496  *#  7. Internal routines for EMBL format (including UniProt, TrEMBL)
2497  *****************************************************************/
2498 /* EMBL and UniProt protein sequence database format.
2499  *   See: http://us.expasy.org/sprot/userman.html
2500  *   and: http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3
2501  * We use the same parser for both formats, so we have to be
2502  * careful to only parse the conserved intersection of these two
2503  * very similar formats.
2504  */
2505 static void
config_embl(ESL_SQFILE * sqfp)2506 config_embl(ESL_SQFILE *sqfp)
2507 {
2508   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2509 
2510   ascii->is_linebased      = TRUE;
2511   ascii->eof_is_ok         = FALSE;/* records end with // */
2512   ascii->parse_header      = &header_embl;
2513   ascii->skip_header       = &skip_embl;
2514   ascii->parse_end         = &end_embl;
2515 }
2516 
2517 static void
inmap_embl(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)2518 inmap_embl(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
2519 {
2520   int x;
2521 
2522   if (abc_inmap != NULL) {
2523     for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
2524     sqfp->inmap['-']  = eslDSQ_ILLEGAL;                      // don't let the abc_inmap map the gap char; this is an ungapped file format
2525   } else {
2526     for (x =  0;  x < 128;  x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2527     for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
2528     for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
2529   }
2530   for (x = '0'; x <= '9'; x++)
2531     sqfp->inmap[x] = eslDSQ_IGNORED;    /* EMBL DNA sequence format puts coordinates after each line */
2532   sqfp->inmap['*']  = '*';              /* accept * as a nonresidue/stop codon character */
2533   sqfp->inmap[' ']  = eslDSQ_IGNORED;
2534   sqfp->inmap['\t'] = eslDSQ_IGNORED;
2535   sqfp->inmap['\n'] = eslDSQ_IGNORED;
2536   sqfp->inmap['\r'] = eslDSQ_IGNORED;  /* DOS eol compatibility */
2537   sqfp->inmap['/']  = eslDSQ_EOD;
2538 }
2539 
2540 /* header_embl()
2541  *
2542  * See: http://us.expasy.org/sprot/userman.html
2543  * And: http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3
2544  * Our parser must work on the highest common denominator of EMBL DNA
2545  * and UniProt protein sequence files.
2546  *
2547  * sqfp->buf is the first (ID) line of the entry, or a blank line before
2548  * it (in which case we'll scan forwards skipping blank lines to find
2549  * the ID line).
2550  *
2551  * On success, returns <eslOK> and:
2552  *   sq->name  contains sequence name (and may have been reallocated, changing sq->nalloc)
2553  *   sq->acc   contains seq accession (and may have been reallocated, changing sq->aalloc)
2554  *   sq->desc  contains description line (and may have been reallocated, changing sq->dalloc)
2555  *   sq->roff  has been set to the record offset
2556  *   sq->doff  has been set to the data offset (start of sequence line)
2557  *   sqfp->buf is the first seq line.
2558  *
2559  * If no more seqs are found in the file, returns <eslEOF>.
2560  * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2561  *
2562  * May also throw <eslEMEM> on allocation errors.
2563  */
2564 static int
header_embl(ESL_SQFILE * sqfp,ESL_SQ * sq)2565 header_embl(ESL_SQFILE *sqfp, ESL_SQ *sq)
2566 {
2567   char *s;
2568   char *tok;
2569   int   status;
2570 
2571   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2572 
2573   /* Find first line:
2574    * "Each entry must begin with an identification line (ID)..."
2575    * "The two-character line-type code that begins each line is always
2576    *  followed by three blanks..."
2577    */
2578   if (ascii->nc == 0) return eslEOF;
2579   while (esl_str_IsBlank(ascii->buf)) {
2580     if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */
2581     else if (status != eslOK) return status; /* abnormal */
2582   }
2583 
2584   /* ID line is defined as:
2585    *     ID   ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH.
2586    * We're only after the ENTRY_NAME.
2587    * Examples:
2588    *  ID   SNRPA_DROME    STANDARD;      PRT;   216 AA.
2589    *  ID   SNRPA_DROME             Reviewed;         216 AA.
2590    *  ID   X06347; SV 1; linear; mRNA; STD; HUM; 1209 BP.
2591    */
2592   if (strncmp(ascii->buf, "ID   ", 5) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find ID line", ascii->linenumber);
2593 
2594   s = ascii->buf+5;
2595   if ((status = esl_strtok(&s, " ;", &tok)) != eslOK)
2596     ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse name on ID line", ascii->linenumber);
2597   if ((status = esl_sq_SetName(sq, tok)) != eslOK) return status;
2598   sq->roff = ascii->boff;/* record the offset of the ID line */
2599 
2600   /* Look for SQ line; parsing optional info as we go.
2601    */
2602   do {
2603     if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find SQ line", ascii->linenumber);
2604 
2605     /* "The format of the AC line is:
2606      *    AC   AC_number_1;[ AC_number_2;]...[ AC_number_N;]
2607      *  Researchers who wish to cite entries in their publications
2608      *  should always cite the first accession number. This is
2609      *  commonly referred to as the 'primary accession
2610      *  number'."
2611      *
2612      *  Examples:
2613      *   AC   P43332; Q9W4D7;
2614      *   AC   X06347;
2615      *
2616      *  Note that Easel only stores primary accessions.
2617      *  Because there can be more than one accession line, we check to
2618      *  see if the accession is already set before storing a line.
2619      */
2620     if (strncmp(ascii->buf, "AC   ", 5) == 0 && sq->acc[0] == '\0')
2621     {
2622       s = ascii->buf+5;
2623       if ((status = esl_strtok(&s, ";", &tok)) != eslOK)
2624         ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse accession on AC line", ascii->linenumber);
2625       if ((status = esl_sq_SetAccession(sq, tok)) != eslOK) return status;
2626     }
2627 
2628     /* "The format of the DE line is:
2629      *    DE   Description.
2630      * ...In cases where more than one DE line is required, the text is
2631      * only divided between words and only the last DE line is
2632      * terminated by a period."
2633      *
2634      * Examples:
2635      *   DE   U1 small nuclear ribonucleoprotein A (U1 snRNP protein A) (U1-A) (Sex
2636      *   DE   determination protein snf).
2637      *
2638      *   DE   Human mRNA for U1 small nuclear RNP-specific A protein
2639      *
2640      *   DE   RecName: Full=U1 small nuclear ribonucleoprotein A;
2641      *   DE            Short=U1 snRNP protein A;
2642      *   DE            Short=U1-A;
2643      *   DE   AltName: Full=Sex determination protein snf;
2644      *
2645      * We'll make no attempt to parse the structured UniProt description header,
2646      * for the moment.
2647      */
2648     if (strncmp(ascii->buf, "DE   ", 5) == 0)
2649     {
2650       s = ascii->buf+5;
2651       esl_strchop(s, ascii->nc-5);
2652       if ((status = esl_sq_AppendDesc(sq, s)) != eslOK)
2653         ESL_FAIL(status, ascii->errbuf, "Line %" PRId64 ": failed to parse description on DE line", ascii->linenumber);
2654     }
2655 
2656     /* UniProt: "The format of the SQ line is:
2657      *  SQ   SEQUENCE XXXX AA; XXXXX MW; XXXXXXXXXXXXXXXX CRC64;"
2658      * EMBL:    "The SQ (SeQuence header) line marks the beginning of
2659      *           the sequence data and Gives a summary of its content.
2660      *           An example is:
2661      *  SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;"
2662      *
2663      * We don't parse this line; we just look for it as the last line
2664      * before the sequence starts.
2665      */
2666   } while (strncmp(ascii->buf, "SQ   ", 5) != 0);
2667 
2668   if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2669   sq->hoff = ascii->boff - 1;
2670   sq->doff = ascii->boff;
2671   return eslOK;
2672 }
2673 
2674 /* skip_embl()
2675  *
2676  * Skip past the EMBL header and position to start of the sequence line.
2677  *
2678  * On success, returns <eslOK> and:
2679  *   sq->roff  has been set to the record offset
2680  *   sq->doff  has been set to the data offset (start of sequence line)
2681  *   sqfp->buf is the first seq line.
2682  *
2683  * If no more seqs are found in the file, returns <eslEOF>.
2684  * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2685  *
2686  * May also throw <eslEMEM> on allocation errors.
2687  */
2688 static int
skip_embl(ESL_SQFILE * sqfp,ESL_SQ * sq)2689 skip_embl(ESL_SQFILE *sqfp, ESL_SQ *sq)
2690 {
2691   int   status;
2692 
2693   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2694 
2695   /* Find first line:
2696    * "Each entry must begin with an identification line (ID)..."
2697    * "The two-character line-type code that begins each line is always
2698    *  followed by three blanks..."
2699    */
2700   if (ascii->nc == 0) return eslEOF;
2701   while (esl_str_IsBlank(ascii->buf)) {
2702     if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal */
2703     else if (status != eslOK) return status; /* abnormal */
2704   }
2705 
2706   /* ID line */
2707   if (strncmp(ascii->buf, "ID   ", 5) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find ID line", ascii->linenumber);
2708 
2709   sq->roff = ascii->boff;/* record the offset of the ID line */
2710 
2711   /* zero out the name, accession and description */
2712   sq->name[0] = '\0';
2713   sq->acc[0]  = '\0';
2714   sq->desc[0] = '\0';
2715 
2716   /* Look for SQ line; parsing optional info as we go. */
2717   do {
2718     if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to find SQ line", ascii->linenumber);
2719   } while (strncmp(ascii->buf, "SQ   ", 5) != 0);
2720 
2721   if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2722   sq->hoff = ascii->boff - 1;
2723   sq->doff = ascii->boff;
2724   return eslOK;
2725 }
2726 
2727 static int
end_embl(ESL_SQFILE * sqfp,ESL_SQ * sq)2728 end_embl(ESL_SQFILE *sqfp, ESL_SQ *sq)
2729 {
2730   int status;
2731 
2732   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2733 
2734   if (strncmp(ascii->buf, "//", 2) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
2735   sq->eoff = ascii->boff + ascii->nc - 1;
2736   status = loadbuf(sqfp);
2737   if      (status == eslEOF) return eslOK; /* ok, actually. */
2738   else if (status == eslOK)  return eslOK;
2739   else                       return status;
2740 }
2741 
2742 /*---------------------- EMBL format ---------------------------------*/
2743 
2744 
2745 
2746 /*****************************************************************
2747  *#  8. Internal routines for GenBank format
2748  *****************************************************************/
2749 /* NCBI GenBank sequence database format.
2750  * See GenBank release notes; for example,
2751  * ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt
2752  */
2753 
2754 static void
config_genbank(ESL_SQFILE * sqfp)2755 config_genbank(ESL_SQFILE *sqfp)
2756 {
2757   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2758 
2759   ascii->is_linebased      = TRUE;
2760   ascii->eof_is_ok         = FALSE;/* records end with //  */
2761   ascii->parse_header      = &header_genbank;
2762   ascii->skip_header       = &skip_genbank;
2763   ascii->parse_end         = &end_genbank;
2764 }
2765 
2766 static void
inmap_genbank(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)2767 inmap_genbank(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
2768 {
2769   int x;
2770 
2771   if (abc_inmap != NULL) {
2772     for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
2773     sqfp->inmap['-']  = eslDSQ_ILLEGAL;                      // don't let the abc_inmap map the gap char; this is an ungapped file format
2774   } else {
2775     for (x =  0;  x < 128;  x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2776     for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
2777     for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
2778   }
2779   for (x = '0'; x <= '9'; x++)
2780     sqfp->inmap[x] = eslDSQ_IGNORED;
2781   sqfp->inmap['*']  = '*';         /* accept * as a nonresidue/stop codon character */
2782   sqfp->inmap[' ']  = eslDSQ_IGNORED;
2783   sqfp->inmap['\t'] = eslDSQ_IGNORED;
2784   sqfp->inmap['\n'] = eslDSQ_IGNORED;
2785   sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */
2786   sqfp->inmap['/']  = eslDSQ_EOD;
2787 }
2788 
2789 /* header_genbank()
2790  *
2791  * sqfp->buf is the first (LOCUS) line of the entry, or a line before
2792  * it (in which case we'll scan forwards to find the LOCUS line - even
2793  * skipping non-blank lines, because there are sometimes headers at
2794  * the start of GenBank files).
2795  *
2796  * On success, returns <eslOK> and:
2797  *   sq->name  contains sequence name (and may have been reallocated, changing sq->nalloc)
2798  *   sq->acc   contains seq accession (and may have been reallocated, changing sq->aalloc)
2799  *   sq->desc  contains description line (and may have been reallocated, changing sq->dalloc)
2800  *   sq->roff  has been set to the record offset
2801  *   sq->doff  has been set to the data offset (start of sequence line)
2802  *   sqfp->buf is the first seq line.
2803  *
2804  * If no more seqs are found in the file, returns <eslEOF>.
2805  * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2806  *
2807  * May also throw <eslEMEM> on allocation errors.
2808  */
2809 static int
header_genbank(ESL_SQFILE * sqfp,ESL_SQ * sq)2810 header_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq)
2811 {
2812   char *s;
2813   char *tok;
2814   int   status;
2815 
2816   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2817 
2818   /* Find LOCUS line, allowing for ignoration of a file header.  */
2819   if (ascii->nc == 0) return eslEOF;
2820   while (strncmp(ascii->buf, "LOCUS   ", 8) != 0) {
2821     if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal   */
2822     else if (status != eslOK) return status;                /* abnormal */
2823   }
2824 
2825   s = ascii->buf+12;
2826   if ((status = esl_strtok(&s, " ", &tok)) != eslOK)
2827     ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse name on LOCUS line", ascii->linenumber);
2828   if ((status = esl_sq_SetName(sq, tok)) != eslOK) return status;
2829   sq->roff = ascii->boff;/* record the disk offset to the LOCUS line */
2830 
2831   /* Look for ORIGIN line, parsing optional info as we go. */
2832   do {
2833     if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find ORIGIN line");
2834 
2835     /* Optional VERSION line is parsed as "accession". */
2836     if (strncmp(ascii->buf, "VERSION   ", 10) == 0)
2837     {
2838       s = ascii->buf+12;
2839       if ((status = esl_strtok(&s, " \t\n", &tok)) != eslOK)
2840         ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": failed to parse VERSION line", ascii->linenumber);
2841       if ((status = esl_sq_SetAccession(sq, tok)) != eslOK) return status;
2842     }
2843 
2844     /* Optional DEFINITION Line is parsed as "description". */
2845     if (strncmp(ascii->buf, "DEFINITION ", 11) == 0)
2846     {
2847       s = ascii->buf+12;
2848       esl_strchop(s, ascii->nc-12);
2849       if ((status = esl_sq_AppendDesc(sq, s)) != eslOK)
2850         ESL_FAIL(status, ascii->errbuf, "Line %" PRId64 ": failed to parse desc on DEFINITION line", ascii->linenumber);
2851     }
2852   } while (strncmp(ascii->buf, "ORIGIN", 6) != 0);
2853 
2854   if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2855   sq->hoff = ascii->boff - 1;
2856   sq->doff = ascii->boff;
2857   return eslOK;
2858 }
2859 
2860 /* skip_genbank()
2861  *
2862  * Skip past the GenBank header and position to start of the sequence line.
2863  *
2864  * On success, returns <eslOK> and:
2865  *   sq->roff  has been set to the record offset
2866  *   sq->doff  has been set to the data offset (start of sequence line)
2867  *   sqfp->buf is the first seq line.
2868  *
2869  * If no more seqs are found in the file, returns <eslEOF>.
2870  * On parse failure, returns <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2871  *
2872  * May also throw <eslEMEM> on allocation errors.
2873  */
2874 static int
skip_genbank(ESL_SQFILE * sqfp,ESL_SQ * sq)2875 skip_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq)
2876 {
2877   int   status;
2878 
2879   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2880 
2881   /* Find LOCUS line, allowing for ignoration of a file header.  */
2882   if (ascii->nc == 0) return eslEOF;
2883   while (strncmp(ascii->buf, "LOCUS   ", 8) != 0) {
2884     if ((status = loadbuf(sqfp)) == eslEOF) return eslEOF; /* normal   */
2885     else if (status != eslOK) return status;               /* abnormal */
2886   }
2887 
2888   sq->roff = ascii->boff;/* record the disk offset to the LOCUS line */
2889 
2890   /* zero out the name, accession and description */
2891   sq->name[0] = '\0';
2892   sq->acc[0]  = '\0';
2893   sq->desc[0] = '\0';
2894 
2895   /* Look for ORIGIN line, parsing optional info as we go. */
2896   do {
2897     if ((status = loadbuf(sqfp)) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find ORIGIN line");
2898   } while (strncmp(ascii->buf, "ORIGIN", 6) != 0);
2899 
2900   if (loadbuf(sqfp) != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Failed to find any sequence");
2901   sq->hoff = ascii->boff - 1;
2902   sq->doff = ascii->boff;
2903   return eslOK;
2904 }
2905 
2906 static int
end_genbank(ESL_SQFILE * sqfp,ESL_SQ * sq)2907 end_genbank(ESL_SQFILE *sqfp, ESL_SQ *sq)
2908 {
2909   int status;
2910 
2911   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2912 
2913   if (strncmp(ascii->buf, "//", 2) != 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
2914   sq->eoff = ascii->boff + ascii->nc - 1;
2915   status = loadbuf(sqfp);
2916   if      (status == eslEOF) return eslOK; /* ok, actually; we'll detect EOF on next sq read */
2917   else if (status == eslOK)  return eslOK;
2918   else                       return status;
2919 }
2920 /*----------------- end GenBank format -------------------------------*/
2921 
2922 
2923 
2924 /*****************************************************************
2925  *#  9. Internal routines for FASTA format
2926  *****************************************************************/
2927 
2928 static void
config_fasta(ESL_SQFILE * sqfp)2929 config_fasta(ESL_SQFILE *sqfp)
2930 {
2931   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2932 
2933   ascii->is_linebased = FALSE;
2934   ascii->eof_is_ok    = TRUE;
2935   ascii->parse_header = &header_fasta;
2936   ascii->skip_header  = &skip_fasta;
2937   ascii->parse_end    = &end_fasta;
2938 }
2939 
2940 static void
inmap_fasta(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)2941 inmap_fasta(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
2942 {
2943   int x;
2944 
2945   if (abc_inmap != NULL) {
2946     for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
2947     sqfp->inmap['-']  = eslDSQ_ILLEGAL;                      // don't let the abc_inmap map the gap char; this is an ungapped file format
2948   } else {
2949     for (x =  0;  x < 128;  x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2950     for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
2951     for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
2952   }
2953   sqfp->inmap['*']  = '*';         /* accept * as a nonresidue/stop codon character */
2954   sqfp->inmap[' ']  = eslDSQ_IGNORED;
2955   sqfp->inmap['\t'] = eslDSQ_IGNORED;
2956   sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */
2957   sqfp->inmap['\n'] = eslDSQ_EOL;
2958   sqfp->inmap['>']  = eslDSQ_EOD;
2959   /* \n is special - fasta reader detects it as an eol */
2960 }
2961 
2962 
2963 /* header_fasta()
2964  *
2965  * sqfp->buf[sqfp->bpos] is sitting at the start of a FASTA record, or
2966  * at a space before it (in which case we'll advance, skipping whitespace,
2967  * until a > is reached).
2968  * Parse the header line, storing name and description in <sq>.
2969  *
2970  * On success, returns <eslOK> and:
2971  *    sq->name contains sequence name (and may have been reallocated, changing sq->nalloc)
2972  *    sq->desc contains description line (and may have been reallocated, changing sq->dalloc)
2973  *    sq->roff has been set to the record offset
2974  *    sq->doff has been set to the data offset (start of sequence line)
2975  *    sqfp->buf[sqfp->bpos] is sitting at the start of the seq line.
2976  *    sqfp->currpl,curbpl set to 0, to start bookkeeping data line lengths
2977  *
2978  * If no more seqs are found in the file, returns <eslEOF>.
2979  * On parse failure, return <eslEFORMAT>, leaves as mesg in ascii->errbuf.
2980  *
2981  * May also throw <eslEMEM> on allocation errors.
2982  */
2983 static int
header_fasta(ESL_SQFILE * sqfp,ESL_SQ * sq)2984 header_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq)
2985 {
2986   char  c;
2987   int   status = eslOK;
2988   void *tmp;
2989   int   pos;
2990 
2991   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
2992 
2993   /* make sure there are characters in the buffer */
2994   if (ascii->nc == ascii->bpos && (status = loadbuf(sqfp)) != eslOK) return status;
2995 
2996   c =  ascii->buf[ascii->bpos];
2997   while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n) */
2998 
2999   if (status == eslEOF) return eslEOF;
3000 
3001   if (status == eslOK && c == '>') {    /* accept the > */
3002     sq->roff = ascii->boff + ascii->bpos; /* store SSI record offset */
3003     status = nextchar(sqfp, &c);
3004   } else if (c != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": unexpected char %c; expected FASTA to start with >", ascii->linenumber, c);
3005 
3006   while (status == eslOK && (c == '\t' || c == ' ')) status = nextchar(sqfp, &c); /* skip space */
3007 
3008   /* Store the name (space delimited) */
3009   pos = 0;
3010   while (status == eslOK && ! isspace(c))
3011   {
3012       sq->name[pos++] = c;
3013       if (pos == sq->nalloc-1) { ESL_RALLOC(sq->name, tmp, sq->nalloc*2); sq->nalloc*=2; }
3014       status = nextchar(sqfp, &c);
3015   }
3016   if (pos == 0) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": no FASTA name found", ascii->linenumber);
3017   sq->name[pos] = '\0';
3018 
3019   while (status == eslOK &&  (c == '\t' || c == ' ')) status = nextchar(sqfp, &c);   /* skip space */
3020 
3021   /* Store the description (end-of-line delimited) */
3022   /* Patched to deal with NCBI NR desclines: delimit by ctrl-A (0x01) too. [SRE:H1/82] */
3023   pos = 0;
3024   while (status == eslOK && c != '\n' && c != '\r' && c != 1)
3025   {
3026       sq->desc[pos++] = c;
3027       if (pos == sq->dalloc-1) { ESL_RALLOC(sq->desc, tmp, sq->dalloc*2); sq->dalloc*= 2; }
3028       status = nextchar(sqfp, &c);
3029   }
3030   sq->desc[pos] = '\0';
3031 
3032   /* Because of the NCBI NR patch, c might be0x01 ctrl-A now; skip to eol.
3033    * (TODO: I'm worried about the efficiency of this nextchar() stuff. Revisit.)
3034    */
3035   while (status == eslOK && c != '\n' && c != '\r')
3036     status = nextchar(sqfp, &c);
3037   sq->hoff = ascii->boff + ascii->bpos;
3038 
3039   while (status == eslOK && (c == '\n' || c == '\r')) status = nextchar(sqfp, &c); /* skip past eol (DOS \r\n, MAC \r, UNIX \n */
3040   if (status != eslOK && status != eslEOF) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected failure in parsing FASTA name/description line");
3041   /* Edge case: if the last sequence in the file is L=0, no residues, we are EOF now, not OK; but we'll return OK because we parsed the header line */
3042 
3043   sq->doff = ascii->boff + ascii->bpos;
3044   ascii->prvrpl = ascii->prvbpl = -1;
3045   ascii->currpl = ascii->curbpl = 0;
3046   ascii->linenumber++;
3047   return eslOK;
3048 
3049  ERROR:
3050   return status;/* eslEMEM, from failed realloc */
3051 }
3052 
3053 /* skip_fasta()
3054  *
3055  * Skip past the fasta header and position to start of the sequence line.
3056  *
3057  * On success, returns <eslOK> and:
3058  *    sq->roff has been set to the record offset
3059  *    sq->doff has been set to the data offset (start of sequence line)
3060  *    sqfp->buf[sqfp->bpos] is sitting at the start of the seq line.
3061  *    sqfp->currpl,curbpl set to 0, to start bookkeeping data line lengths
3062  *
3063  * If no more seqs are found in the file, returns <eslEOF>.
3064  * On parse failure, return <eslEFORMAT>, leaves as mesg in ascii->errbuf.
3065  *
3066  * May also throw <eslEMEM> on allocation errors.
3067  */
3068 static int
skip_fasta(ESL_SQFILE * sqfp,ESL_SQ * sq)3069 skip_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq)
3070 {
3071   char  c;
3072   int   status = eslOK;
3073 
3074   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3075 
3076   c =  ascii->buf[ascii->bpos];
3077   while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n) */
3078 
3079   if (status == eslEOF) return eslEOF;
3080   if (status != eslOK)  ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected parsing error %d", status);
3081   if (c != '>')         ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": unexpected char %c; expecting '>'", ascii->linenumber, c);
3082 
3083   sq->roff = ascii->boff + ascii->bpos; /* store SSI record offset */
3084 
3085   /* zero out the name, accession and description */
3086   sq->name[0] = '\0';
3087   sq->acc[0]  = '\0';
3088   sq->desc[0] = '\0';
3089 
3090   status = nextchar(sqfp, &c);
3091 
3092   /* skip to end of line */
3093   while (status == eslOK && c != '\n' && c != '\r') status = nextchar(sqfp, &c);
3094   sq->doff = ascii->boff + ascii->bpos;
3095 
3096   /* skip past end of line */
3097   while (status == eslOK && (c == '\n' || c == '\r')) status = nextchar(sqfp, &c);
3098 
3099   if (status != eslOK) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Premature EOF in parsing FASTA name/description line");
3100   sq->doff = ascii->boff + ascii->bpos;
3101 
3102   ascii->linenumber++;
3103   return eslOK;
3104 }
3105 
3106 
3107 static int
end_fasta(ESL_SQFILE * sqfp,ESL_SQ * sq)3108 end_fasta(ESL_SQFILE *sqfp, ESL_SQ *sq)
3109 {
3110   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3111 
3112   if (ascii->bpos < ascii->nc) {
3113     if (ascii->buf[ascii->bpos] != '>') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Whoops, FASTA reader is corrupted");
3114     sq->eoff = ascii->boff + ascii->bpos - 1; /* this puts eoff at the last \n */
3115   } /* else, EOF, and we don't have to do anything. */
3116   return eslOK;
3117 }
3118 
3119 
3120 /* Function:  esl_sqascii_WriteFasta()
3121  * Synopsis:  Write a sequence in FASTA foramt
3122  *
3123  * Purpose:   Write sequence <sq> in FASTA format to the open stream <fp>.
3124  *
3125  *            If <save_offsets> is TRUE, then store record, data, and end
3126  *            offsets in <sq>; this ability is used by unit tests.
3127  *
3128  * Returns:   <eslOK> on success.
3129  *
3130  * Throws:    <eslEMEM> on allocation failure.
3131  *            <eslEWRITE> on system write error.
3132  */
3133 int
esl_sqascii_WriteFasta(FILE * fp,ESL_SQ * sq,int save_offsets)3134 esl_sqascii_WriteFasta(FILE *fp, ESL_SQ *sq, int save_offsets)
3135 {
3136   char     buf[61];
3137   int64_t  pos;
3138 
3139   if (save_offsets) sq->roff = ftello(fp);
3140   if (fprintf(fp, ">%s", sq->name)                     < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3141   if (sq->acc[0]  != 0 && fprintf(fp, " %s", sq->acc)  < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3142   if (sq->desc[0] != 0 && fprintf(fp, " %s", sq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3143   if (save_offsets) sq->hoff = ftello(fp);
3144   if (fputc('\n', fp)                                  < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3145 
3146   buf[60] = '\0';
3147   if (save_offsets) sq->doff = ftello(fp);
3148   for (pos = 0; pos < sq->n; pos += 60)
3149   {
3150       if (sq->dsq != NULL) esl_abc_TextizeN(sq->abc, sq->dsq+pos+1, 60, buf);
3151       else                 strncpy(buf, sq->seq+pos, 60);
3152       if (fprintf(fp, "%s\n", buf) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "fasta seq write failed");
3153   }
3154   if (save_offsets) sq->eoff = ftello(fp) - 1;
3155   return eslOK;
3156 }
3157 /*------------------- end of FASTA i/o ---------------------------*/
3158 
3159 /*****************************************************************
3160  *#  10. Internal routines for daemon format
3161  *****************************************************************/
3162 
3163 /* Special case FASTA format where each sequence is terminated with "//".
3164  *
3165  * The use case is where the sequences are being read from a pipe and a
3166  * way is needed to signal the end of the sequence so it can be processed.
3167  * The next sequence might not be in the pipe, so the usual '>' is not
3168  * present to signal the end of the sequence.  Also, an EOF is not
3169  * an option, since the daemon might run continuously.
3170  */
3171 
3172 static void
config_daemon(ESL_SQFILE * sqfp)3173 config_daemon(ESL_SQFILE *sqfp)
3174 {
3175   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3176 
3177   ascii->is_linebased = FALSE;
3178   ascii->eof_is_ok    = FALSE;
3179   ascii->parse_header = &header_fasta;
3180   ascii->skip_header  = &skip_fasta;
3181   ascii->parse_end    = &end_daemon;
3182 }
3183 
3184 static void
inmap_daemon(ESL_SQFILE * sqfp,const ESL_DSQ * abc_inmap)3185 inmap_daemon(ESL_SQFILE *sqfp, const ESL_DSQ *abc_inmap)
3186 {
3187   int x;
3188 
3189   if (abc_inmap != NULL) {
3190     for (x = 0; x < 128; x++) sqfp->inmap[x] = abc_inmap[x];
3191     sqfp->inmap['-']  = eslDSQ_ILLEGAL;                      // don't let the abc_inmap map the gap char; this is an ungapped file format
3192   } else {
3193     for (x =  0;  x < 128;  x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
3194     for (x = 'A'; x <= 'Z'; x++) sqfp->inmap[x] = x;
3195     for (x = 'a'; x <= 'z'; x++) sqfp->inmap[x] = x;
3196   }
3197   sqfp->inmap['*']  = '*';         /* accept * as a nonresidue/stop codon character */
3198   sqfp->inmap[' ']  = eslDSQ_IGNORED;
3199   sqfp->inmap['\t'] = eslDSQ_IGNORED;
3200   sqfp->inmap['\r'] = eslDSQ_IGNORED;/* DOS eol compatibility */
3201   sqfp->inmap['\n'] = eslDSQ_EOL;
3202   sqfp->inmap['/']  = eslDSQ_EOD;
3203   /* \n is special - fasta reader detects it as an eol */
3204 }
3205 
3206 
3207 /* end_daemon()
3208  *
3209  * Special case FASTA format where each sequence is terminated with "//".
3210  *
3211  * The use case is were the sequences are being read from a pipe and a
3212  * way is needed to signal the end of the sequence so it can be processed.
3213  */
3214 static int
end_daemon(ESL_SQFILE * sqfp,ESL_SQ * sq)3215 end_daemon(ESL_SQFILE *sqfp, ESL_SQ *sq)
3216 {
3217   char  c;
3218 
3219   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3220 
3221   if (ascii->nc < 3) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Whoops, daemon input stream is corrupted");
3222 
3223   c =  ascii->buf[ascii->bpos++];
3224   if (c != '/') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
3225 
3226   c =  ascii->buf[ascii->bpos++];
3227   if (c != '/') ESL_FAIL(eslEFORMAT, ascii->errbuf, "Line %" PRId64 ": did not find // terminator at end of seq record", ascii->linenumber);
3228 
3229   /* skip to end of line */
3230   while (c != '\n' && c != '\r' && ascii->bpos < ascii->nc) c =  ascii->buf[ascii->bpos++];
3231 
3232   /* skip past end of line */
3233   while ((c == '\n' || c == '\r') && ascii->bpos < ascii->nc) c =  ascii->buf[ascii->bpos++];
3234 
3235   return eslOK;
3236 }
3237 
3238 
3239 /* esl_sqascii_Parse()
3240  *
3241  * Parse a sequence already read into a buffer.
3242  */
3243 int
esl_sqascii_Parse(char * buf,int size,ESL_SQ * sq,int format)3244 esl_sqascii_Parse(char *buf, int size, ESL_SQ *sq, int format)
3245 {
3246   int               status;
3247   int64_t           epos;
3248   int64_t           n;
3249 
3250   ESL_SQFILE        sqfp;
3251   ESL_SQASCII_DATA *ascii = &sqfp.data.ascii;
3252 
3253   /* fill in a dummy esl_sqfile structure used to parse buf */
3254   ascii->fp           = NULL;
3255   ascii->do_gzip      = FALSE;
3256   ascii->do_stdin     = FALSE;
3257   ascii->do_buffer    = TRUE;
3258 
3259   ascii->mem          = buf;
3260   ascii->allocm       = 0;
3261   ascii->mn           = size;
3262   ascii->mpos         = 0;
3263   ascii->moff         = -1;
3264   ascii->is_recording = FALSE;
3265 
3266   ascii->buf          = NULL;
3267   ascii->boff         = 0;
3268   ascii->balloc       = 0;
3269   ascii->nc           = 0;
3270   ascii->bpos         = 0;
3271   ascii->L            = 0;
3272   ascii->linenumber   = 1;
3273 
3274   ascii->afp          = NULL;
3275   ascii->msa          = NULL;
3276   ascii->idx          = -1;
3277 
3278   ascii->ssifile      = NULL;
3279   ascii->rpl          = -1;/* -1 = not set yet */
3280   ascii->bpl          = -1;/* (ditto) */
3281   ascii->prvrpl       = -1;/* (ditto) */
3282   ascii->prvbpl       = -1;/* (ditto) */
3283   ascii->currpl       = -1;
3284   ascii->curbpl       = -1;
3285   ascii->ssi          = NULL;
3286 
3287   /* Configure the <sqfp>'s parser and inmaps for this format. */
3288   switch (format) {
3289   case eslSQFILE_EMBL:
3290   case eslSQFILE_UNIPROT:
3291     config_embl(&sqfp);
3292     inmap_embl(&sqfp, NULL);
3293     break;
3294   case eslSQFILE_GENBANK:
3295   case eslSQFILE_DDBJ:
3296     config_genbank(&sqfp);
3297     inmap_genbank(&sqfp, NULL);
3298     break;
3299   case eslSQFILE_FASTA:
3300     config_fasta(&sqfp);
3301     inmap_fasta(&sqfp, NULL);
3302     break;
3303   case eslSQFILE_DAEMON:
3304     config_daemon(&sqfp);
3305     inmap_daemon(&sqfp, NULL);
3306     break;
3307   default:
3308     return eslEFORMAT;
3309   }
3310 
3311   /* Main case: read next seq from sqfp's stream */
3312   if ((status = ascii->parse_header(&sqfp, sq)) != eslOK) return status; /* EOF, EFORMAT */
3313 
3314   do {
3315     if ((status = seebuf(&sqfp, -1, &n, &epos)) == eslEFORMAT) return status;
3316     if (esl_sq_GrowTo(sq, sq->n + n) != eslOK) return eslEMEM;
3317     addbuf(&sqfp, sq, n);
3318     ascii->L   += n;
3319     sq->eoff   = ascii->boff + epos - 1;
3320     if (status == eslEOD)     break;
3321   } while ((status = loadbuf(&sqfp)) == eslOK);
3322 
3323   if      (status == eslEOF)
3324     {
3325       if (! ascii->eof_is_ok) ESL_FAIL(eslEFORMAT, ascii->errbuf, "Unexpected EOF; file truncated?");
3326       if ((status = ascii->parse_end(&sqfp, sq)) != eslOK) return status;
3327     }
3328   else if (status == eslEOD)
3329     {
3330       ascii->bpos = epos;
3331       if ((status = ascii->parse_end(&sqfp, sq)) != eslOK) return status;
3332     }
3333   else if (status != eslOK) return status;
3334 
3335   if (sq->dsq != NULL) sq->dsq[sq->n+1] = eslDSQ_SENTINEL;
3336   else                 sq->seq[sq->n] = '\0';
3337   sq->start = 1;
3338   sq->end   = sq->n;
3339   sq->C     = 0;
3340   sq->W     = sq->n;
3341   sq->L     = sq->n;
3342 
3343   if (ascii->balloc > 0) free(ascii->buf);
3344 
3345   return eslOK;
3346 }
3347 /*-------------------- end of daemon ----------------------------*/
3348 
3349 /*****************************************************************
3350  *# 11. Internal routines for HMMPGMD format
3351  *****************************************************************/
3352 
3353 static int
fileheader_hmmpgmd(ESL_SQFILE * sqfp)3354 fileheader_hmmpgmd(ESL_SQFILE *sqfp)
3355 {
3356   ESL_SQASCII_DATA *ascii = &sqfp->data.ascii;
3357   char c;
3358   int  status = eslOK;
3359 
3360   /* We've just loaded first buffer, after an Open. First char should be the # of the hmmpgmd file,
3361    * but let's tolerate leading whitespace anyway
3362    */
3363   c =  ascii->buf[ascii->bpos];
3364   while (status == eslOK && isspace(c)) status = nextchar(sqfp, &c); /* skip space (including \n, \r) */
3365   if (status == eslEOF) return eslEOF;
3366 
3367   if (c != '#') ESL_FAIL(eslEFORMAT, ascii->errbuf, "hmmpgmd file expected to start with #");
3368 
3369   /* skip first line; remainder of file is FASTA format */
3370   while (status == eslOK && (c != '\n' && c != '\r')) status = nextchar(sqfp, &c);
3371   if (status == eslEOF) return eslEOF;
3372 
3373   /* next character read should be the '>' of the first FASTA record. We're properly positioned at "start of file". */
3374   return eslOK;
3375 }
3376 /*-------------------- end of HMMPGMD ---------------------------*/
3377 
3378 
3379