1 /* Unaligned ncbi sequence file i/o.
2  *
3  * Contents:
4  *    1. An <ESL_SQFILE> object, in text mode.
5  *    2. An <ESL_SQFILE> object, in digital mode. [with <alphabet>]
6  *    3. Miscellaneous routines.
7  *    4. Sequence reading (sequential).
8  *    5. Parsing routines
9  */
10 #include "esl_config.h"
11 
12 #include <stdlib.h>
13 #include <stdio.h>
14 #include <string.h>
15 #include <ctype.h>
16 
17 #ifdef HAVE_SYS_TYPES_H
18 #include <sys/types.h>
19 #endif
20 
21 #ifdef HAVE_ENDIAN_H
22 #include <endian.h>
23 #endif
24 
25 #include "easel.h"
26 #include "esl_alphabet.h"	/* alphabet aug adds digital sequences */
27 #include "esl_sq.h"
28 #include "esl_sqio.h"
29 
30 #ifndef htobe32
31 #ifdef  WORDS_BIGENDIAN
32 #define htobe32(x) (x)
33 #else
34 #define htobe32(x) \
35      ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) |		      \
36       (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
37 #endif
38 #endif
39 
40 /* format specific routines */
41 static int   sqncbi_Position       (ESL_SQFILE *sqfp, off_t offset);
42 static void  sqncbi_Close          (ESL_SQFILE *sqfp);
43 static int   sqncbi_SetDigital     (ESL_SQFILE *sqfp, const ESL_ALPHABET *abc);
44 static int   sqncbi_GuessAlphabet  (ESL_SQFILE *sqfp, int *ret_type);
45 static int   sqncbi_Read           (ESL_SQFILE *sqfp, ESL_SQ *sq);
46 static int   sqncbi_ReadInfo       (ESL_SQFILE *sqfp, ESL_SQ *sq);
47 static int   sqncbi_ReadSequence   (ESL_SQFILE *sqfp, ESL_SQ *sq);
48 static int   sqncbi_ReadWindow     (ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq);
49 static int   sqncbi_ReadBlock      (ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target);
50 static int   sqncbi_Echo           (ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp);
51 
52 static int   sqncbi_IsRewindable   (const ESL_SQFILE *sqfp);
53 static const char *sqncbi_GetError (const ESL_SQFILE *sqfp);
54 
55 /* common routines for processing ncbi database */
56 static int  sqncbi_Open         (ESL_SQNCBI_DATA *ncbi, char *filename);
57 
58 static void reset_db            (ESL_SQNCBI_DATA *ncbi);
59 static int  pos_sequence        (ESL_SQNCBI_DATA *ncbi, int inx);
60 static int  volume_open         (ESL_SQNCBI_DATA *ncbi, int volume);
61 static void reset_header_values (ESL_SQNCBI_DATA *ncbi);
62 
63 static int  read_amino          (ESL_SQFILE *sqfp, ESL_SQ *sq);
64 static int  read_dna            (ESL_SQFILE *sqfp, ESL_SQ *sq);
65 static int  read_nres_amino     (ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres);
66 static int  read_nres_dna       (ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres);
67 
68 static int  inmap_ncbi          (ESL_SQFILE *sqfp);
69 static int  inmap_ncbi_amino    (ESL_SQFILE *sqfp);
70 static int  inmap_ncbi_dna      (ESL_SQFILE *sqfp);
71 
72 /* parsing routines */
73 static int  parse_header              (ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq);
74 static int  parse_def_line            (ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq);
75 static int  parse_seq_id              (ESL_SQNCBI_DATA *ncbi);
76 static int  parse_textseq_id          (ESL_SQNCBI_DATA *ncbi);
77 static int  parse_object_id           (ESL_SQNCBI_DATA *ncbi);
78 static int  parse_dbtag               (ESL_SQNCBI_DATA *ncbi);
79 static int  parse_patent_seq_id       (ESL_SQNCBI_DATA *ncbi);
80 static int  parse_giimport_id         (ESL_SQNCBI_DATA *ncbi);
81 static int  parse_id_pat              (ESL_SQNCBI_DATA *ncbi);
82 static int  parse_pdb_seq_id          (ESL_SQNCBI_DATA *ncbi);
83 static int  parse_date_std            (ESL_SQNCBI_DATA *ncbi);
84 static int  parse_string              (ESL_SQNCBI_DATA *ncbi, char **str, int *len);
85 static int  parse_integer             (ESL_SQNCBI_DATA *ncbi, int *value);
86 static int  ignore_sequence_of_integer(ESL_SQNCBI_DATA *ncbi);
87 
88 #define INDEX_TABLE_SIZE      1024
89 #define INIT_HDR_BUFFER_SIZE  2048
90 
91 #define NCBI_VERSION_4             4
92 #define NCBI_DNA_DB                0
93 #define NCBI_AMINO_DB              1
94 
95 /*****************************************************************
96  *# 1. An <ESL_SQFILE> object, in text mode.
97  *****************************************************************/
98 
99 /* Function:  esl_sqncbi_Open()
100  * Synopsis:  Open a sequence file for reading.
101  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
102  *
103  * Purpose:   Open a sequence file <filename> for reading.
104  *            The opened <ESL_SQFILE> is returned through <ret_sqfp>.
105  *
106  *            The .pin, .phr and .psq files are required for the
107  *            open function to succeed.  Only ncbi version 4
108  *            databases are currently supported.
109  *
110  * Returns:   <eslOK> on success, and <*ret_sqfp> points to a new
111  *            open <ESL_SQFILE>. Caller deallocates this object with
112  *            <esl_sqfile_Close()>.
113  *
114  *            Returns <eslENOTFOUND> if <filename> can't be found or
115  *            opened.  Returns <eslEFORMAT> if the file is empty, or
116  *            if autodetection is attempted and the format can't be
117  *            determined.  On any error condition, <*ret_sqfp> is
118  *            returned NULL.
119  *
120  * Throws:    <eslEMEM> on allocation failure.
121  */
122 int
esl_sqncbi_Open(char * filename,int format,ESL_SQFILE * sqfp)123 esl_sqncbi_Open(char *filename, int format, ESL_SQFILE *sqfp)
124 {
125   int  i;
126   int  status = eslOK;	/* return status from an ESL call */
127 
128   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
129 
130   /* before we go any further, make sure we can handle the format */
131   if (format != eslSQFILE_NCBI && format != eslSQFILE_UNKNOWN) return eslENOTFOUND;
132 
133   ncbi->fppin        = NULL;
134   ncbi->fpphr        = NULL;
135   ncbi->fppsq        = NULL;
136 
137   ncbi->title        = NULL;
138   ncbi->timestamp    = NULL;
139 
140   ncbi->index        = 0;
141 
142   ncbi->hdr_off      = -1;
143   ncbi->seq_off      = -1;
144   ncbi->amb_off      = -1;
145 
146   ncbi->index_start  = -1;
147   ncbi->index_end    = -1;
148   ncbi->hdr_indexes  = NULL;
149   ncbi->seq_indexes  = NULL;
150   ncbi->amb_indexes  = NULL;
151 
152   ncbi->hdr_buf      = NULL;
153   reset_header_values(ncbi);
154 
155   ncbi->amb_off      = 0;
156 
157   ncbi->alphatype    = eslUNKNOWN;
158   ncbi->alphasym     = NULL;
159 
160   ncbi->vol_index    = -1;
161   ncbi->volumes      = 0;
162 
163   for (i = 0; i < MAX_DB_VOLUMES; ++i) {
164     ncbi->vols[i].name      = NULL;
165     ncbi->vols[i].start_seq = -1;
166     ncbi->vols[i].end_seq   = -1;
167   }
168 
169   if ((status = sqncbi_Open(ncbi, filename)) != eslOK) goto ERROR;
170 
171   sqfp->format = eslSQFILE_NCBI;
172   if ((status = inmap_ncbi(sqfp)) != eslOK) goto ERROR;
173 
174   /* initialize the function pointers for the ncbi routines */
175   sqfp->position          = &sqncbi_Position;
176   sqfp->close             = &sqncbi_Close;
177 
178   sqfp->set_digital       = &sqncbi_SetDigital;
179   sqfp->guess_alphabet    = &sqncbi_GuessAlphabet;
180 
181   sqfp->is_rewindable     = &sqncbi_IsRewindable;
182 
183   sqfp->read              = &sqncbi_Read;
184   sqfp->read_info         = &sqncbi_ReadInfo;
185   sqfp->read_seq          = &sqncbi_ReadSequence;
186   sqfp->read_window       = &sqncbi_ReadWindow;
187   sqfp->echo              = &sqncbi_Echo;
188 
189   sqfp->read_block        = &sqncbi_ReadBlock;
190 
191   sqfp->get_error         = &sqncbi_GetError;
192 
193   return eslOK;
194 
195  ERROR:
196   sqncbi_Close(sqfp);
197   return status;
198 }
199 
200 /* sqncbi_ParseIndexFile()
201  *
202  * Parse an open index file verifying database type and version
203  * if handled.  Read in the entries, i.e. name, number of sequences,
204  * etc.  Keep track of the offsets where the header and sequence
205  * tables begin.
206  */
207 static int
sqncbi_ParseIndexFile(ESL_SQNCBI_DATA * ncbi,int dbtype)208 sqncbi_ParseIndexFile(ESL_SQNCBI_DATA *ncbi, int dbtype)
209 {
210   int         len;
211   uint32_t    info[4];
212   int         status = eslOK;	/* return status from an ESL call */
213 
214   if (fread(&info[0], sizeof(uint32_t), 3, ncbi->fppin) != 3) status = eslFAIL;
215   if (htobe32(info[0]) != NCBI_VERSION_4)                     status = eslEFORMAT;
216   if (htobe32(info[1]) != dbtype)                             status = eslEUNIMPLEMENTED;
217 
218   if (status != eslOK) goto ERROR;
219   ncbi->version = htobe32(info[0]);
220   ncbi->alphatype = (dbtype == NCBI_DNA_DB) ? eslDNA : eslAMINO;
221   ncbi->index = 0;
222 
223   /* read the database title */
224   len = htobe32(info[2]);
225   ESL_ALLOC(ncbi->title, sizeof(char) * (len + 1));
226   if (fread(ncbi->title, sizeof(char), len, ncbi->fppin) != len) { status = eslFAIL; goto ERROR; }
227   ncbi->title[len] = 0;
228 
229   /* read the database time stamp */
230   if (fread(&info[0], sizeof(uint32_t), 1, ncbi->fppin) != 1) { status = eslFAIL; goto ERROR; }
231   len = htobe32(info[0]);
232   ESL_ALLOC(ncbi->timestamp, sizeof(char) * (len + 1));
233   if (fread(ncbi->timestamp, sizeof(char), len, ncbi->fppin) != len) { status = eslFAIL; goto ERROR; }
234   ncbi->timestamp[len] = 0;
235 
236   /* read in database stats */
237   if (fread(&info[0], sizeof(uint32_t), 4, ncbi->fppin) != 4) { status = eslFAIL; goto ERROR; }
238   ncbi->num_seq   = htobe32(info[0]);
239   memcpy(&ncbi->total_res, info+1, sizeof(uint64_t));
240   ncbi->max_seq   = htobe32(info[3]);
241 
242   /* save the offsets to the index tables */
243   ncbi->hdr_off = ftell(ncbi->fppin);
244   ncbi->seq_off = ncbi->hdr_off + sizeof(uint32_t) * (ncbi->num_seq + 1);
245 
246   return eslOK;
247 
248  ERROR:
249   return status;
250 }
251 
252 /* sqncbi_DbOpen()
253  *
254  * Try to open the database files.  On successful opening of
255  * the files, index, header and sequence, the index file is
256  * parsed for validity.
257  */
258 static int
sqncbi_DbOpen(ESL_SQNCBI_DATA * ncbi,char * filename,int dbtype)259 sqncbi_DbOpen(ESL_SQNCBI_DATA *ncbi, char *filename, int dbtype)
260 {
261   int         status = eslOK;	/* return status from an ESL call */
262   int         len;
263 
264   char       *name = NULL;
265 
266   len = strlen(filename);
267   ESL_ALLOC(name, sizeof(char) * (len+5));
268   strcpy(name, filename);
269 
270   /* Check for basic database first */
271   strcpy(name+len, ".Xin");
272   name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
273   if ((ncbi->fppin = fopen(name, "rb")) == NULL) {
274     status = eslENOTFOUND;
275     goto ERROR;
276   }
277   strcpy(name+len, ".Xhr");
278   name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
279   if ((ncbi->fpphr = fopen(name, "rb")) == NULL) {
280     status = eslENOTFOUND;
281     goto ERROR;
282   }
283   strcpy(name+len, ".Xsq");
284   name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
285   if ((ncbi->fppsq = fopen(name, "rb")) == NULL) {
286     status = eslENOTFOUND;
287     goto ERROR;
288   }
289 
290   /* parse the header make sure we are looking at a version 4 db. */
291   if ((status = sqncbi_ParseIndexFile(ncbi, dbtype)) != eslOK) goto ERROR;
292 
293   if (name != NULL) free(name);
294 
295   return eslOK;
296 
297  ERROR:
298 
299   reset_db(ncbi);
300 
301   if (name != NULL) free(name);
302   return status;
303 }
304 
305 
306 /* sqncbi_AliasOpen()
307  *
308  * Opens an alias file parsing the DBLIST directive building
309  * a list of all the volumes makeing up this database.  As each
310  * volume is successfully parsed, the name of the volume, number
311  * of sequences and offsets are kept for quick reference.
312  */
313 static int
sqncbi_AliasOpen(ESL_SQNCBI_DATA * ncbi,char * filename,int dbtype)314 sqncbi_AliasOpen(ESL_SQNCBI_DATA *ncbi, char *filename, int dbtype)
315 {
316   int         status    = eslOK;	/* return status from an ESL call */
317   int         newline   = 1;
318   int         seqcnt    = 0;
319   int         rescnt    = 0;
320   int         done      = 0;
321   int         vol;
322   int         len;
323 
324   char       *ptr       = NULL;
325   char       *name      = NULL;
326   char        buffer[80];
327 
328   char       *dbptr     = NULL;
329   char       *dbname    = NULL;
330   int         dbsize    = 512;
331   int         dblen     = 0;
332 
333   FILE       *fp        = NULL;
334 
335   len = strlen(filename);
336   ESL_ALLOC(name, sizeof(char) * (len+5));
337   strcpy(name, filename);
338 
339   /* Check for alias file */
340   strcpy(name+len, ".Xal");
341   name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
342   if ((fp = fopen(name, "r")) == NULL) {
343     status = eslENOTFOUND;
344     goto ERROR;
345   }
346 
347   /* copy the filename */
348   dbsize = (dbsize > len) ? dbsize : len * 2;
349   ESL_ALLOC(dbname, sizeof(char) * dbsize);
350   strcpy(dbname, filename);
351 
352   /* remove the filename keeping the path */
353   while (len > 0 && dbname[len-1] != eslDIRSLASH) {
354     dbname[len-1] = '\0';
355     --len;
356   }
357 
358   /* find the DBLIST directive */
359   while (!done) {
360     ptr = buffer;
361     if (fgets(buffer, sizeof(buffer), fp) == NULL) {
362       status = eslEFORMAT;
363       goto ERROR;
364     }
365 
366     if (newline) {
367       if (strncmp(buffer, "DBLIST", 6) == 0 && isspace(buffer[6])) {
368 	done = 1;
369 	ptr = buffer + 6;
370       }
371     }
372 
373     /* skip to the end of the line */
374     if (!done) {
375       newline = 0;
376       while (*ptr != '\0') {
377 	if (*ptr == '\n') newline = 1;
378 	++ptr;
379       }
380     }
381   }
382 
383   /* parse the DBLIST line */
384   done = 0;
385   dblen = len;
386   while (!done) {
387 
388     /* check if we hit the end of the buffer */
389     if (*ptr == '\0') {
390       ptr = buffer;
391       if (fgets(buffer, sizeof(buffer), fp) == NULL) {
392 	status = eslEFORMAT;
393 	goto ERROR;
394       }
395     }
396 
397     /* skip spaces */
398     if (isspace(*ptr)) {
399       if (dblen > len) {
400 	dbname[dblen++] = '\0';
401 
402 	/* if the name in the DBLIST directive was ablsolute, do not
403 	 * use the working directory as a prefix.
404 	 */
405 	if (dbname[len] == eslDIRSLASH) {
406 	  dbptr = dbname + len;
407 	} else {
408 	  dbptr = dbname;
409 	}
410 
411 	status = sqncbi_DbOpen(ncbi, dbptr, dbtype);
412 	if (status != eslOK) goto ERROR;
413 
414 	/* close any open files and free up allocations */
415 	reset_db(ncbi);
416 
417 	/* if successful, copy the db information */
418 	vol = ncbi->volumes++;
419 
420 	/* allocate the name of the string big enought so the buffer can
421 	 * handle an extension, i.e. ".pin" or ".nsq" tacked on to the end
422 	 * of it without reallocating.
423 	 */
424 	ncbi->vols[vol].name = NULL;
425 	ESL_ALLOC(ncbi->vols[vol].name, sizeof(char) * strlen(dbptr) + 5);
426 	strcpy(ncbi->vols[vol].name, dbptr);
427 
428 	ncbi->vols[vol].start_seq = seqcnt;
429 	ncbi->vols[vol].end_seq   = seqcnt + ncbi->num_seq - 1;
430 
431 	ncbi->vols[vol].hdr_off = ncbi->hdr_off;
432 	ncbi->vols[vol].seq_off = ncbi->seq_off;
433 	ncbi->vols[vol].amb_off = ncbi->amb_off;
434 
435 	seqcnt += ncbi->num_seq;
436 	rescnt += ncbi->total_res;
437 
438 	dblen = len;
439       }
440 
441       done = *ptr == '\n' || *ptr == '\r';
442 
443       ptr++;
444     } else {
445       dbname[dblen++] = *ptr++;
446       if (dblen >= dbsize - 1) {
447 	char *t;
448 	dbsize += dbsize;
449 	ESL_RALLOC(dbname, t, dbsize);
450       }
451     }
452   }
453 
454   /* reopen the first volume for processing */
455   if ((status = volume_open(ncbi, 0)) != eslOK) goto ERROR;
456 
457   ncbi->num_seq = seqcnt;
458   ncbi->total_res = rescnt;
459 
460   if (name   != NULL) free(name);
461   if (dbname != NULL) free(dbname);
462 
463   if (fp != NULL) fclose(fp);
464 
465   return eslOK;
466 
467  ERROR:
468 
469   reset_db(ncbi);
470 
471   if (fp != NULL) fclose(fp);
472 
473   if (dbname != NULL) free(dbname);
474   if (name   != NULL) free(name);
475 
476   return status;
477 }
478 
479 
480 /* Function:  sqncbi_Open()
481  * Synopsis:  Open an ncbi database.
482  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
483  *
484  * Purpose:   Open an ncbi database making sure all the necessry
485  *            files are present.  Parse the index file for database
486  *            information filling in the ncbi data structure.
487  *
488  * Returns:   <eslOK> on success, and the ncbi data structre is filled
489  *            in with the database information.
490  *
491  *            Returns <eslENOTFOUND> if <filename> can't be found or
492  *            opened.  Returns <eslEFORMAT> if the index file is an
493  *            upsupported version.
494  *
495  * Throws:    <eslEMEM> on allocation failure.
496  */
497 static int
sqncbi_Open(ESL_SQNCBI_DATA * ncbi,char * filename)498 sqncbi_Open(ESL_SQNCBI_DATA *ncbi, char *filename)
499 {
500   int    status  = eslOK;	/* return status from an ESL call */
501   char  *name    = NULL;
502 
503   /* first try to open a single protein database */
504   status = sqncbi_DbOpen(ncbi, filename, NCBI_AMINO_DB);
505 
506   /* if the database was not found, look for protein volume */
507   if (status == eslENOTFOUND) {
508     status = sqncbi_AliasOpen(ncbi, filename, NCBI_AMINO_DB);
509   }
510 
511   /* if nothing so far, try a dna database */
512   if (status == eslENOTFOUND) {
513     status = sqncbi_DbOpen(ncbi, filename, NCBI_DNA_DB);
514   }
515 
516   /* still nothing, look for dna volume */
517   if (status == eslENOTFOUND) {
518     status = sqncbi_AliasOpen(ncbi, filename, NCBI_DNA_DB);
519   }
520 
521   if (status != eslOK) goto ERROR;
522 
523   /* allocate buffers used in parsing the database files */
524   ESL_ALLOC(ncbi->hdr_indexes, sizeof(uint32_t) * INDEX_TABLE_SIZE);
525   ESL_ALLOC(ncbi->seq_indexes, sizeof(uint32_t) * INDEX_TABLE_SIZE);
526 
527   /* if this is a dna database we need to allocate space for the
528    * ambiguity offsets.
529    */
530   if (ncbi->alphatype == eslDNA) {
531     ncbi->amb_off = ncbi->seq_off + sizeof(uint32_t) * (ncbi->num_seq + 1);
532     ESL_ALLOC(ncbi->amb_indexes, sizeof(uint32_t) * INDEX_TABLE_SIZE);
533   }
534 
535   ncbi->hdr_alloced = INIT_HDR_BUFFER_SIZE;
536   ESL_ALLOC(ncbi->hdr_buf, sizeof(unsigned char) * INIT_HDR_BUFFER_SIZE);
537 
538   /* skip the first sentinal byte in the .psq file */
539   fgetc(ncbi->fppsq);
540 
541   if (name != NULL) free(name);
542   return eslOK;
543 
544  ERROR:
545   if (name != NULL) free(name);
546   return status;
547 }
548 
549 
550 /* Function:  sqncbi_Position()
551  * Synopsis:  Reposition an open sequence file to an offset.
552  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
553  *
554  * Purpose:   Reposition an open <sqfp> to offset <offset>.
555  *            <offset> for the ncbi db format specified the sequence
556  *            index, not file offset.  Both the sequence and header
557  *            files are repositioned.
558  *
559  * Returns:   <eslOK>     on success;
560  *
561  * Throws:    <eslESYS> if the fseeko() or fread() call fails.
562  *            On errors, the state of <sqfp> is indeterminate, and
563  *            it should not be used again.
564  */
565 static int
sqncbi_Position(ESL_SQFILE * sqfp,off_t offset)566 sqncbi_Position(ESL_SQFILE *sqfp, off_t offset)
567 {
568   int      status;
569 
570   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
571 
572   status = pos_sequence(ncbi, offset);
573   return status;
574 }
575 
576 /* Function:  sqncbi_Close()
577  * Synopsis:  Close a sequence file.
578  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
579  *
580  * Purpose:   Closes an open <sqfp>.
581  *
582  * Returns:   (void).
583  */
584 static void
sqncbi_Close(ESL_SQFILE * sqfp)585 sqncbi_Close(ESL_SQFILE *sqfp)
586 {
587   int i;
588 
589   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
590 
591   if (ncbi->title != NULL)       free(ncbi->title);
592   if (ncbi->timestamp != NULL)   free(ncbi->timestamp);
593 
594   if (ncbi->hdr_buf != NULL)     free(ncbi->hdr_buf);
595 
596   if (ncbi->hdr_indexes != NULL) free(ncbi->hdr_indexes);
597   if (ncbi->seq_indexes != NULL) free(ncbi->seq_indexes);
598   if (ncbi->amb_indexes != NULL) free(ncbi->amb_indexes);
599 
600   if (ncbi->alphasym != NULL)    free(ncbi->alphasym);
601 
602   if (ncbi->fppin != NULL) fclose(ncbi->fppin);
603   if (ncbi->fpphr != NULL) fclose(ncbi->fpphr);
604   if (ncbi->fppsq != NULL) fclose(ncbi->fppsq);
605 
606   ncbi->vol_index    = -1;
607   ncbi->volumes      = 0;
608 
609   for (i = 0; i < MAX_DB_VOLUMES; ++i) {
610     if (ncbi->vols[i].name != NULL) free(ncbi->vols[i].name);
611 
612     ncbi->vols[i].name      = NULL;
613     ncbi->vols[i].start_seq = -1;
614     ncbi->vols[i].end_seq   = -1;
615   }
616 
617   ncbi->fppin        = NULL;
618   ncbi->fpphr        = NULL;
619   ncbi->fppsq        = NULL;
620 
621   ncbi->title        = NULL;
622   ncbi->timestamp    = NULL;
623 
624   ncbi->index        = 0;
625 
626   ncbi->hdr_off      = -1;
627   ncbi->seq_off      = -1;
628   ncbi->amb_off      = -1;
629 
630   ncbi->index_start  = -1;
631   ncbi->index_end    = -1;
632   ncbi->hdr_indexes  = NULL;
633   ncbi->seq_indexes  = NULL;
634   ncbi->amb_indexes  = NULL;
635 
636   ncbi->hdr_buf      = NULL;
637 
638   ncbi->alphatype    = eslUNKNOWN;
639   ncbi->alphasym     = NULL;
640 
641   return;
642 }
643 /*------------------- SQNCBI open/close -----------------------*/
644 
645 
646 /*****************************************************************
647  *# 2. An <ESL_SQFILE> object, in digital mode [with <alphabet>]
648  *****************************************************************/
649 
650 /* Function:  sqncbi_SetDigital()
651  * Synopsis:  Set an open <ESL_SQFILE> to read in digital mode.
652  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
653  *
654  * Purpose:   Given an <ESL_SQFILE> that's already been opened,
655  *            configure it to expect subsequent input to conform
656  *            to the digital alphabet <abc>.
657  *
658  *            Calling <esl_sqfile_Open(); esl_sqfile_SetDigital()> is
659  *            equivalent to <esl_sqfile_OpenDigital()>. The two-step
660  *            version is useful when you need a
661  *            <esl_sqfile_GuessAlphabet()> call in between, guessing
662  *            the file's alphabet in text mode before you set it to
663  *            digital mode.
664  *
665  * Returns:   <eslOK> on success.
666  */
667 static int
sqncbi_SetDigital(ESL_SQFILE * sqfp,const ESL_ALPHABET * abc)668 sqncbi_SetDigital(ESL_SQFILE *sqfp, const ESL_ALPHABET *abc)
669 {
670   return eslOK;
671 }
672 
673 /* Function:  sqncbi_GuessAlphabet()
674  * Synopsis:  Guess the alphabet of an open <ESL_SQFILE>.
675  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
676  *
677  * Purpose:   The only ncbi db format supported is protein.
678  *
679  * Returns:   <eslOK> on success, and <*ret_type> is set to <eslAMINO>.
680  */
681 static int
sqncbi_GuessAlphabet(ESL_SQFILE * sqfp,int * ret_type)682 sqncbi_GuessAlphabet(ESL_SQFILE *sqfp, int *ret_type)
683 {
684   *ret_type = sqfp->data.ncbi.alphatype;
685   return eslOK;
686 }
687 
688 /*-------------- end, digital mode SQNCBI -------------------*/
689 
690 
691 
692 
693 /*****************************************************************
694  *# 3. Miscellaneous routines
695  *****************************************************************/
696 
697 /* Function:  sqncbi_IsRewindable()
698  * Synopsis:  Return <TRUE> if <sqfp> can be rewound.
699  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
700  *
701  * Purpose:   Returns <TRUE> if <sqfp> can be rewound (positioned
702  *            to an offset of zero), in order to read it a second
703  *            time.
704  */
705 static int
sqncbi_IsRewindable(const ESL_SQFILE * sqfp)706 sqncbi_IsRewindable(const ESL_SQFILE *sqfp)
707 {
708   return TRUE;
709 }
710 
711 /* Function:  sqncbi_GetError()
712  * Synopsis:  Returns error buffer
713  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
714  *
715  * Purpose:   Return a pointer to the error buffer.
716  */
717 static const char *
sqncbi_GetError(const ESL_SQFILE * sqfp)718 sqncbi_GetError(const ESL_SQFILE *sqfp)
719 {
720   return sqfp->data.ncbi.errbuf;
721 }
722 
723 
724 /*****************************************************************
725  *# 4. Sequence reading (sequential)
726  *****************************************************************/
727 
728 /* Function:  sqncbi_Read()
729  * Synopsis:  Read the next sequence from a file.
730  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
731  *
732  * Purpose:   Reads the next sequence from open sequence file <sqfp> into
733  *            <sq>. Caller provides an allocated and initialized <s>, which
734  *            will be internally reallocated if its space is insufficient.
735  *
736  * Returns:   <eslOK> on success; the new sequence is stored in <s>.
737  *
738  *            Returns <eslEOF> when there is no sequence left in the
739  *            file (including first attempt to read an empty file).
740  *
741  *            Returns <eslEFORMAT> if there's a problem with the format,
742  *            such as an illegal character; the line number that the parse
743  *            error occurs on is in <sqfp->linenumber>, and an informative
744  *            error message is placed in <sqfp->errbuf>.
745  *
746  * Throws:    <eslEMEM> on allocation failure;
747  *            <eslEINCONCEIVABLE> on internal error.
748  */
749 static int
sqncbi_Read(ESL_SQFILE * sqfp,ESL_SQ * sq)750 sqncbi_Read(ESL_SQFILE *sqfp, ESL_SQ *sq)
751 {
752   //int  index;
753   int  status;
754 
755   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
756 
757   //index = ncbi->index + 1;
758   if (ncbi->index >= ncbi->num_seq) return eslEOF;
759 
760   if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
761 
762   /* Disk offset bookkeeping */
763   sq->idx  = ncbi->index;
764   sq->roff = ncbi->roff;
765   sq->doff = ncbi->doff;
766   sq->hoff = ncbi->hoff;
767   sq->eoff = ncbi->eoff;
768 
769   if (ncbi->alphatype == eslAMINO)
770     status = read_amino(sqfp, sq);
771   else
772     status = read_dna(sqfp, sq);
773   if (status != eslOK) return status;
774 
775   /* read and parse the ncbi header */
776   if ((status = parse_header(ncbi, sq)) != eslOK) return status;
777 
778   (ncbi->index)++;
779 
780   return eslOK;
781 }
782 
783 
784 /* Function:  sqncbi_ReadInfo()
785  * Synopsis:  Read sequence info, but not the sequence itself.
786  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
787  *
788  * Purpose:   Read the next sequence from open sequence file <sqfp>,
789  *            but don't store the sequence (or secondary structure).
790  *            Upon successful return, <s> holds all the available
791  *            information about the sequence -- its name, accession,
792  *            description, and overall length <sq->L>.
793  *
794  *            This is useful for indexing sequence files, where
795  *            individual sequences might be ginormous, and we'd rather
796  *            avoid reading complete seqs into memory.
797  *
798  * Returns:   <eslOK> on success.
799  */
800 static int
sqncbi_ReadInfo(ESL_SQFILE * sqfp,ESL_SQ * sq)801 sqncbi_ReadInfo(ESL_SQFILE *sqfp, ESL_SQ *sq)
802 {
803   //int   index;
804   int   status;
805 
806   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
807 
808   //index = ncbi->index + 1;
809   if (ncbi->index >= ncbi->num_seq) return eslEOF;
810 
811   if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
812 
813   /* Disk offset bookkeeping */
814   sq->idx  = ncbi->index;
815   sq->roff = ncbi->roff;
816   sq->doff = ncbi->doff;
817   sq->hoff = ncbi->hoff;
818   sq->eoff = ncbi->eoff;
819 
820   /* figure out the sequence length */
821   sq->L = -1;
822 
823   /* read and parse the ncbi header */
824   if ((status = parse_header(ncbi, sq)) != eslOK) return status;
825 
826   (ncbi->index)++;
827 
828   return eslOK;
829 }
830 
831 
832 /* Function:  sqncbi_ReadSequence()
833  * Synopsis:  Read the sequence, not the sequence header.
834  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
835  *
836  * Purpose:   Read the next sequence from open sequence file <sqfp>,
837  *            but not the header information.  Upon successful return,
838  *            <s> holds all the sequence.
839  *
840  *            This is useful reading binary formats and delaying the
841  *            over heads of reading the sequence name until needed by
842  *            the report generator.
843  *
844  * Returns:   <eslOK> on success; the new sequence is stored in <s>.
845  *
846  *            Returns <eslEOF> when there is no sequence left in the
847  *            file (including first attempt to read an empty file).
848  *
849  * Throws:    <eslEMEM> on allocation failure;
850  */
851 static int
sqncbi_ReadSequence(ESL_SQFILE * sqfp,ESL_SQ * sq)852 sqncbi_ReadSequence(ESL_SQFILE *sqfp, ESL_SQ *sq)
853 {
854   //int    index;
855   int    status;
856 
857   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
858 
859   //index = ncbi->index + 1;
860   if (ncbi->index >= ncbi->num_seq) return eslEOF;
861 
862   if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
863 
864   /* Disk offset bookkeeping */
865   sq->idx  = ncbi->index;
866   sq->roff = ncbi->roff;
867   sq->doff = ncbi->doff;
868   sq->hoff = ncbi->hoff;
869   sq->eoff = ncbi->eoff;
870 
871   reset_header_values(ncbi);
872 
873   if (ncbi->alphatype == eslAMINO)
874     status = read_amino(sqfp, sq);
875   else
876     status = read_dna(sqfp, sq);
877   if (status != eslOK) return status;
878 
879   (ncbi->index)++;
880   return eslOK;
881 }
882 
883 
884 /* Function:  sqncbi_ReadWindow()
885  * Synopsis:  Read next window of sequence.
886  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
887  *
888  * Purpose:   Read a next window of <W> residues from open file <sqfp>,
889  *            keeping <C> residues from the previous window as
890  *            context, and keeping previous annotation in the <sq>
891  *            as before.
892  *
893  *            If this is the first window of a new sequence record,
894  *            <C> is ignored (there's no previous context yet), and
895  *            the annotation fields of the <sq> (name, accession, and
896  *            description) are initialized by reading the sequence
897  *            record's header. This is the only time the annotation
898  *            fields are initialized.
899  *
900  *            On return, <sq->dsq[]> contains the window and its
901  *            context; residues <1..sq->C> are the previous context,
902  *            and residues <sq->C+1..sq->n> are the new window.  The
903  *            start and end coordinates of the whole <dsq[1..n]>
904  *            (including context) in the original source sequence are
905  *            <sq->start..sq->end>. (Or, for text mode sequences,
906  *            <sq->seq[0..sq->C-1,sq->C..sq->n-1]>, while <start> and
907  *            <end> coords are still <1..L>.)
908  *
909  *            When a sequence record is completed and no more data
910  *            remain, <eslEOD> is returned, with an ``info'' <sq>
911  *            structure (containing the annotation and the total
912  *            sequence length <L>, but no sequence). (The total
913  *            sequence length <L> is unknown in <sq> until this
914  *            <eslEOD> return.)
915  *
916  *            The caller may then do one of two things before calling
917  *            <esl_sq_ReadWindow()> again; it can reset the sequence
918  *            with <esl_sq_Reuse()> to continue reading the next
919  *            sequence in the file, or it can set a negative <W> as a
920  *            signal to read windows from the reverse complement
921  *            (Crick) strand. Reverse complement reading only works
922  *            for nucleic acid sequence.
923  *
924  *            If you read the reverse complement strand, you must read
925  *            the whole thing, calling <esl_sqio_ReadWindow()> with
926  *            negative <W> windows until <eslEOD> is returned again
927  *            with an empty (info-only) <sq> structure. When that
928  *            <EOD> is reached, the <sqfp> is repositioned at the
929  *            start of the next sequence record; the caller should now
930  *            <Reuse()> the <sq>, and the next <esl_sqio_ReadWindow()>
931  *            call must have a positive <W>, corresponding to starting
932  *            to read the Watson strand of the next sequence.
933  *
934  *            Note that the <ReadWindow()> interface is designed for
935  *            an idiom of sequential reading of complete sequences in
936  *            overlapping windows, possibly on both strands; if you
937  *            want more freedom to move around in the sequence
938  *            grabbing windows in another order, you can use the
939  *            <FetchSubseq()> interface.
940  *
941  *            Reading the reverse complement strand requires file
942  *            repositioning, so it will not work on non-repositionable
943  *            streams like gzipped files or a stdin pipe. Moreover,
944  *            for reverse complement input to be efficient, the
945  *            sequence file should have consistent line lengths,
946  *            suitable for SSI's fast subsequence indexing.
947  *
948  * Returns:   <eslOK> on success; <sq> now contains next window of
949  *            sequence, with at least 1 new residue. The number
950  *            of new residues is <sq->W>; <sq->C> residues are
951  *            saved from the previous window. Caller may now
952  *            process residues <sq->dsq[sq->C+1]..sq->dsq[sq->n]>.
953  *
954  *            <eslEOD> if no new residues were read for this sequence
955  *            and strand, and <sq> now contains an empty info-only
956  *            structure (annotation and <L> are valid). Before calling
957  *            <esl_sqio_ReadWindow()> again, caller will either want
958  *            to make <W> negative (to start reading the Crick strand
959  *            of the current sequence), or it will want to reset the
960  *            <sq> (with <esl_sq_Reuse()>) to go on the next sequence.
961  *
962  *            <eslEOF> if we've already returned <eslEOD> before to
963  *            signal the end of the previous seq record, and moreover,
964  *            there's no more sequence records in the file.
965  *
966  *            <eslEINVAL> if an invalid residue is found in the
967  *            sequence, or if you attempt to take the reverse
968  *            complement of a sequence that can't be reverse
969  *            complemented.
970  *
971  * Throws:    <eslESYNTAX> if you try to read a reverse window before
972  *            you've read forward strand.
973  *
974  *            <eslECORRUPT> if something goes awry internally in the
975  *            coordinate system.
976  *
977  *            <eslEMEM> on allocation error.
978  */
979 static int
sqncbi_ReadWindow(ESL_SQFILE * sqfp,int C,int W,ESL_SQ * sq)980 sqncbi_ReadWindow(ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq)
981 {
982   uint64_t  nres;
983   int       status;
984 
985   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
986 
987   /* Negative W indicates reverse complement direction */
988   if (W < 0)
989   {
990     if (sq->L == -1) ESL_EXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand");
991 
992     /* update the sequence index */
993     if ((status = sqncbi_Position(sqfp, sq->idx)) != eslOK)
994       ESL_FAIL(eslEINVAL, ncbi->errbuf, "Unexpected error positioning database to sequence %" PRId64, sq->idx);
995 
996     if (sq->end == 1)
997     { /* last end == 1 means last window was the final one on reverse strand,
998        * so we're EOD; jump back to last forward position.
999        */
1000         sq->start      = 0;
1001         sq->end        = 0;
1002         sq->C          = 0;
1003         sq->W          = 0;
1004         sq->n          = 0;
1005         /* sq->L stays as it is */
1006         if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
1007         else                 sq->seq[0] = '\0';
1008 
1009         return eslEOD;
1010     }
1011 
1012     /* If s == 0, we haven't read any reverse windows yet;
1013      * init reading from sq->L
1014      */
1015     W = -W;
1016     if (sq->start == 0)
1017     {
1018       sq->start        = ESL_MAX(1, (sq->L - W + 1));
1019       sq->end          = sq->start;
1020       sq->C            = 0;
1021     }
1022     else
1023     { /* Else, we're continuing to next window; prv was <end>..<start> */
1024       sq->C     = ESL_MIN(C, sq->L - sq->end + 1);  /* based on prev window's end */
1025       sq->start = ESL_MAX(1, (sq->end - W));
1026       W         = sq->end - sq->start + sq->C;
1027       sq->end   = sq->start;
1028     }
1029 
1030     /* grab the subseq and rev comp it */
1031     if ((status = esl_sq_GrowTo(sq, W)) != eslOK) return status;
1032     sq->n = 0;
1033     if (ncbi->alphatype == eslAMINO) status = read_nres_amino(sqfp, sq, W, &nres);
1034     else                             status = read_nres_dna(sqfp, sq, W, &nres);
1035 
1036     if (status != eslOK || nres != W) {
1037       ESL_EXCEPTION(eslECORRUPT, "Failed to extract %d..%d", sq->start, sq->end);
1038     } else {
1039       sq->end        = sq->start + nres - 1;
1040       sq->W          = nres - sq->C;
1041     }
1042 
1043     status = esl_sq_ReverseComplement(sq);
1044     if      (status    == eslEINVAL) ESL_FAIL(eslEINVAL, ncbi->errbuf, "can't reverse complement that seq - it's not DNA/RNA");
1045     else if (status    != eslOK)     return status;
1046 
1047     return eslOK;
1048   }
1049 
1050   /* Else, we're reading the forward strand */
1051   else
1052   { /* sq->start == 0 means we haven't read any windows on this sequence yet...
1053        * it's a new record, and we need to initialize with the header and
1054        * the first window. This is the only case that we're allowed to return
1055        * EOF from.
1056        */
1057     if (sq->start == 0)
1058     {
1059       if (ncbi->index >= ncbi->num_seq) return eslEOF;
1060 
1061       /* get the sequence and header offsets */
1062       if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
1063 
1064       /* Disk offset bookkeeping */
1065       sq->idx  = ncbi->index;
1066       sq->roff = ncbi->roff;
1067       sq->doff = ncbi->doff;
1068       sq->hoff = ncbi->hoff;
1069       sq->eoff = ncbi->eoff;
1070 
1071       ncbi->seq_cpos = -1;
1072       ncbi->seq_L    = -1;
1073 
1074       /* read and parse the ncbi header */
1075       if ((status = parse_header(ncbi, sq)) != eslOK) return status;
1076 
1077       sq->start    = 1;
1078       sq->C        = 0;	/* no context in first window                   */
1079       sq->L        = -1;	/* won't be known 'til EOD.                     */
1080       ncbi->seq_L  = -1;	/* init to 0, so we can count residues as we go */
1081       esl_sq_SetSource(sq, sq->name);
1082 
1083     }
1084     else
1085     { /* else we're reading a window other than first; slide context over. */
1086       sq->C = ESL_MIN(C, sq->n);
1087 
1088       /* if the case where the window is smaller than the context and the
1089        * context is not full, it is not necessary to move the context part
1090        * of the sequence that has been read in.
1091        */
1092       if (sq->C >= C) {
1093         if (sq->seq != NULL) memmove(sq->seq,   sq->seq + sq->n - sq->C,     sq->C);
1094         else                 memmove(sq->dsq+1, sq->dsq + sq->n - sq->C + 1, sq->C);
1095         sq->start = sq->end - sq->C + 1;
1096         sq->n = C;
1097       }
1098     }
1099 
1100     if ((status = esl_sq_GrowTo(sq, C+W)) != eslOK)                return status; /* EMEM    */
1101     if (ncbi->alphatype == eslAMINO) status = read_nres_amino(sqfp, sq, W, &nres);
1102     else                             status = read_nres_dna(sqfp, sq, W, &nres);
1103 
1104     if (status == eslEOD)
1105     {
1106       (ncbi->index)++;
1107       sq->start  = 0;
1108       sq->end    = 0;
1109       sq->C      = 0;
1110       sq->W      = 0;
1111       sq->n      = 0;
1112 
1113       if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; /* erase the saved context */
1114       else                 sq->seq[0] = '\0';
1115 
1116       return eslEOD;
1117     }
1118     else if (status == eslOK)
1119     { /* Forward strand is still in progress. <= W residues were read. Return eslOK. */
1120       sq->end        = sq->start + sq->C + nres - 1;
1121       sq->W          = nres;
1122       return eslOK;
1123     }
1124     else return status;	/* EFORMAT,EMEM */
1125   }
1126   /*NOTREACHED*/
1127   return eslOK;
1128 }
1129 
1130 /* Function:  sqncbi_ReadBlock()
1131  * Synopsis:  Read the next block of sequences from a file.
1132  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
1133  *
1134  * Purpose:   Reads a block of sequences from open sequence file <sqfp> into
1135  *            <sqBlock>.
1136  *
1137  *            In the case that <long_target> is false, the sequences are
1138  *            expected to be protein - individual sequences won't be long
1139  *            so read them in one-whole-sequence at a time. If <max_sequences>
1140  *            is set to a number > 0 read <max_sequences> sequences.
1141  *
1142  *            If <long_target> is true, the sequences are expected to be DNA.
1143  *            Because sequences in a DNA database can exceed MAX_RESIDUE_COUNT,
1144  *            this function uses ReadWindow to read chunks of sequence no
1145  *            larger than <max_residues>, and must allow for the possibility that
1146  *            a request will be made to continue reading a partly-read
1147  *            sequence.
1148  *
1149  * Returns:   <eslOK> on success; the new sequence is stored in <sqBlock>.
1150  *
1151  *            Returns <eslEOF> when there is no sequence left in the
1152  *            file (including first attempt to read an empty file).
1153  *
1154  *            Returns <eslEFORMAT> if there's a problem with the format,
1155  *            such as an illegal character;
1156  *
1157  * Throws:    <eslEMEM> on allocation failure;
1158  *            <eslEINCONCEIVABLE> on internal error.
1159  */
1160 static int
sqncbi_ReadBlock(ESL_SQFILE * sqfp,ESL_SQ_BLOCK * sqBlock,int max_residues,int max_sequences,int long_target)1161 sqncbi_ReadBlock(ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target)
1162 {
1163 	  int     i = 0;
1164 	  int     size = 0;
1165 	  int     status = eslOK;
1166 	  ESL_SQ *tmpsq  = NULL;
1167 
1168 	  sqBlock->count = 0;
1169 
1170 	  if ( !long_target  )
1171 	  {  /* in these cases, an individual sequence won't ever be really long,
1172 			     so just read in a sequence at a time  */
1173 
1174 	    if (max_sequences < 1 || max_sequences > sqBlock->listSize)
1175 	      max_sequences = sqBlock->listSize;
1176 
1177 		  for (i = 0; i < max_sequences && size < MAX_RESIDUE_COUNT; ++i)
1178 		  {
1179 			  status = sqncbi_Read(sqfp, sqBlock->list + i);
1180 			  if (status != eslOK) break;
1181 			  size += sqBlock->list[i].n;
1182 			  ++sqBlock->count;
1183 		  }
1184 	  }
1185 	  else
1186 	  { /* DNA, not an alignment.  Might be really long sequences */
1187 
1188           /*this variable was used instead of the MAX_RESIDUE_COUNT macro because old H3 impl_dummy could've required shorter sequences to fit in memory*/
1189           if (max_residues < 0)
1190             max_residues = MAX_RESIDUE_COUNT;
1191 
1192           tmpsq = esl_sq_CreateDigital(sqBlock->list->abc);
1193 
1194 		  //if complete flag set to FALSE, then the prior block must have ended with a window that was a possibly
1195 		  //incomplete part of it's full sequence. Read another overlapping window.
1196 		  if (! sqBlock->complete )
1197 		  {
1198 			  //overloading C as indicator of how big C should be for this window reading action
1199 			  status = sqncbi_ReadWindow(sqfp, sqBlock->list->C, max_residues, sqBlock->list);
1200 			  if (status == eslOK)
1201 			  {
1202 				  sqBlock->count = i = 1;
1203 				  size = sqBlock->list->n;
1204 				  sqBlock->list[i].L = sqfp->data.ncbi.seq_L;
1205 				  if (sqBlock->list->n >= max_residues)
1206 				  { // Filled the block with a single very long window.
1207 
1208 				    if ( sqBlock->list->n == sqfp->data.ncbi.seq_L) {
1209 				      sqBlock->complete = TRUE;
1210 				      esl_sq_Reuse(tmpsq);
1211 				      tmpsq->start =  sqBlock->list->start ;
1212 				      tmpsq->C = 0;
1213 				      status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq); // burn off the EOD
1214               if (status == eslEOD) // otherwise, the unexpected status will be returned
1215                 status = eslOK;
1216 
1217 				    } else {
1218 				      sqBlock->complete = FALSE;
1219 				    }
1220 
1221 	          if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1222 	          return status;
1223 
1224 				  }
1225 				  else
1226 				  {
1227 					  // Burn off EOD (see notes for similar entry ~25 lines below), then go fetch the next sequence
1228 					  esl_sq_Reuse(tmpsq);
1229 					  tmpsq->start =  sqBlock->list->start ;
1230 					  tmpsq->C = 0;
1231 					  status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq);
1232 					  if (status != eslEOD) {
1233 					    if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1234 					    return status; //surprising
1235 					  }
1236 	          sqBlock->list->L = tmpsq->L;
1237 				  }
1238 			  }
1239 			  else if (status == eslEOD)
1240 			  { // turns out there isn't any more of the sequence to read, after all
1241 
1242 			  }
1243 			  else
1244 			  {
1245 			    if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1246 				  return status;
1247 			  }
1248 		  } // otherwise, just start at the beginning
1249 
1250 
1251 		  for (  ; i < sqBlock->listSize && size < max_residues; ++i)
1252 		  {
1253 	      /* restricted request_size is used to ensure that all blocks are pretty close to the
1254 	       * same size. Without it, we may either naively keep asking for max_residue windows,
1255 	       * which can result in a window with ~2*max_residues ... or we can end up with absurdly
1256 	       * short fragments at the end of blocks
1257 	       */
1258 		    int request_size = ESL_MAX(max_residues-size, max_residues * .05);
1259 
1260 			  esl_sq_Reuse(tmpsq);
1261 			  esl_sq_Reuse(sqBlock->list + i);
1262 			  status = sqncbi_ReadWindow(sqfp, 0, request_size, tmpsq);
1263         esl_sq_Copy(tmpsq, sqBlock->list +i);
1264 			  if (status != eslOK) break; // end of sequences
1265 
1266 			  size += sqBlock->list[i].n - sqBlock->list[i].C;
1267 			  sqBlock->list[i].L = sqfp->data.ncbi.seq_L;
1268 			  ++(sqBlock->count);
1269 			  if (size >= max_residues)
1270 		     { // a full window worth of sequence was read
1271 
1272           if ( sqBlock->list[i].n == sqfp->data.ncbi.seq_L) {
1273              sqBlock->complete = TRUE;
1274              esl_sq_Reuse(tmpsq);
1275              tmpsq->start =  sqBlock->list->start ;
1276              tmpsq->C = 0;
1277              status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq); // burn off the EOD
1278              if (status == eslEOD) // otherwise, the unexpected status will be returned
1279                 status = eslOK;
1280 
1281            } else {
1282              sqBlock->complete = FALSE;
1283            }
1284 
1285 
1286 		       if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1287 		       return status;
1288 
1289 			  }
1290 			  else
1291 			  {
1292 				  /* Sequence was finished before filling a full window. Need to burn off the EOD value that will be
1293 				     returned by the next ReadWindow call. Can just use a tmp sq, after setting a couple
1294 				     values ReadWindow needs to see for correct processing.
1295 				   */
1296 				  esl_sq_Reuse(tmpsq);
1297 				  tmpsq->start =  sqBlock->list[i].start ;
1298 				  tmpsq->end   =  sqBlock->list[i].end ;
1299 				  tmpsq->n     =  sqBlock->list[i].n ;
1300 				  //tmpsq->doff  =  sqBlock->list[i].doff ;
1301 				  tmpsq->idx   =  sqBlock->list[i].idx ;
1302 
1303 				  tmpsq->C = 0;
1304 				  status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq);
1305 		       if (status != eslEOD) {
1306 		         if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1307 		         return status; //surprising
1308 		       }
1309 		       //sqBlock->list[i].L = tmpsq->L;
1310 				  status = eslOK;
1311 			  }
1312 		  }
1313 	  }
1314 
1315 	  /* EOF will be returned only in the case were no sequences were read */
1316 	  if (status == eslEOF && i > 0) status = eslOK;
1317 
1318 	  sqBlock->complete = TRUE;
1319 
1320 	  if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1321 
1322 	  return status;
1323 }
1324 
1325 /* Function:  sqncbi_Echo()
1326  * Synopsis:  Echo a sequence's record onto output stream.
1327  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
1328  *
1329  * Returns:   <eslEUNIMPLEMENTED>.
1330  */
1331 static int
sqncbi_Echo(ESL_SQFILE * sqfp,const ESL_SQ * sq,FILE * ofp)1332 sqncbi_Echo(ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp)
1333 {
1334   ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from NCBI database");
1335   return eslEUNIMPLEMENTED;
1336 }
1337 /*------------------ end, sequential sequence input -------------*/
1338 
1339 
1340 
1341 /* Function:  reset_db()
1342  * Synopsis:  Resets a sequence file.
1343  * Incept:    MSF, Wed Mar 17, 2010 [Janelia]
1344  *
1345  * Purpose:   Closes just the currently opened db and frees
1346  *            up the memory associated with that db.  This is
1347  *            used primarily with multi-volume databases where
1348  *            the current db is closed so the next one can be
1349  *            opened.
1350  *
1351  * Returns:   void.
1352  */
1353 static void
reset_db(ESL_SQNCBI_DATA * ncbi)1354 reset_db(ESL_SQNCBI_DATA *ncbi)
1355 {
1356   if (ncbi->title     != NULL) free(ncbi->title);
1357   if (ncbi->timestamp != NULL) free(ncbi->timestamp);
1358 
1359   if (ncbi->fppin != NULL) fclose(ncbi->fppin);
1360   if (ncbi->fpphr != NULL) fclose(ncbi->fpphr);
1361   if (ncbi->fppsq != NULL) fclose(ncbi->fppsq);
1362 
1363   ncbi->fppin        = NULL;
1364   ncbi->fpphr        = NULL;
1365   ncbi->fppsq        = NULL;
1366 
1367   ncbi->title        = NULL;
1368   ncbi->timestamp    = NULL;
1369 
1370   return;
1371 }
1372 
1373 
1374 /* reset_header_values()
1375  *
1376  * Clear the header values so it is clear which values
1377  * have been set by the current header.
1378  */
1379 static void
reset_header_values(ESL_SQNCBI_DATA * ncbi)1380 reset_header_values(ESL_SQNCBI_DATA *ncbi)
1381 {
1382   ncbi->name_ptr    = NULL;
1383   ncbi->name_size   = 0;
1384   ncbi->acc_ptr     = NULL;
1385   ncbi->acc_size    = 0;
1386   ncbi->int_id      = -1;
1387   ncbi->str_id_ptr  = NULL;
1388   ncbi->str_id_size = 0;
1389 }
1390 
1391 /* volume_open()
1392  *
1393  * Open up the index, head and sequence files for a particular
1394  * volume.  Parse the first three fields in the index file,
1395  * one more time, just to make sure nothing funny is going on.
1396  */
1397 static int
volume_open(ESL_SQNCBI_DATA * ncbi,int volume)1398 volume_open(ESL_SQNCBI_DATA *ncbi, int volume)
1399 {
1400   int       len;
1401   uint32_t  info[4];
1402   int       dbtype;
1403   int       status = eslOK;	/* return status from an ESL call */
1404 
1405   char     *name;
1406 
1407   if (volume < 0 || volume > ncbi->volumes) return eslEINVAL;
1408 
1409   /* if the db has no volumes return */
1410   if (ncbi->volumes == 0) return eslOK;
1411 
1412   if (ncbi->fppin != NULL) fclose(ncbi->fppin);
1413   if (ncbi->fpphr != NULL) fclose(ncbi->fpphr);
1414   if (ncbi->fppsq != NULL) fclose(ncbi->fppsq);
1415 
1416   name = ncbi->vols[volume].name;
1417   len  = strlen(name);
1418 
1419   dbtype = (ncbi->alphatype == eslDNA) ? NCBI_DNA_DB : NCBI_AMINO_DB;
1420 
1421   /* Check for basic database first */
1422   strcpy(name+len, ".Xin");
1423   name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
1424   if ((ncbi->fppin = fopen(name, "rb")) == NULL) {
1425     status = eslFAIL;
1426     goto ERROR;
1427   }
1428   strcpy(name+len, ".Xhr");
1429   name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
1430   if ((ncbi->fpphr = fopen(name, "rb")) == NULL) {
1431     status = eslFAIL;
1432     goto ERROR;
1433   }
1434   strcpy(name+len, ".Xsq");
1435   name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
1436   if ((ncbi->fppsq = fopen(name, "rb")) == NULL) {
1437     status = eslFAIL;
1438     goto ERROR;
1439   }
1440 
1441   /* quickly parse the header make sure we are sane. */
1442   if (fread(&info[0], sizeof(uint32_t), 3, ncbi->fppin) != 3) status = eslFAIL;
1443   if (htobe32(info[0]) != NCBI_VERSION_4)                     status = eslEFORMAT;
1444   if (htobe32(info[1]) != dbtype)                             status = eslEFORMAT;
1445 
1446   if (status != eslOK) goto ERROR;
1447 
1448   /* save the offsets to the index tables */
1449   ncbi->hdr_off = ncbi->vols[volume].hdr_off;
1450   ncbi->seq_off = ncbi->vols[volume].seq_off;
1451   if (dbtype == NCBI_DNA_DB) {
1452     ncbi->amb_off = ncbi->vols[volume].amb_off;
1453   }
1454 
1455   ncbi->vol_index   = volume;
1456   ncbi->index_start = -1;
1457   ncbi->index_end   = -1;
1458 
1459   /* skip the first sentinal byte in the .psq file */
1460   fgetc(ncbi->fppsq);
1461 
1462   /* zero terminate the name other functions can just
1463    * tack on any extension without a lot of testing.
1464    */
1465   name[len] = '\0';
1466 
1467   return eslOK;
1468 
1469  ERROR:
1470 
1471   reset_db(ncbi);
1472 
1473   return status;
1474 }
1475 
1476 
1477 /* pos_sequence_offsets()
1478  *
1479  * Position the sequence and header files for reading at
1480  * the start of the indexed sequence <inx>.  This routine
1481  * buffers <INDEX_TABLE_SIZE> offsets from the header and
1482  * sequence files.  If the index <inx> is not in the
1483  * currently buffered table, read the the indexes.  If the
1484  * index is not in the current volume find which volume
1485  * the indexed sequence is in and open up that database.
1486  */
1487 static int
pos_sequence(ESL_SQNCBI_DATA * ncbi,int inx)1488 pos_sequence(ESL_SQNCBI_DATA *ncbi, int inx)
1489 {
1490   int        cnt;
1491   int        status;
1492 
1493   uint32_t   offset;
1494   uint32_t   start;
1495   uint32_t   end;
1496 
1497   ESL_SQNCBI_VOLUME *volume;
1498 
1499   if (inx < 0 || inx > ncbi->num_seq) return eslEINVAL;
1500 
1501   start = ncbi->index_start;
1502   end   = ncbi->index_end;
1503 
1504   /* get the offsets for the header, sequence and ambiguity table */
1505   if (ncbi->index_start == -1 || inx < start || inx > end) {
1506 
1507     /* if the db is broken up into volumes, lets find the correct one to use */
1508     if (ncbi->volumes > 0) {
1509       volume = ncbi->vols + ncbi->vol_index;
1510       if (inx < volume->start_seq || inx > volume->end_seq) {
1511 	volume = ncbi->vols;
1512 	for (cnt = 0; cnt < ncbi->volumes; ++cnt) {
1513 	  if (inx < volume->end_seq) break;
1514 	  ++volume;
1515 	}
1516 
1517 	/* check just to make sure we found the volume */
1518 	if (cnt >= ncbi->volumes) return eslFAIL;
1519 
1520 	if ((status = volume_open(ncbi, cnt)) != eslOK) return status;
1521       }
1522     }
1523 
1524     /* adjust where we start reading from if we are reading forwards or backwards */
1525     if (ncbi->index_start == -1 || inx > end) {
1526       start = inx;
1527     } else {
1528       start = inx + 2;
1529       start = (start > INDEX_TABLE_SIZE) ? start - INDEX_TABLE_SIZE : 0;
1530     }
1531     ncbi->index_start = start;
1532 
1533     /* when calculating the count be sure to take into account the fact that the
1534      * index tables contain one index more that the number of sequences and this
1535      * last index is used to point to the end of the last header and sequences.
1536      */
1537     if (ncbi->volumes > 0) {
1538       cnt = volume->end_seq - inx + 2;     // cppcheck thinks end_seq can be uninitialized here. I think it's wrong.
1539       start = start - volume->start_seq;   //  .. and ditto for start_seq.
1540     } else {
1541       cnt = ncbi->num_seq - inx + 1;
1542     }
1543     cnt = (cnt > INDEX_TABLE_SIZE) ? INDEX_TABLE_SIZE : cnt;
1544     ncbi->index_end = ncbi->index_start + cnt - 2;
1545 
1546     offset = ncbi->hdr_off + (sizeof(uint32_t) * start);
1547     if (fseek(ncbi->fppin, offset, SEEK_SET) != 0) {
1548       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error seeking header index %d\n", offset);
1549     }
1550     if (fread(ncbi->hdr_indexes, sizeof(uint32_t), cnt, ncbi->fppin) != cnt) {
1551       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error reading header index %d at %d(%d)\n", start, offset, cnt);
1552     }
1553 
1554     offset = ncbi->seq_off + (sizeof(uint32_t) * start);
1555     if (fseek(ncbi->fppin, offset, SEEK_SET) != 0) {
1556       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error seeking sequence index %d\n", offset);
1557     }
1558     if (fread(ncbi->seq_indexes, sizeof(uint32_t), cnt, ncbi->fppin) != cnt) {
1559       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error reading sequence index %d at %d(%d)\n", start, offset, cnt);
1560     }
1561 
1562     if (ncbi->alphatype == eslDNA) {
1563       offset = ncbi->amb_off + (sizeof(uint32_t) * start);
1564       if (fseek(ncbi->fppin, offset, SEEK_SET) != 0) {
1565 	ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error seeking ambiguity index %d\n", offset);
1566       }
1567       if (fread(ncbi->amb_indexes, sizeof(uint32_t), cnt, ncbi->fppin) != cnt) {
1568 	ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error reading ambiguity index %d at %d(%d)\n", start, offset, cnt);
1569       }
1570     }
1571   }
1572 
1573   ncbi->index = inx;
1574 
1575   inx -= ncbi->index_start;
1576   ncbi->roff = htobe32(ncbi->hdr_indexes[inx]);
1577   ncbi->doff = htobe32(ncbi->seq_indexes[inx]);
1578   ncbi->hoff = htobe32(ncbi->hdr_indexes[inx+1]);
1579   ncbi->eoff = htobe32(ncbi->seq_indexes[inx+1]);
1580 
1581   if (ncbi->alphatype == eslDNA) {
1582     ncbi->seq_apos = htobe32(ncbi->amb_indexes[inx]);
1583     ncbi->seq_alen = ncbi->seq_apos + htobe32(ncbi->amb_indexes[inx+1]) + 1;
1584   } else {
1585     ncbi->seq_apos = 0;
1586     ncbi->seq_alen = 0;
1587   }
1588 
1589   if (fseek(ncbi->fpphr, ncbi->roff, SEEK_SET) != 0) return eslESYS;
1590   if (fseek(ncbi->fppsq, ncbi->doff, SEEK_SET) != 0) return eslESYS;
1591 
1592   return eslOK;
1593 }
1594 
1595 /* Function:  read_amino()
1596  * Synopsis:  Read in the amino sequence
1597  * Incept:    MSF, Wed Jan 27, 2010 [Janelia]
1598  *
1599  * Purpose:   Read and translate the amino acid sequence.
1600  */
1601 static int
read_amino(ESL_SQFILE * sqfp,ESL_SQ * sq)1602 read_amino(ESL_SQFILE *sqfp, ESL_SQ *sq)
1603 {
1604   int     inx;
1605   int     size;
1606 
1607   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1608 
1609   if (ncbi->index >= ncbi->num_seq) return eslEOF;
1610 
1611   size = sq->eoff - sq->doff;
1612 
1613   /* figure out the sequence length */
1614   if (esl_sq_GrowTo(sq, size) != eslOK) return eslEMEM;
1615 
1616   /* figure out if the sequence is in digital mode or not */
1617   if (sq->dsq != NULL) {
1618     ESL_DSQ *ptr = sq->dsq + 1;
1619     if (fread(ptr, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1620     for (inx = 0; inx < size - 1; ++inx) {
1621       *ptr = sqfp->inmap[(int) *ptr];
1622       ++ptr;
1623     }
1624     *ptr = eslDSQ_SENTINEL;
1625   } else {
1626     char *ptr = sq->seq;
1627     if (fread(ptr, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1628     for (inx = 0; inx < size - 1; ++inx) {
1629       *ptr = sqfp->inmap[(int) *ptr];
1630       *ptr = ncbi->alphasym[(int) *ptr];
1631       ++ptr;
1632     }
1633     *ptr = '\0';
1634   }
1635 
1636   sq->start = 1;
1637   sq->end   = size - 1;
1638   sq->C     = 0;
1639   sq->W     = size - 1;
1640   sq->L     = size - 1;
1641   sq->n     = size - 1;
1642 
1643   return eslOK;
1644 }
1645 
1646 
1647 /* Function:  read_dna()
1648  * Synopsis:  Read in the dna sequence
1649  * Incept:    MSF, Wed Jan 27, 2010 [Janelia]
1650  *
1651  * Purpose:   Read and translate the dna sequence.
1652  */
1653 static int
read_dna(ESL_SQFILE * sqfp,ESL_SQ * sq)1654 read_dna(ESL_SQFILE *sqfp, ESL_SQ *sq)
1655 {
1656   int64_t  inx;
1657   int64_t  cnt;
1658   int64_t  off;
1659   int      size;
1660   int      text;
1661   int      status;
1662   int      amb32;
1663 
1664   int      remainder;
1665   int      length;
1666   int      ssize;
1667   int      n;
1668 
1669   unsigned char *ptr;
1670   void    *t;
1671 
1672   unsigned char c;
1673 
1674   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1675 
1676   if (ncbi->index >= ncbi->num_seq) return eslEOF;
1677 
1678   /* calculate the max size of the sequence.  It is most likely
1679    * a bit smaller, but we need to read the sequence in first
1680    * before the real size can be figured out.
1681    */
1682   size = sq->eoff - sq->doff;
1683   if (ncbi->hdr_alloced < size) {
1684     while (ncbi->hdr_alloced < size) ncbi->hdr_alloced += ncbi->hdr_alloced;
1685     ESL_RALLOC(ncbi->hdr_buf, t, sizeof(char) * ncbi->hdr_alloced);
1686   }
1687   if (fread(ncbi->hdr_buf, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1688 
1689   ssize     = ncbi->seq_apos - sq->doff - 1;
1690   remainder = *(ncbi->hdr_buf + ssize) & 0x03;
1691   length    = ssize * 4 + remainder;
1692 
1693   /* figure out the sequence length */
1694   if (esl_sq_GrowTo(sq, length) != eslOK) return eslEMEM;
1695 
1696   /* figure out if the sequence is in digital mode or not */
1697   if (sq->dsq != NULL) {
1698     text = FALSE;
1699     ptr = (unsigned char *) sq->dsq + 1;
1700   } else {
1701     text = TRUE;
1702     ptr = (unsigned char *) sq->seq;
1703   }
1704 
1705   for (inx = 0; inx < ssize; ++inx) {
1706     c = ncbi->hdr_buf[inx];
1707     n = 1 << ((c >> 6) & 0x03);
1708     *ptr = sqfp->inmap[n];
1709     if (text) *ptr = ncbi->alphasym[(int) *ptr];
1710     ++ptr;
1711     n = 1 << ((c >> 4) & 0x03);
1712     *ptr = sqfp->inmap[n];
1713     if (text) *ptr = ncbi->alphasym[(int) *ptr];
1714     ++ptr;
1715     n = 1 << ((c >> 2) & 0x03);
1716     *ptr = sqfp->inmap[n];
1717     if (text) *ptr = ncbi->alphasym[(int) *ptr];
1718     ++ptr;
1719     n = 1 << ((c >> 0) & 0x03);
1720     *ptr = sqfp->inmap[n];
1721     if (text) *ptr = ncbi->alphasym[(int) *ptr];
1722     ++ptr;
1723   }
1724 
1725   /* handle the remainder */
1726   c = ncbi->hdr_buf[inx];
1727   for (inx = 0; inx < remainder; ++inx) {
1728     n = 1 << ((c >> (6 - inx * 2)) & 0x03);
1729     *ptr = sqfp->inmap[n];
1730     if (text) *ptr = ncbi->alphasym[(int) *ptr];
1731     ++ptr;
1732   }
1733 
1734   *ptr = (text) ? '\0' : eslDSQ_SENTINEL;
1735 
1736   /* we need to look that the first by the the ambiguity table
1737    * to see if the entries are 32 or 64 bit entries.
1738    */
1739   amb32 = 0;
1740   if (ncbi->seq_apos - sq->doff < size) {
1741     amb32 = ((ncbi->hdr_buf[ncbi->seq_apos - sq->doff] & 0x80) == 0);
1742   }
1743 
1744   /* skip past the count and start processing the abmiguity table */
1745   ssize = ncbi->seq_apos - sq->doff + 4;
1746   ptr = (text) ? (unsigned char *)sq->seq : (unsigned char *)sq->dsq + 1;
1747 
1748   while (ssize < size) {
1749     /* get the ambiguity character */
1750     n = ((ncbi->hdr_buf[ssize] >> 4) & 0x0f);
1751     c = sqfp->inmap[n];
1752     if (text) c = ncbi->alphasym[(int) c];
1753 
1754     if (amb32) {
1755       /* get the repeat count 4 bits */
1756       cnt = (ncbi->hdr_buf[ssize] & 0x0f);
1757       cnt += 1;
1758 
1759       /* get the offset 24 bits */
1760       off = ncbi->hdr_buf[ssize+1];
1761       off = (off << 8) | ncbi->hdr_buf[ssize+2];
1762       off = (off << 8) | ncbi->hdr_buf[ssize+3];
1763 
1764       for (inx = 0; inx < cnt; ++inx) ptr[off+inx] = c;
1765 
1766       ssize += 4;
1767     } else {
1768       /* get the repeat count 12 bits */
1769       cnt = (ncbi->hdr_buf[ssize] & 0x0f);
1770       cnt = (cnt << 8) | ncbi->hdr_buf[ssize+1];
1771       cnt += 1;
1772 
1773       /* get the offset 48 bits*/
1774       off = ncbi->hdr_buf[ssize+2];
1775       off = (off << 8) | ncbi->hdr_buf[ssize+3];
1776       off = (off << 8) | ncbi->hdr_buf[ssize+4];
1777       off = (off << 8) | ncbi->hdr_buf[ssize+5];
1778       off = (off << 8) | ncbi->hdr_buf[ssize+6];
1779       off = (off << 8) | ncbi->hdr_buf[ssize+7];
1780 
1781       for (inx = 0; inx < cnt; ++inx) ptr[off+inx] = c;
1782 
1783       ssize += 8;
1784     }
1785   }
1786 
1787   sq->start = 1;
1788   sq->end   = length;
1789   sq->C     = 0;
1790   sq->W     = length;
1791   sq->L     = length;
1792   sq->n     = length;
1793 
1794   return eslOK;
1795 
1796  ERROR:
1797   return eslEMEM;
1798 }
1799 
1800 
1801 /* Function:  read_nres_amino()
1802  * Synopsis:  Read in the amino sequence
1803  * Incept:    MSF, Wed Jan 27, 2010 [Janelia]
1804  *
1805  * Purpose:   Read and translate the dna sequence.
1806  */
1807 static int
read_nres_amino(ESL_SQFILE * sqfp,ESL_SQ * sq,int len,uint64_t * nres)1808 read_nres_amino(ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres)
1809 {
1810   int     inx;
1811   int     off;
1812   int     size;
1813 
1814   unsigned char   *ptr;
1815   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1816 
1817   if (ncbi->index >= ncbi->num_seq) return eslEOF;
1818 
1819   /* if we don't know the sequence length, figure it out */
1820   if (ncbi->seq_L == -1) ncbi->seq_L = sq->eoff - sq->doff - 1;
1821 
1822   /* check if we are at the end */
1823   if (sq->start + sq->n > ncbi->seq_L) {
1824     if (nres != NULL) *nres = 0;
1825     sq->L = ncbi->seq_L;
1826     return eslEOD;
1827   }
1828 
1829   /* figure out if the sequence is in digital mode or not */
1830   ptr = (sq->dsq != NULL) ? (unsigned char *)sq->dsq + 1 : (unsigned char *) sq->seq;
1831   ptr += sq->n;
1832 
1833   /* calculate where to start reading from */
1834   off   = sq->doff + sq->start + sq->n - 1;
1835 
1836   /* calculate the size to read */
1837   size = ncbi->seq_L - (sq->start + sq->n - 1);
1838   size = (size > len) ? len : size;
1839 
1840   /* seek to the windows location and read into the buffer */
1841   if (fseek(ncbi->fppsq, off, SEEK_SET) != 0) return eslESYS;
1842   if (fread(ptr, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1843 
1844   /* figure out if the sequence is in digital mode or not */
1845   for (inx = 0; inx < size; ++inx) {
1846     *ptr = sqfp->inmap[(int) *ptr];
1847     if (sq->dsq == NULL) *ptr = ncbi->alphasym[(int) *ptr];
1848     ++ptr;
1849   }
1850 
1851   *ptr = (sq->dsq == NULL) ? '\0' : eslDSQ_SENTINEL;
1852 
1853   sq->n = sq->n + size;
1854 
1855   if (nres != NULL) *nres = size;
1856 
1857   return eslOK;
1858 }
1859 
1860 /* Function:  correct_ambiguity()
1861  * Synopsis:  Read in the dna sequence
1862  * Incept:    MSF, Thu Feb 4, 2010 [Janelia]
1863  *
1864  * Purpose:   Correct any ambiguity characters.
1865  */
1866 static int
correct_ambiguity(ESL_SQFILE * sqfp,ESL_SQ * sq,int len)1867 correct_ambiguity(ESL_SQFILE *sqfp, ESL_SQ *sq, int len)
1868 {
1869   int64_t   alen;         /* ambiguity length       */
1870   int64_t   soff;         /* starting offset        */
1871   int64_t   eoff;         /* ending offset          */
1872   int64_t   ainx;         /* ambiguity index        */
1873   int64_t   size;         /* size of table read in  */
1874   int64_t   cnt;          /* repeat count           */
1875   int64_t   off;
1876   int64_t   n;
1877 
1878   int       amb32;        /* flag for 32 or 64 bits */
1879   char     *ptr;
1880 
1881   unsigned char c;
1882 
1883   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1884 
1885   if (ncbi->seq_alen == 0) return eslOK;
1886 
1887   /* go to the start of the ambiguity table and see if the table
1888    * is in 32 or 64 bit entries.
1889    */
1890   if (fseek(ncbi->fppsq, ncbi->seq_apos, SEEK_SET)       != 0) return eslESYS;
1891   if (fread(ncbi->hdr_buf, sizeof(char), 4, ncbi->fppsq) != 4) return eslEFORMAT;
1892   amb32 = ((ncbi->hdr_buf[0] & 0x80) == 0);
1893 
1894   ptr = (sq->dsq != NULL) ? (char *)sq->dsq + 1 : sq->seq;
1895   ptr += sq->n;
1896 
1897   /* calculate the starting and ending offsets */
1898   soff = sq->start + sq->n - 1;
1899   eoff = soff + len;
1900 
1901   off = 0;
1902   ainx = 0;
1903   size = 0;
1904   alen = ncbi->seq_alen - 4;
1905   while (off < eoff) {
1906     /* check if we need to read in more of the  abmiguity table */
1907     if (ainx == size) {
1908       size = alen;
1909       size = (size > INIT_HDR_BUFFER_SIZE) ? INIT_HDR_BUFFER_SIZE : size;
1910       if (fread(ncbi->hdr_buf, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1911       alen -= size;
1912       ainx = 0;
1913     }
1914 
1915     /* get the ambiguity character */
1916     n = ((ncbi->hdr_buf[ainx] >> 4) & 0x0f);
1917     c = sqfp->inmap[n];
1918     if (sq->dsq == NULL) c = ncbi->alphasym[(int) c];
1919 
1920     if (amb32) {
1921       /* get the repeat count 4 bits */
1922       cnt = (ncbi->hdr_buf[ainx] & 0x0f);
1923       cnt += 1;
1924 
1925       /* get the offset 24 bits */
1926       off = ncbi->hdr_buf[ainx+1];
1927       off = (off << 8) | ncbi->hdr_buf[ainx+2];
1928       off = (off << 8) | ncbi->hdr_buf[ainx+3];
1929 
1930       ainx += 4;
1931     } else {
1932       /* get the repeat count 12 bits */
1933       cnt = (ncbi->hdr_buf[ainx] & 0x0f);
1934       cnt = (cnt << 8) | ncbi->hdr_buf[ainx+1];
1935       cnt += 1;
1936 
1937       /* get the offset 48 bits*/
1938       off = ncbi->hdr_buf[ainx+2];
1939       off = (off << 8) | ncbi->hdr_buf[ainx+3];
1940       off = (off << 8) | ncbi->hdr_buf[ainx+4];
1941       off = (off << 8) | ncbi->hdr_buf[ainx+5];
1942       off = (off << 8) | ncbi->hdr_buf[ainx+6];
1943       off = (off << 8) | ncbi->hdr_buf[ainx+7];
1944 
1945       ainx += 8;
1946     }
1947 
1948     if (off + cnt >= soff && off < eoff) {
1949       int inx;
1950       int start = (off > soff) ? off - soff : 0;
1951       int end   = (off + cnt > eoff) ? eoff : off - soff + cnt;
1952       for (inx = start; inx < end; ++inx) ptr[inx] = c;
1953     }
1954 
1955     off += cnt;
1956   }
1957 
1958   return eslOK;
1959 }
1960 
1961 /* Function:  read_nres_dna()
1962  * Synopsis:  Read in the dna sequence
1963  * Incept:    MSF, Wed Jan 27, 2010 [Janelia]
1964  *
1965  * Purpose:   Read and translate the dna sequence.
1966  */
1967 static int
read_nres_dna(ESL_SQFILE * sqfp,ESL_SQ * sq,int len,uint64_t * nres)1968 read_nres_dna(ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres)
1969 {
1970   int     inx;
1971   int     off;
1972   int     cnt;
1973   int     ncnt;
1974   int     start;
1975   int     skip;
1976   int     text;
1977   int     status;
1978 
1979   int     remainder;
1980   int     length;
1981   int     ssize;
1982   int     n;
1983 
1984   unsigned char   *ptr;
1985   void   *t;
1986 
1987   unsigned char c;
1988 
1989   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1990 
1991   if (ncbi->index >= ncbi->num_seq) return eslEOF;
1992 
1993   /* if we don't know the sequence length, figure it out */
1994   if (ncbi->seq_L == -1) {
1995     if (fseek(ncbi->fppsq, ncbi->seq_apos - 1, SEEK_SET) != 0) return eslESYS;
1996     if (fread(&c, sizeof(char), 1, ncbi->fppsq) != 1)          return eslEFORMAT;
1997 
1998     ssize       = ncbi->seq_apos - sq->doff - 1;
1999     remainder   = c & 0x03;
2000     length      = ssize * 4 + remainder;
2001 
2002     ncbi->seq_L = length;
2003   }
2004 
2005   /* check if we are at the end */
2006   if (sq->start + sq->n > ncbi->seq_L) {
2007     if (nres != NULL) *nres = 0;
2008     sq->L = ncbi->seq_L;
2009     return eslEOD;
2010   }
2011 
2012   /* calculate where to start reading from */
2013   start = sq->start + sq->n - 1;
2014   off   = sq->doff + start / 4;
2015 
2016   /* calculate bits to skip at the beginning and end */
2017   cnt   = ncbi->seq_L - (sq->start + sq->n - 1);
2018   cnt   = (cnt > len) ? len : cnt;
2019 
2020   skip      = start & 0x03;
2021   remainder = skip + cnt;
2022   remainder = (remainder & 0x03) ? (remainder & 0x03) : 4;
2023 
2024   /* calculate bytes need to read in the window */
2025   ssize = (cnt + skip + 3) / 4;
2026 
2027   /* seek to the windows location and read into the buffer */
2028   if (fseek(ncbi->fppsq, off, SEEK_SET) != 0) return eslESYS;
2029   if (ncbi->hdr_alloced < ssize) {
2030     while (ncbi->hdr_alloced < ssize) ncbi->hdr_alloced += ncbi->hdr_alloced;
2031     ESL_RALLOC(ncbi->hdr_buf, t, sizeof(char) * ncbi->hdr_alloced);
2032   }
2033   if (fread(ncbi->hdr_buf, sizeof(char), ssize, ncbi->fppsq) != ssize) return eslEFORMAT;
2034 
2035   /* figure out if the sequence is in digital mode or not */
2036   if (sq->dsq != NULL) {
2037     text = FALSE;
2038     ptr = (unsigned char *)sq->dsq + 1;
2039   } else {
2040     text = TRUE;
2041     ptr = (unsigned char *)sq->seq;
2042   }
2043   ptr += sq->n;
2044 
2045   inx = 0;
2046   c = ncbi->hdr_buf[inx];
2047   for (inx = skip; inx < 4; ++inx) {
2048     n = 1 << ((c >> (6 - inx * 2)) & 0x03);
2049     *ptr = sqfp->inmap[n];
2050     if (text) *ptr = ncbi->alphasym[(int) *ptr];
2051     ++ptr;
2052   }
2053 
2054   for (inx = 1; inx < ssize - 1; ++inx) {
2055     c = ncbi->hdr_buf[inx];
2056     n = 1 << ((c >> 6) & 0x03);
2057     *ptr = sqfp->inmap[n];
2058     if (text) *ptr = ncbi->alphasym[(int) *ptr];
2059     ++ptr;
2060     n = 1 << ((c >> 4) & 0x03);
2061     *ptr = sqfp->inmap[n];
2062     if (text) *ptr = ncbi->alphasym[(int) *ptr];
2063     ++ptr;
2064     n = 1 << ((c >> 2) & 0x03);
2065     *ptr = sqfp->inmap[n];
2066     if (text) *ptr = ncbi->alphasym[(int) *ptr];
2067     ++ptr;
2068     n = 1 << ((c >> 0) & 0x03);
2069     *ptr = sqfp->inmap[n];
2070     if (text) *ptr = ncbi->alphasym[(int) *ptr];
2071     ++ptr;
2072   }
2073 
2074   /* handle the remainder */
2075   c = ncbi->hdr_buf[inx];
2076   for (inx = 0; inx < remainder; ++inx) {
2077     n = 1 << ((c >> (6 - inx * 2)) & 0x03);
2078     *ptr = sqfp->inmap[n];
2079     if (text) *ptr = ncbi->alphasym[(int) *ptr];
2080     ++ptr;
2081   }
2082 
2083   *ptr = (text) ? '\0' : eslDSQ_SENTINEL;
2084 
2085   /* calculate the number of residues processed */
2086   ncnt = (ssize - 1) * 4 + remainder - skip;
2087 
2088   /* start processing the abmiguity table if there is one */
2089   if (ncbi->seq_alen > 0) {
2090     correct_ambiguity(sqfp, sq, ncnt);
2091   }
2092 
2093   sq->n = sq->n + ncnt;
2094 
2095   if (nres != NULL) *nres = ncnt;
2096 
2097   return eslOK;
2098 
2099  ERROR:
2100   return eslEMEM;
2101 }
2102 
2103 
2104 /* Function:  inmap_ncbi()
2105  * Synopsis:  Set up a translation map
2106  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2107  *
2108  * Purpose:   Initialize the translation map used to translate a ncbi
2109  *            sequences to the internal representation used in hmmer.
2110  *
2111  * Returns:   <eslOK> on success;
2112  *
2113  * Throws:    <eslEMEM> on allocation failure;
2114  */
2115 static int
inmap_ncbi(ESL_SQFILE * sqfp)2116 inmap_ncbi(ESL_SQFILE *sqfp)
2117 {
2118   int status = eslOK;
2119 
2120   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
2121 
2122   switch(ncbi->alphatype) {
2123   case eslDNA:
2124     status = inmap_ncbi_dna(sqfp);
2125     break;
2126   case eslAMINO:
2127     status = inmap_ncbi_amino(sqfp);
2128     break;
2129   default:
2130     ESL_EXCEPTION(eslEINVAL, "bad alphabet type: unrecognized");
2131   }
2132 
2133   return status;
2134 }
2135 
2136 /* Function:  inmap_ncbi_dna()
2137  * Synopsis:  Set up a translation map
2138  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2139  *
2140  * Purpose:   Initialize the translation map used to translate a ncbi
2141  *            protein sequence to the internal representation used in
2142  *            hmmer.
2143  *
2144  * Returns:   <eslOK> on success;
2145  *
2146  * Throws:    <eslEMEM> on allocation failure;
2147  */
2148 static int
inmap_ncbi_dna(ESL_SQFILE * sqfp)2149 inmap_ncbi_dna(ESL_SQFILE *sqfp)
2150 {
2151   int x, y;
2152   const char *ncbisym = "-ACMGRSVTWYHKDBN";
2153 
2154   ESL_ALPHABET    *abc  = NULL;
2155   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
2156 
2157   if ((abc = esl_alphabet_Create(eslDNA)) == NULL) return eslEMEM;
2158 
2159   for (x =  0;  x < 128;  x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2160 
2161   /* for each letter in the ncbi alphabet, find that letter in the
2162    * hmmer alphabet and map the translation.
2163    */
2164   for (x = 0; x < strlen(ncbisym); ++x) {
2165     for (y = 0; y < strlen(abc->sym); ++y) {
2166       if (ncbisym[x] == abc->sym[y]) {
2167 	sqfp->inmap[x] = y;
2168 	break;
2169       }
2170     }
2171 
2172     /* there is a problem if a translation does not exist */
2173     if (y >= strlen(abc->sym)) return eslEFORMAT;
2174   }
2175 
2176   if (ncbi->alphasym == NULL) esl_strdup(abc->sym, -1, &ncbi->alphasym);
2177 
2178   esl_alphabet_Destroy(abc);
2179 
2180   return eslOK;
2181 }
2182 
2183 
2184 /* Function:  inmap_ncbi_amino()
2185  * Synopsis:  Set up a translation map
2186  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2187  *
2188  * Purpose:   Initialize the translation map used to translate a ncbi
2189  *            protein sequence to the internal representation used in
2190  *            hmmer.
2191  *
2192  * Returns:   <eslOK> on success;
2193  *
2194  * Throws:    <eslEMEM> on allocation failure;
2195  */
2196 static int
inmap_ncbi_amino(ESL_SQFILE * sqfp)2197 inmap_ncbi_amino(ESL_SQFILE *sqfp)
2198 {
2199   int x, y;
2200   const char *ncbisym = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ";
2201 
2202   ESL_ALPHABET    *abc  = NULL;
2203   ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
2204 
2205   if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) return eslEMEM;
2206 
2207   for (x =  0;  x < 128;  x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2208 
2209   /* for each letter in the ncbi alphabet, find that letter in the
2210    * hmmer alphabet and map the translation.
2211    */
2212   for (x = 0; x < strlen(ncbisym); ++x) {
2213     for (y = 0; y < strlen(abc->sym); ++y) {
2214       if (ncbisym[x] == abc->sym[y]) {
2215 	sqfp->inmap[x] = y;
2216 	break;
2217       }
2218     }
2219 
2220     /* there is a problem if a translation does not exist */
2221     if (y >= strlen(abc->sym)) return eslEFORMAT;
2222   }
2223 
2224   if (ncbi->alphasym == NULL) esl_strdup(abc->sym, -1, &ncbi->alphasym);
2225 
2226   esl_alphabet_Destroy(abc);
2227 
2228   return eslOK;
2229 }
2230 
2231 
2232 
2233 /*****************************************************************
2234  *# 5. Parsing routines
2235  *****************************************************************/
2236 
2237 /* Function:  parse_expect()
2238  * Synopsis:  Expect the next bytes to parse match
2239  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2240  *
2241  * Purpose:   Match if the next <len> bytes to parse match the bytes
2242  *            in <str>.  If the bytes do not match, throw <eslEFORMAT>
2243  *            error.  Advance the parsers pointer.
2244  *
2245  * Returns:   <eslOK> on success
2246  *
2247  * Throws:    <eslEFORMAT> if there are insufficient bytes remaining
2248  *            in the header or if the data to parse does not match
2249  *            what is expected.
2250  */
2251 static int
parse_expect(ESL_SQNCBI_DATA * ncbi,void * str,int len)2252 parse_expect(ESL_SQNCBI_DATA *ncbi, void *str, int len)
2253 {
2254   int size;
2255   unsigned char *c;
2256   unsigned char *limit;
2257 
2258   size  = ncbi->hoff - ncbi->roff;
2259   limit = ncbi->hdr_buf + size;
2260 
2261   /* verify the buffer has atleast len bytes remaining */
2262   if (ncbi->hdr_ptr + len > limit) {
2263       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting %d bytes at %d : 0x%X(%d)\n",
2264 	       len, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2265   }
2266 
2267   /* check the buffer matches the token string */
2268   c = (unsigned char *) str;
2269   while (len--) {
2270     if (*ncbi->hdr_ptr != *c) {
2271       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting 0x%X found 0x%X at %d : 0x%X(%d)\n",
2272 	       *ncbi->hdr_ptr, *c, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2273     }
2274     ncbi->hdr_ptr++;
2275     c++;
2276   }
2277 
2278   return eslOK;
2279 }
2280 
2281 /* Function:  parse_accept()
2282  * Synopsis:  Check if the next bytes to parse match
2283  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2284  *
2285  * Purpose:   Check if the next <len> bytes to parse match the bytes
2286  *            in <str>.  If the bytes match, they are consumed and the
2287  *            parsers pointer is advanced.
2288  *
2289  * Returns:   <eslOK> on success
2290  *            <eslEFORMAT> if the bytes to not match.
2291  */
2292 static int
parse_accept(ESL_SQNCBI_DATA * ncbi,void * str,int len)2293 parse_accept(ESL_SQNCBI_DATA *ncbi, void *str, int len)
2294 {
2295   int i;
2296   int size;
2297   unsigned char *c;
2298   unsigned char *limit;
2299 
2300   size  = ncbi->hoff - ncbi->roff;
2301   limit = ncbi->hdr_buf + size;
2302 
2303   /* check the buffer matches the token string */
2304   if (ncbi->hdr_ptr + len > limit)  return eslEFORMAT;
2305 
2306   /* verify the buffer matches the token string without advancing
2307    * the buffer pointers until we have a complete match.
2308    */
2309   c = (unsigned char *) str;
2310   for (i = 0; i < len; ++i) {
2311     if (ncbi->hdr_ptr[i] != c[i])   return eslEFORMAT;
2312   }
2313   ncbi->hdr_ptr += len;
2314 
2315   return eslOK;
2316 }
2317 
2318 /* Function:  parse_peek()
2319  * Synopsis:  Peek at the next byte to parse
2320  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2321  *
2322  * Purpose:   Return the next characer to be parsed without advancing the
2323  *            parsers pointer.
2324  *
2325  * Returns:   <eslOK> on success
2326  *            <eslEFORMAT> if there are insufficient bytes remaining
2327  *            in the header.
2328  */
2329 static int
parse_peek(ESL_SQNCBI_DATA * ncbi,unsigned char * c)2330 parse_peek(ESL_SQNCBI_DATA *ncbi, unsigned char *c)
2331 {
2332   int size;
2333   unsigned char *limit;
2334 
2335   size  = ncbi->hoff - ncbi->roff;
2336   limit = ncbi->hdr_buf + size;
2337 
2338   /* verify the buffer has atleast len bytes remaining */
2339   if (ncbi->hdr_ptr + 1 > limit)    return eslEFORMAT;
2340 
2341   *c = *ncbi->hdr_ptr;
2342 
2343   return eslOK;
2344 }
2345 
2346 /* Function:  parse_consume()
2347  * Synopsis:  Copies bytes from the header
2348  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2349  *
2350  * Purpose:   Copies <len> bytes from the header to the buffer supplied by
2351  *            <str> if non-null.  Adcance the parser pointer.
2352  *
2353  * Returns:   <eslOK> on success
2354  *
2355  * Throws:    <eslEFORMAT> if there are insufficient bytes remaining
2356  *            in the header.
2357  */
2358 static int
parse_consume(ESL_SQNCBI_DATA * ncbi,void * str,int len)2359 parse_consume(ESL_SQNCBI_DATA *ncbi, void *str, int len)
2360 {
2361   int i;
2362   int size;
2363   unsigned char *c;
2364   unsigned char *limit;
2365 
2366   size  = ncbi->hoff - ncbi->roff;
2367   limit = ncbi->hdr_buf + size;
2368 
2369   /* verify the buffer has atleast len bytes remaining */
2370   if (ncbi->hdr_ptr + len > limit) {
2371       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting %d bytes at %d : 0x%X(%d)\n",
2372 	       len, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2373   }
2374 
2375   /* copy the characters in the buffer to <str> */
2376   c = (unsigned char *) str;
2377   for (i = 0; i < len; ++i) {
2378     if (c != NULL) *c++ = *ncbi->hdr_ptr++;
2379   }
2380 
2381   return eslOK;
2382 }
2383 
2384 /* Function:  parse_advance()
2385  * Synopsis:  Advance the parser pointer
2386  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2387  *
2388  * Purpose:   Advance the parser pointer <len> bytes.
2389  *
2390  * Returns:   <eslOK> on success
2391  *
2392  * Throws:    <eslEFORMAT> if there are insufficient bytes remaining
2393  *            in the header.
2394  */
2395 static int
parse_advance(ESL_SQNCBI_DATA * ncbi,int len)2396 parse_advance(ESL_SQNCBI_DATA *ncbi, int len)
2397 {
2398   int size;
2399   unsigned char *limit;
2400 
2401   size  = ncbi->hoff - ncbi->roff;
2402   limit = ncbi->hdr_buf + size;
2403 
2404   /* verify the buffer has atleast len bytes remaining */
2405   if (ncbi->hdr_ptr + len > limit) {
2406       ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting %d bytes at %d : 0x%X(%d)\n",
2407 	       len, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2408   }
2409 
2410   ncbi->hdr_ptr += len;
2411 
2412   return eslOK;
2413 }
2414 
2415 /* Function:  parse_header()
2416  * Synopsis:  Parse the ncbi db header
2417  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2418  *
2419  * Purpose:   Parse a ncbi database header.  This routine implements
2420  *            a recursive descent parser for the ASN.1 definition of
2421  *            a blast database header filling in <sq>.
2422  *
2423  *            The blast db header can have multiple definitions defined
2424  *            within it.  Only the information from the first usable
2425  *            defition will be used.
2426  *
2427  * Returns:   <eslOK> on success; the new sequence is stored in <s>.
2428  *
2429  *            Returns <eslEFORMAT> if there's a problem with the format,
2430  *            such as an illegal character.
2431  */
2432 static int
parse_header(ESL_SQNCBI_DATA * ncbi,ESL_SQ * sq)2433 parse_header(ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq)
2434 {
2435   int   size;
2436   int   status;
2437   void *tmp;
2438 
2439   unsigned char c;
2440 
2441   reset_header_values(ncbi);
2442   size  = ncbi->hoff - ncbi->roff;
2443 
2444   /* read in the header data */
2445   if (ncbi->hdr_alloced < size) {
2446     while (ncbi->hdr_alloced < size) ncbi->hdr_alloced += ncbi->hdr_alloced;
2447     ESL_RALLOC(ncbi->hdr_buf, tmp, sizeof(char) * ncbi->hdr_alloced);
2448   }
2449   if (fread(ncbi->hdr_buf, sizeof(char), size, ncbi->fpphr) != size) return eslEFORMAT;
2450   ncbi->hdr_ptr = ncbi->hdr_buf;
2451 
2452   /* verify we are at the beginning of a structure */
2453   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2454 
2455   if (parse_peek(ncbi, &c) != eslOK)                          return eslEFORMAT;
2456 
2457   /* parse the different seq id structures */
2458   while (c != 0x00) {
2459     if ((status = parse_def_line(ncbi, sq)) != eslOK)         return status;
2460 
2461     if (parse_peek(ncbi, &c) != eslOK)                        return eslEFORMAT;
2462   }
2463 
2464   /* verify we are at the end of the structure */
2465   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2466 
2467   return eslOK;
2468  ERROR:
2469   return eslEMEM;
2470 }
2471 
2472 /* Function:  parse_def_line()
2473  * Synopsis:  Parse the Blast-def-line definition
2474  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2475  *
2476  * Blast-def-line ::= SEQUENCE {
2477  * 	title       VisibleString       OPTIONAL,  -- simple title
2478  * 	seqid       SEQUENCE OF Seq-id,            -- Regular NCBI Seq-Id
2479  * 	taxid       INTEGER             OPTIONAL,  -- taxonomy id
2480  * 	memberships SEQUENCE OF INTEGER OPTIONAL,  -- bit arrays
2481  * 	links       SEQUENCE OF INTEGER OPTIONAL,  -- bit arrays
2482  * 	other-info  SEQUENCE OF INTEGER OPTIONAL   -- for future use (probably genomic sequences)
2483  * }
2484  */
2485 static int
parse_def_line(ESL_SQNCBI_DATA * ncbi,ESL_SQ * sq)2486 parse_def_line(ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq)
2487 {
2488   int   status;
2489 
2490   int   i;
2491   int   len     = 0;
2492   int   taxid   = -1;
2493   char *title   = NULL;
2494 
2495   /* verify we are at the beginning of a structure */
2496   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2497 
2498   /* look for an optional title */
2499   sq->desc[0] = 0;
2500   if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2501     if ((status = parse_string(ncbi, &title, &len)) != eslOK) return status;
2502     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2503   }
2504 
2505   /* look for sequence id structure */
2506   if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK)             return eslEFORMAT;
2507   if ((status = parse_seq_id(ncbi)) != eslOK)                 return status;
2508   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2509 
2510   /* look for an optional taxonomy id */
2511   sq->tax_id = -1;
2512   if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
2513     if ((status = parse_integer(ncbi, &taxid)) != eslOK)      return status;
2514     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2515   }
2516 
2517   /* look for an optional memberships */
2518   if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
2519     if ((status = ignore_sequence_of_integer(ncbi)) != eslOK) return status;
2520     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2521   }
2522 
2523   /* look for an optional links */
2524   if (parse_accept(ncbi, "\xa4\x80", 2) == eslOK) {
2525     if ((status = ignore_sequence_of_integer(ncbi)) != eslOK) return status;
2526     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2527   }
2528 
2529   /* look for an optional other info */
2530   if (parse_accept(ncbi, "\xa5\x80", 2) == eslOK) {
2531     if ((status = ignore_sequence_of_integer(ncbi)) != eslOK) return status;
2532     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2533   }
2534 
2535   /* verify we are at the end of the structure */
2536   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2537 
2538   /* zero terminate any saved string */
2539   if (ncbi->name_ptr   != NULL) ncbi->name_ptr[ncbi->name_size]     = '\0';
2540   if (ncbi->acc_ptr    != NULL) ncbi->acc_ptr[ncbi->acc_size]       = '\0';
2541   if (ncbi->str_id_ptr != NULL) ncbi->str_id_ptr[ncbi->str_id_size] = '\0';
2542 
2543   if (title != NULL) title[len] = '\0';
2544 
2545   if (ncbi->name_ptr != NULL || ncbi->acc_ptr != NULL) {
2546     /* if we have both a name and accession, set both fields.
2547      * if we have only one, set the name to that one field.
2548      */
2549     if (ncbi->name_ptr != NULL) {
2550       esl_sq_SetName(sq, ncbi->name_ptr);
2551       if (ncbi->acc_ptr != NULL) {
2552 	esl_sq_SetAccession(sq, ncbi->acc_ptr);
2553       }
2554     } else {
2555       esl_sq_SetName(sq, ncbi->acc_ptr);
2556     }
2557     if (title != NULL) esl_sq_SetDesc(sq, title);
2558   } else if (ncbi->str_id_ptr != NULL || ncbi->int_id != -1) {
2559     /* since we don't have a name or accession, use the id
2560      * as the name.
2561      */
2562     if (ncbi->str_id_ptr != NULL) {
2563       esl_sq_SetName(sq, ncbi->str_id_ptr);
2564     } else {
2565       char id[32];
2566       sprintf(id, "%d", ncbi->int_id);
2567       esl_sq_SetName(sq, id);
2568     }
2569     if (title != NULL) esl_sq_SetDesc(sq, title);
2570   } else if (title != NULL) {
2571     /* lastly we don't have anything, so lets just use the
2572      * title.  take the first word of the title and use that
2573      * for the name.  the remaining portion of the title will
2574      * be used for the description.
2575      */
2576     for (i = 0; i < len; ++i) {
2577       if (isspace(title[i])) {
2578 	title[i] = '\0';
2579 	break;
2580       }
2581     }
2582     esl_sq_SetName(sq, title);
2583     ++i;
2584 
2585     /* skip over multiple spaces till the next word */
2586     for ( ; i < len; ++i) {
2587       if (!isspace(title[i])) break;
2588     }
2589     if (i < len) esl_sq_SetDesc(sq, title + i);
2590   }
2591 
2592   if (taxid != -1) sq->tax_id = taxid;
2593 
2594   return eslOK;
2595 }
2596 
2597 
2598 /* Function:  parse_seq_id()
2599  * Synopsis:  Parse the Blast-def-line definition
2600  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2601  *
2602  * Seq-id ::= CHOICE {
2603  *     local             Object-id ,       -- local use
2604  *     gibbsq            INTEGER ,         -- Geninfo backbone seqid
2605  *     gibbmt            INTEGER ,         -- Geninfo backbone moltype
2606  *     giim              Giimport-id ,     -- Geninfo import id
2607  *     genbank           Textseq-id ,
2608  *     embl              Textseq-id ,
2609  *     pir               Textseq-id ,
2610  *     swissprot         Textseq-id ,
2611  *     patent            Patent-seq-id ,
2612  *     other             Textseq-id ,      -- for historical reasons, 'other' = 'refseq'
2613  *     general           Dbtag ,           -- for other databases
2614  *     gi                INTEGER ,         -- GenInfo Integrated Database
2615  *     ddbj              Textseq-id ,      -- DDBJ
2616  *     prf               Textseq-id ,      -- PRF SEQDB
2617  *     pdb               PDB-seq-id ,      -- PDB sequence
2618  *     tpg               Textseq-id ,      -- Third Party Annot/Seq GenBank
2619  *     tpe               Textseq-id ,      -- Third Party Annot/Seq EMBL
2620  *     tpd               Textseq-id ,      -- Third Party Annot/Seq DDBJ
2621  *     gpipe             Textseq-id ,      -- Internal NCBI genome pipeline processing ID
2622  *     named-annot-track Textseq-id        -- Internal named annotation tracking ID
2623  * }
2624  */
2625 static int
parse_seq_id(ESL_SQNCBI_DATA * ncbi)2626 parse_seq_id(ESL_SQNCBI_DATA *ncbi)
2627 {
2628   int   status;
2629   int  *id_ptr;
2630 
2631   unsigned char c;
2632 
2633   /* verify we are at the beginning of a structure */
2634   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)           return eslEFORMAT;
2635 
2636   if (parse_consume(ncbi, &c, 1) != eslOK)                  return eslEFORMAT;
2637 
2638   /* parse the different seq id structures */
2639   while (c != 0x00) {
2640     if (parse_expect(ncbi, "\x80", 1) != eslOK)             return eslEFORMAT;
2641     switch (c) {
2642     case 0xa0: /* LOCAL */
2643       status = parse_object_id(ncbi);
2644       break;
2645     case 0xa1: /* GIBBSQ */
2646       id_ptr = (ncbi->int_id != -1) ? NULL : &ncbi->int_id;
2647       status = parse_integer(ncbi, id_ptr);
2648       break;
2649     case 0xa2: /* GIBBMT */
2650       status = parse_integer(ncbi, NULL);
2651       break;
2652     case 0xa3: /* GIIM */
2653       status = parse_giimport_id(ncbi);
2654       break;
2655     case 0xa4: /* GENBANK */
2656     case 0xa5: /* EMBL */
2657     case 0xa6: /* PIR */
2658     case 0xa7: /* SWISSPROT */
2659       status = parse_textseq_id(ncbi);
2660       break;
2661     case 0xa8: /* PATENT */
2662       status = parse_patent_seq_id(ncbi);
2663       break;
2664     case 0xa9: /* OTHER */
2665       status = parse_textseq_id(ncbi);
2666       break;
2667     case 0xaa: /* GENERAL */
2668       status = parse_dbtag(ncbi);
2669       break;
2670     case 0xab: /* GI */
2671       status = parse_integer(ncbi, NULL);
2672       break;
2673     case 0xac: /* DDBJ */
2674     case 0xad: /* PRF */
2675       status = parse_textseq_id(ncbi);
2676       break;
2677     case 0xae: /* PDB */
2678       status = parse_pdb_seq_id(ncbi);
2679       break;
2680     case 0xaf: /* TPG */
2681     case 0xb0: /* TPE */
2682     case 0xb1: /* TPD */
2683     case 0xb2: /* GPIPE */
2684     case 0xb3: /* NAMED ANNOT TRACK */
2685       status = parse_textseq_id(ncbi);
2686       break;
2687     default:
2688       status = eslEFORMAT;
2689     }
2690 
2691     if (status != eslOK)                                    return status;
2692     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)         return eslEFORMAT;
2693     if (parse_consume(ncbi, &c, 1)        != eslOK)         return eslEFORMAT;
2694   }
2695 
2696   /* verify we are at the end of the structure */
2697   if (c != 0x00 || parse_expect(ncbi, "\x00", 1) != eslOK)  return eslEFORMAT;
2698 
2699   return eslOK;
2700 }
2701 
2702 
2703 /* Function:  parse_textseq_id()
2704  * Synopsis:  Parse the general text header
2705  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2706  *
2707  * Textseq-id ::= SEQUENCE {
2708  *     name      VisibleString OPTIONAL ,
2709  *     accession VisibleString OPTIONAL ,
2710  *     release   VisibleString OPTIONAL ,
2711  *     version   INTEGER       OPTIONAL
2712  * }
2713  */
2714 static int
parse_textseq_id(ESL_SQNCBI_DATA * ncbi)2715 parse_textseq_id(ESL_SQNCBI_DATA *ncbi)
2716 {
2717   char *acc  = NULL;
2718   int   alen = 0;
2719   char *name = NULL;
2720   int   nlen = 0;
2721 
2722   int   status;
2723 
2724   /* verify we are at the beginning of a structure */
2725   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2726 
2727   /* look for an optional name */
2728   if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2729     if ((status = parse_string(ncbi, &name, &nlen)) != eslOK) return status;
2730     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2731   }
2732 
2733   /* look for an optional accession */
2734   if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2735     if ((status = parse_string(ncbi, &acc, &alen)) != eslOK)  return status;
2736     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2737   }
2738 
2739   /* look for an optional release */
2740   if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
2741     if ((status = parse_string(ncbi, NULL, NULL)) != eslOK)   return status;
2742     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2743   }
2744 
2745   /* look for an optional version */
2746   if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
2747     if ((status = parse_integer(ncbi, NULL)) != eslOK)        return status;
2748     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2749   }
2750 
2751   /* verify we are at the end of the structure */
2752   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2753 
2754   /* if we found both the accession and name and so far
2755    * we have only come across incomplete headers, save
2756    * this one off.
2757    */
2758   if (acc != NULL && name != NULL) {
2759     if (ncbi->name_ptr == NULL || ncbi->acc_ptr == NULL) {
2760       ncbi->name_ptr  = name;
2761       ncbi->name_size = nlen;
2762       ncbi->acc_ptr   = acc;
2763       ncbi->acc_size  = alen;
2764     }
2765 
2766   } else if (ncbi->name_ptr == NULL && ncbi->acc_ptr == NULL) {
2767     /* if neither the accession or name have been set, and the
2768      * header supplied one, save it off.
2769      */
2770     if (acc != NULL) {
2771       ncbi->acc_ptr   = acc;
2772       ncbi->acc_size  = alen;
2773     }
2774     if (name != NULL) {
2775       ncbi->name_ptr  = name;
2776       ncbi->name_size = nlen;
2777     }
2778   }
2779 
2780   return eslOK;
2781 }
2782 
2783 
2784 /* Function:  parse_dbtag()
2785  * Synopsis:  Parse the a general db tag
2786  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2787  *
2788  * Dbtag ::= SEQUENCE {
2789  *     db  VisibleString ,     -- name of database or system
2790  *     tag Object-id           -- appropriate tag
2791  * }
2792  */
2793 static int
parse_dbtag(ESL_SQNCBI_DATA * ncbi)2794 parse_dbtag(ESL_SQNCBI_DATA *ncbi)
2795 {
2796   int   status;
2797   int   temp_id;
2798 
2799   /* verify we are at the beginning of a structure */
2800   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2801 
2802   /* look for an db name */
2803   if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK)             return eslEFORMAT;
2804   if ((status = parse_string(ncbi, 0, NULL)) != eslOK)        return status;
2805 
2806   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2807 
2808   /* it looks like the dbtag is used when formatdb is run
2809    * without parsing sequence ids (ie -o F).  if that is
2810    * the case, the id is equal to the sequence number in
2811    * the database.  so for dbtag headers, nothing will be
2812    * saved.  to do this lets create a bogus id value and
2813    * restore it after dbtag is parsed.
2814    */
2815   temp_id = ncbi->int_id;
2816   ncbi->int_id = 1;
2817 
2818   /* look for a tag object */
2819   if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK)             return eslEFORMAT;
2820   if ((status = parse_object_id(ncbi)) != eslOK)              return status;
2821   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2822 
2823   /* restore the id value */
2824   ncbi->int_id = temp_id;
2825 
2826   /* verify we are at the end of the structure */
2827   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2828 
2829   return eslOK;
2830 }
2831 
2832 
2833 /* Function:  parse_giimport_id()
2834  * Synopsis:  Parse the a giimport id
2835  * Incept:    MSF, Thu Mar 25, 2010 [Janelia]
2836  *
2837  * Giimport-id ::= SEQUENCE {
2838  *     id INTEGER,                      -- the id to use here
2839  *     db VisibleString OPTIONAL,       -- dbase used in
2840  *     release VisibleString OPTIONAL } -- the release
2841  * }
2842  */
2843 static int
parse_giimport_id(ESL_SQNCBI_DATA * ncbi)2844 parse_giimport_id(ESL_SQNCBI_DATA *ncbi)
2845 {
2846   int   status;
2847   int   id;
2848 
2849   /* verify we are at the beginning of a structure */
2850   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2851 
2852   /* look for an id */
2853   if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK)             return eslEFORMAT;
2854   if ((status = parse_integer(ncbi, &id)) != eslOK)           return status;
2855 
2856    /* look for an optional database */
2857   if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2858     if ((status = parse_string(ncbi, NULL, NULL)) != eslOK)   return status;
2859     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2860   }
2861 
2862   /* look for an optional release */
2863   if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
2864     if ((status = parse_string(ncbi, NULL, NULL)) != eslOK)   return status;
2865     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
2866   }
2867 
2868   /* verify we are at the end of the structure */
2869   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2870 
2871   /* if there is not already a saved seq id, save it */
2872   if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
2873     ncbi->int_id = id;
2874   }
2875 
2876   return eslOK;
2877 }
2878 
2879 
2880 /* Function:  parse_patent_seq_id()
2881  * Synopsis:  Parse the patent header
2882  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2883  *
2884  * Patent-seq-id ::= SEQUENCE {
2885  *     seqid INTEGER ,          -- number of sequence in patent
2886  *     cit   Id-pat             -- patent citation
2887  * }
2888  */
2889 static int
parse_patent_seq_id(ESL_SQNCBI_DATA * ncbi)2890 parse_patent_seq_id(ESL_SQNCBI_DATA *ncbi)
2891 {
2892   int   status;
2893   int   id;
2894 
2895   /* verify we are at the beginning of a structure */
2896   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2897 
2898   /* look for a seqid */
2899   if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK)             return eslEFORMAT;
2900   if ((status = parse_integer(ncbi, &id)) != eslOK)           return status;
2901 
2902   /* look for a patent citation object */
2903   if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK)             return eslEFORMAT;
2904   if ((status = parse_id_pat(ncbi)) != eslOK)                 return status;
2905 
2906   /* verify we are at the end of the structure */
2907   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2908 
2909   /* if there is not already a saved seq id, save it */
2910   if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
2911     ncbi->int_id = id;
2912   }
2913 
2914   return eslOK;
2915 }
2916 
2917 
2918 /* Function:  parse_id_pat()
2919  * Synopsis:  Parse the patent citation
2920  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2921  *
2922  * Id-pat ::= SEQUENCE {                         -- just to identify a patent
2923  *     country  VisibleString ,                  -- Patent Document Country
2924  *     id       CHOICE {
2925  *         number     VisibleString ,            -- Patent Document Number
2926  *         app-number VisibleString              -- Patent Doc Appl Number
2927  *     } ,
2928  *     doc-type VisibleString         OPTIONAL   -- Patent Doc Type
2929  * }
2930  */
2931 static int
parse_id_pat(ESL_SQNCBI_DATA * ncbi)2932 parse_id_pat(ESL_SQNCBI_DATA *ncbi)
2933 {
2934   int   status;
2935 
2936   /* verify we are at the beginning of a structure */
2937   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2938 
2939   /* look for a country */
2940   if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK)             return eslEFORMAT;
2941   if ((status = parse_string(ncbi, NULL, NULL)) != eslOK)     return status;
2942 
2943   /* look for an id */
2944   if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK)             return eslEFORMAT;
2945 
2946   /* the id is a choice of two strings */
2947 
2948   /* verify we are at the beginning of a structure */
2949   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
2950 
2951   /* look for an optional taxonomy id */
2952   if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2953     status = parse_string(ncbi, NULL, NULL);
2954   } else if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2955     status = parse_string(ncbi, NULL, NULL);
2956   } else {
2957     status = eslEFORMAT;
2958   }
2959   if (status != eslOK)                                        return status;
2960 
2961   /* verify we are at the end of the structure */
2962   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2963 
2964   /* look for a doc type */
2965   if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
2966     if ((status = parse_string(ncbi, NULL, NULL)) != eslOK)   return eslEFORMAT;
2967   }
2968 
2969   /* verify we are at the end of the structure */
2970   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
2971 
2972   return eslOK;
2973 }
2974 
2975 
2976 /* Function:  parse_object_id()
2977  * Synopsis:  Parse a generic sequence id
2978  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
2979  *
2980  * Object-id ::= CHOICE {
2981  *     id  INTEGER ,
2982  *     str VisibleString
2983  * }
2984  */
2985 static int
parse_object_id(ESL_SQNCBI_DATA * ncbi)2986 parse_object_id(ESL_SQNCBI_DATA *ncbi)
2987 {
2988   int    status;
2989 
2990   char  *id_str = NULL;
2991   int    id_len = 0;
2992   int    id     = -1;
2993 
2994   /* look for an optional taxonomy id */
2995   if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2996     status = parse_integer(ncbi, &id);
2997   } else if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2998     status = parse_string(ncbi, &id_str, &id_len);
2999   } else {
3000     status = eslEFORMAT;
3001   }
3002 
3003   /* verify we are at the end of the structure */
3004   if (status == eslOK) {
3005     status = parse_expect(ncbi, "\x00\x00", 2);
3006 
3007     /* if there is not already a saved seq id, save it */
3008     if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
3009       if (id_str != NULL) {
3010 	ncbi->str_id_ptr  = id_str;
3011 	ncbi->str_id_size = id_len;
3012       } else if (id != -1) {
3013 	ncbi->int_id = id;
3014       }
3015     }
3016   }
3017 
3018   return status;
3019 }
3020 
3021 
3022 /* Function:  parse_pdb_seq_id()
3023  * Synopsis:  Parse a PDB sequence
3024  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
3025  *
3026  * PDB-seq-id ::= SEQUENCE {
3027  *     mol   PDB-mol-id ,              -- the molecule name
3028  *     chain INTEGER ,                 -- a single ASCII character, chain id
3029  *     rel   Date         OPTIONAL }   -- release date, month and year
3030  *
3031  * Date ::= CHOICE {
3032  *     str   VisibleString ,           -- for those unparsed dates
3033  *     std   Date-std                  -- use this if you can
3034  * }
3035  */
3036 static int
parse_pdb_seq_id(ESL_SQNCBI_DATA * ncbi)3037 parse_pdb_seq_id(ESL_SQNCBI_DATA *ncbi)
3038 {
3039   int   status;
3040 
3041   char  *id;
3042   int    len;
3043 
3044   /* verify we are at the beginning of a structure */
3045   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
3046 
3047   /* look for an pdb mol id */
3048   if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK)             return eslEFORMAT;
3049   if ((status = parse_string(ncbi, &id, &len)) != eslOK)      return status;
3050   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
3051 
3052   /* look for chain */
3053   if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
3054     if ((status = parse_integer(ncbi, NULL)) != eslOK)        return status;
3055     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3056   }
3057 
3058   /* look for an optional date */
3059   if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
3060     if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
3061       status = parse_string(ncbi, NULL, NULL);
3062     } else if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
3063       status = parse_date_std(ncbi);
3064     } else {
3065       status = eslEFORMAT;
3066     }
3067     if (status != eslOK)                                      return status;
3068     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3069     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3070   }
3071 
3072   /* if there is not already a saved seq id, save it */
3073   if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
3074     ncbi->str_id_ptr  = id;
3075     ncbi->str_id_size = len;
3076   }
3077 
3078   /* verify we are at the end of the structure */
3079   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
3080 
3081   return eslOK;
3082 }
3083 
3084 
3085 /* Function:  parse_date_std()
3086  * Synopsis:  Parse the data structure
3087  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
3088  *
3089  * Date-std ::= SEQUENCE {              -- NOTE: this is NOT a unix tm struct
3090  *     year   INTEGER ,                 -- full year (including 1900)
3091  *     month  INTEGER       OPTIONAL ,  -- month (1-12)
3092  *     day    INTEGER       OPTIONAL ,  -- day of month (1-31)
3093  *     season VisibleString OPTIONAL ,  -- for "spring", "may-june", etc
3094  *     hour   INTEGER       OPTIONAL ,  -- hour of day (0-23)
3095  *     minute INTEGER       OPTIONAL ,  -- minute of hour (0-59)
3096  *     second INTEGER       OPTIONAL    -- second of minute (0-59)
3097  * }
3098  */
3099 static int
parse_date_std(ESL_SQNCBI_DATA * ncbi)3100 parse_date_std(ESL_SQNCBI_DATA *ncbi)
3101 {
3102   int   status;
3103 
3104   /* verify we are at the beginning of a structure */
3105   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)             return eslEFORMAT;
3106 
3107   /* look for a year */
3108   if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK)             return eslEFORMAT;
3109   if ((status = parse_integer(ncbi, NULL)) != eslOK)          return status;
3110   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
3111 
3112   /* look for an optional month */
3113   if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
3114     if ((status = parse_integer(ncbi, NULL)) != eslOK)        return status;
3115     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3116   }
3117 
3118   /* look for an optional day */
3119   if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
3120     if ((status = parse_integer(ncbi, NULL)) != eslOK)        return status;
3121     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3122   }
3123 
3124   /* look for an optional season */
3125   if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
3126     if ((status = parse_string(ncbi, NULL, NULL)) != eslOK)   return status;
3127     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3128   }
3129 
3130   /* look for an optional hour */
3131   if (parse_accept(ncbi, "\xa4\x80", 2) == eslOK) {
3132     if ((status = parse_integer(ncbi, NULL)) != eslOK)        return status;
3133     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3134   }
3135 
3136   /* look for an optional minute */
3137   if (parse_accept(ncbi, "\xa5\x80", 2) == eslOK) {
3138     if ((status = parse_integer(ncbi, NULL)) != eslOK)        return status;
3139     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3140   }
3141 
3142   /* look for an optional second */
3143   if (parse_accept(ncbi, "\xa6\x80", 2) == eslOK) {
3144     if ((status = parse_integer(ncbi, NULL)) != eslOK)        return status;
3145     if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)           return eslEFORMAT;
3146   }
3147 
3148   /* verify we are at the end of the structure */
3149   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)             return eslEFORMAT;
3150 
3151   return eslOK;
3152 }
3153 
3154 
3155 /* Function:  parse_string()
3156  * Synopsis:  Parse a visible string
3157  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
3158  *
3159  * Purpose:   Parses a string from the header stream.
3160  *
3161  *            If <str> is non null, the location of the string in
3162  *            the header will be saved.  If <len> is non null, the
3163  *            length of the string will be filled in.  If <str> is
3164  *            non null, then <len> must be non null since the strings
3165  *            are not zero terminated.
3166  *
3167  * Returns:   <eslOK> on success.
3168  *            <eslEFORMAT> if there's a problem with the format.
3169  *            <eslEINCOMPAT> if <str> is non null and <len> is null.
3170  *
3171  */
3172 static int
parse_string(ESL_SQNCBI_DATA * ncbi,char ** str,int * len)3173 parse_string(ESL_SQNCBI_DATA *ncbi, char **str, int *len)
3174 {
3175   int n;
3176 
3177   unsigned char  x;
3178   unsigned char  c;
3179   unsigned char *ptr;
3180 
3181   if (parse_expect(ncbi, "\x1a", 1) != eslOK)  return eslEFORMAT;
3182 
3183   /* the next byte is the length of the string.  if the length is
3184    * less than 128, then this is the true length; otherwise this
3185    * length describes the number of bytes necessary to hold the
3186    * true length of the string in the lower 7 bits.
3187    */
3188   if (parse_consume(ncbi, &c, 1) != eslOK)     return eslEFORMAT;
3189   if (c < 128) {
3190     n = c;
3191   } else {
3192     c = c & 0x7f;
3193     if (c > sizeof(n))                                 return eslEFORMAT;
3194 
3195     n = 0;
3196     while (c > 0) {
3197       if (parse_consume(ncbi, &x, 1) != eslOK) return eslEFORMAT;
3198       n = (n << 8) + (unsigned int) x;
3199       --c;
3200     }
3201   }
3202 
3203   /* validate the length of the string */
3204   ptr = ncbi->hdr_ptr;
3205   if (parse_advance(ncbi, n) != eslOK)         return eslEFORMAT;
3206 
3207   /* fill in the values */
3208   if (str != NULL && len == NULL) return eslEINCOMPAT;
3209   if (len != NULL) *len = n;
3210   if (str != NULL) *str = (char *)ptr;
3211 
3212   return eslOK;
3213 }
3214 
3215 
3216 /* Function:  parse_integer()
3217  * Synopsis:  Parse an integer
3218  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
3219  *
3220  * Purpose:   Reads an integer from the header stream.  If the integer is
3221  *            more bytes than the native int format, the most significant
3222  *            bytes will be lost.
3223  *
3224  *            If <value> is non null, the parsed integer will be placed
3225  *            in the pointer.
3226  *
3227  * Returns:   <eslOK> on success.
3228  *            <eslEFORMAT> if there's a problem with the format.
3229  *
3230  */
3231 static int
parse_integer(ESL_SQNCBI_DATA * ncbi,int * value)3232 parse_integer(ESL_SQNCBI_DATA *ncbi, int *value)
3233 {
3234   int n;
3235 
3236   unsigned char  c;
3237   unsigned char *ptr;
3238 
3239   if (parse_expect(ncbi, "\x02", 1) != eslOK) return eslEFORMAT;
3240 
3241   /* get the length of the integer */
3242   if (parse_peek(ncbi, &c) != eslOK)          return eslEFORMAT;
3243   ptr = ncbi->hdr_ptr + 1;
3244 
3245   /* advance past the integer to make sure the buffer holds all
3246    * of the integer.  the pointer <ptr> points the the start of
3247    * the integer.
3248    */
3249   if (parse_advance(ncbi, c + 1) != eslOK)    return eslEFORMAT;
3250 
3251   n = 0;
3252   while (c > 0) {
3253     n = (n << 8) + (unsigned int) *ptr++;
3254     --c;
3255   }
3256 
3257   if (value != NULL) *value = n;
3258 
3259   return eslOK;
3260 }
3261 
3262 
3263 /* Function:  ignore_sequence_of_integer()
3264  * Synopsis:  Skip over the sequence of integers
3265  * Incept:    MSF, Mon Dec 10, 2009 [Janelia]
3266  *
3267  * Purpose:   Skip over a sequence of integers.
3268  *
3269  * Returns:   <eslOK> on success.
3270  *            <eslEFORMAT> if there's a problem with the format.
3271  *
3272  */
3273 static int
ignore_sequence_of_integer(ESL_SQNCBI_DATA * ncbi)3274 ignore_sequence_of_integer(ESL_SQNCBI_DATA *ncbi)
3275 {
3276   int status;
3277   unsigned char c;
3278 
3279   /* verify we are at the beginning of a structure */
3280   if (parse_expect(ncbi, "\x30\x80", 2) != eslOK)      return eslEFORMAT;
3281 
3282   if (parse_peek(ncbi, &c) != eslOK)                   return eslEFORMAT;
3283   while (c == 0x02) {
3284     if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3285     if (parse_peek(ncbi, &c) != eslOK)                 return eslEFORMAT;
3286   }
3287 
3288   /* verify we are at the end of the structure */
3289   if (parse_expect(ncbi, "\x00\x00", 2) != eslOK)      return eslEFORMAT;
3290 
3291   return eslOK;
3292 }
3293