1 /* Unaligned ncbi sequence file i/o.
2 *
3 * Contents:
4 * 1. An <ESL_SQFILE> object, in text mode.
5 * 2. An <ESL_SQFILE> object, in digital mode. [with <alphabet>]
6 * 3. Miscellaneous routines.
7 * 4. Sequence reading (sequential).
8 * 5. Parsing routines
9 */
10 #include "esl_config.h"
11
12 #include <stdlib.h>
13 #include <stdio.h>
14 #include <string.h>
15 #include <ctype.h>
16
17 #ifdef HAVE_SYS_TYPES_H
18 #include <sys/types.h>
19 #endif
20
21 #ifdef HAVE_ENDIAN_H
22 #include <endian.h>
23 #endif
24
25 #include "easel.h"
26 #include "esl_alphabet.h" /* alphabet aug adds digital sequences */
27 #include "esl_sq.h"
28 #include "esl_sqio.h"
29
30 #ifndef htobe32
31 #ifdef WORDS_BIGENDIAN
32 #define htobe32(x) (x)
33 #else
34 #define htobe32(x) \
35 ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \
36 (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))
37 #endif
38 #endif
39
40 /* format specific routines */
41 static int sqncbi_Position (ESL_SQFILE *sqfp, off_t offset);
42 static void sqncbi_Close (ESL_SQFILE *sqfp);
43 static int sqncbi_SetDigital (ESL_SQFILE *sqfp, const ESL_ALPHABET *abc);
44 static int sqncbi_GuessAlphabet (ESL_SQFILE *sqfp, int *ret_type);
45 static int sqncbi_Read (ESL_SQFILE *sqfp, ESL_SQ *sq);
46 static int sqncbi_ReadInfo (ESL_SQFILE *sqfp, ESL_SQ *sq);
47 static int sqncbi_ReadSequence (ESL_SQFILE *sqfp, ESL_SQ *sq);
48 static int sqncbi_ReadWindow (ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq);
49 static int sqncbi_ReadBlock (ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target);
50 static int sqncbi_Echo (ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp);
51
52 static int sqncbi_IsRewindable (const ESL_SQFILE *sqfp);
53 static const char *sqncbi_GetError (const ESL_SQFILE *sqfp);
54
55 /* common routines for processing ncbi database */
56 static int sqncbi_Open (ESL_SQNCBI_DATA *ncbi, char *filename);
57
58 static void reset_db (ESL_SQNCBI_DATA *ncbi);
59 static int pos_sequence (ESL_SQNCBI_DATA *ncbi, int inx);
60 static int volume_open (ESL_SQNCBI_DATA *ncbi, int volume);
61 static void reset_header_values (ESL_SQNCBI_DATA *ncbi);
62
63 static int read_amino (ESL_SQFILE *sqfp, ESL_SQ *sq);
64 static int read_dna (ESL_SQFILE *sqfp, ESL_SQ *sq);
65 static int read_nres_amino (ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres);
66 static int read_nres_dna (ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres);
67
68 static int inmap_ncbi (ESL_SQFILE *sqfp);
69 static int inmap_ncbi_amino (ESL_SQFILE *sqfp);
70 static int inmap_ncbi_dna (ESL_SQFILE *sqfp);
71
72 /* parsing routines */
73 static int parse_header (ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq);
74 static int parse_def_line (ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq);
75 static int parse_seq_id (ESL_SQNCBI_DATA *ncbi);
76 static int parse_textseq_id (ESL_SQNCBI_DATA *ncbi);
77 static int parse_object_id (ESL_SQNCBI_DATA *ncbi);
78 static int parse_dbtag (ESL_SQNCBI_DATA *ncbi);
79 static int parse_patent_seq_id (ESL_SQNCBI_DATA *ncbi);
80 static int parse_giimport_id (ESL_SQNCBI_DATA *ncbi);
81 static int parse_id_pat (ESL_SQNCBI_DATA *ncbi);
82 static int parse_pdb_seq_id (ESL_SQNCBI_DATA *ncbi);
83 static int parse_date_std (ESL_SQNCBI_DATA *ncbi);
84 static int parse_string (ESL_SQNCBI_DATA *ncbi, char **str, int *len);
85 static int parse_integer (ESL_SQNCBI_DATA *ncbi, int *value);
86 static int ignore_sequence_of_integer(ESL_SQNCBI_DATA *ncbi);
87
88 #define INDEX_TABLE_SIZE 1024
89 #define INIT_HDR_BUFFER_SIZE 2048
90
91 #define NCBI_VERSION_4 4
92 #define NCBI_DNA_DB 0
93 #define NCBI_AMINO_DB 1
94
95 /*****************************************************************
96 *# 1. An <ESL_SQFILE> object, in text mode.
97 *****************************************************************/
98
99 /* Function: esl_sqncbi_Open()
100 * Synopsis: Open a sequence file for reading.
101 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
102 *
103 * Purpose: Open a sequence file <filename> for reading.
104 * The opened <ESL_SQFILE> is returned through <ret_sqfp>.
105 *
106 * The .pin, .phr and .psq files are required for the
107 * open function to succeed. Only ncbi version 4
108 * databases are currently supported.
109 *
110 * Returns: <eslOK> on success, and <*ret_sqfp> points to a new
111 * open <ESL_SQFILE>. Caller deallocates this object with
112 * <esl_sqfile_Close()>.
113 *
114 * Returns <eslENOTFOUND> if <filename> can't be found or
115 * opened. Returns <eslEFORMAT> if the file is empty, or
116 * if autodetection is attempted and the format can't be
117 * determined. On any error condition, <*ret_sqfp> is
118 * returned NULL.
119 *
120 * Throws: <eslEMEM> on allocation failure.
121 */
122 int
esl_sqncbi_Open(char * filename,int format,ESL_SQFILE * sqfp)123 esl_sqncbi_Open(char *filename, int format, ESL_SQFILE *sqfp)
124 {
125 int i;
126 int status = eslOK; /* return status from an ESL call */
127
128 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
129
130 /* before we go any further, make sure we can handle the format */
131 if (format != eslSQFILE_NCBI && format != eslSQFILE_UNKNOWN) return eslENOTFOUND;
132
133 ncbi->fppin = NULL;
134 ncbi->fpphr = NULL;
135 ncbi->fppsq = NULL;
136
137 ncbi->title = NULL;
138 ncbi->timestamp = NULL;
139
140 ncbi->index = 0;
141
142 ncbi->hdr_off = -1;
143 ncbi->seq_off = -1;
144 ncbi->amb_off = -1;
145
146 ncbi->index_start = -1;
147 ncbi->index_end = -1;
148 ncbi->hdr_indexes = NULL;
149 ncbi->seq_indexes = NULL;
150 ncbi->amb_indexes = NULL;
151
152 ncbi->hdr_buf = NULL;
153 reset_header_values(ncbi);
154
155 ncbi->amb_off = 0;
156
157 ncbi->alphatype = eslUNKNOWN;
158 ncbi->alphasym = NULL;
159
160 ncbi->vol_index = -1;
161 ncbi->volumes = 0;
162
163 for (i = 0; i < MAX_DB_VOLUMES; ++i) {
164 ncbi->vols[i].name = NULL;
165 ncbi->vols[i].start_seq = -1;
166 ncbi->vols[i].end_seq = -1;
167 }
168
169 if ((status = sqncbi_Open(ncbi, filename)) != eslOK) goto ERROR;
170
171 sqfp->format = eslSQFILE_NCBI;
172 if ((status = inmap_ncbi(sqfp)) != eslOK) goto ERROR;
173
174 /* initialize the function pointers for the ncbi routines */
175 sqfp->position = &sqncbi_Position;
176 sqfp->close = &sqncbi_Close;
177
178 sqfp->set_digital = &sqncbi_SetDigital;
179 sqfp->guess_alphabet = &sqncbi_GuessAlphabet;
180
181 sqfp->is_rewindable = &sqncbi_IsRewindable;
182
183 sqfp->read = &sqncbi_Read;
184 sqfp->read_info = &sqncbi_ReadInfo;
185 sqfp->read_seq = &sqncbi_ReadSequence;
186 sqfp->read_window = &sqncbi_ReadWindow;
187 sqfp->echo = &sqncbi_Echo;
188
189 sqfp->read_block = &sqncbi_ReadBlock;
190
191 sqfp->get_error = &sqncbi_GetError;
192
193 return eslOK;
194
195 ERROR:
196 sqncbi_Close(sqfp);
197 return status;
198 }
199
200 /* sqncbi_ParseIndexFile()
201 *
202 * Parse an open index file verifying database type and version
203 * if handled. Read in the entries, i.e. name, number of sequences,
204 * etc. Keep track of the offsets where the header and sequence
205 * tables begin.
206 */
207 static int
sqncbi_ParseIndexFile(ESL_SQNCBI_DATA * ncbi,int dbtype)208 sqncbi_ParseIndexFile(ESL_SQNCBI_DATA *ncbi, int dbtype)
209 {
210 int len;
211 uint32_t info[4];
212 int status = eslOK; /* return status from an ESL call */
213
214 if (fread(&info[0], sizeof(uint32_t), 3, ncbi->fppin) != 3) status = eslFAIL;
215 if (htobe32(info[0]) != NCBI_VERSION_4) status = eslEFORMAT;
216 if (htobe32(info[1]) != dbtype) status = eslEUNIMPLEMENTED;
217
218 if (status != eslOK) goto ERROR;
219 ncbi->version = htobe32(info[0]);
220 ncbi->alphatype = (dbtype == NCBI_DNA_DB) ? eslDNA : eslAMINO;
221 ncbi->index = 0;
222
223 /* read the database title */
224 len = htobe32(info[2]);
225 ESL_ALLOC(ncbi->title, sizeof(char) * (len + 1));
226 if (fread(ncbi->title, sizeof(char), len, ncbi->fppin) != len) { status = eslFAIL; goto ERROR; }
227 ncbi->title[len] = 0;
228
229 /* read the database time stamp */
230 if (fread(&info[0], sizeof(uint32_t), 1, ncbi->fppin) != 1) { status = eslFAIL; goto ERROR; }
231 len = htobe32(info[0]);
232 ESL_ALLOC(ncbi->timestamp, sizeof(char) * (len + 1));
233 if (fread(ncbi->timestamp, sizeof(char), len, ncbi->fppin) != len) { status = eslFAIL; goto ERROR; }
234 ncbi->timestamp[len] = 0;
235
236 /* read in database stats */
237 if (fread(&info[0], sizeof(uint32_t), 4, ncbi->fppin) != 4) { status = eslFAIL; goto ERROR; }
238 ncbi->num_seq = htobe32(info[0]);
239 memcpy(&ncbi->total_res, info+1, sizeof(uint64_t));
240 ncbi->max_seq = htobe32(info[3]);
241
242 /* save the offsets to the index tables */
243 ncbi->hdr_off = ftell(ncbi->fppin);
244 ncbi->seq_off = ncbi->hdr_off + sizeof(uint32_t) * (ncbi->num_seq + 1);
245
246 return eslOK;
247
248 ERROR:
249 return status;
250 }
251
252 /* sqncbi_DbOpen()
253 *
254 * Try to open the database files. On successful opening of
255 * the files, index, header and sequence, the index file is
256 * parsed for validity.
257 */
258 static int
sqncbi_DbOpen(ESL_SQNCBI_DATA * ncbi,char * filename,int dbtype)259 sqncbi_DbOpen(ESL_SQNCBI_DATA *ncbi, char *filename, int dbtype)
260 {
261 int status = eslOK; /* return status from an ESL call */
262 int len;
263
264 char *name = NULL;
265
266 len = strlen(filename);
267 ESL_ALLOC(name, sizeof(char) * (len+5));
268 strcpy(name, filename);
269
270 /* Check for basic database first */
271 strcpy(name+len, ".Xin");
272 name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
273 if ((ncbi->fppin = fopen(name, "rb")) == NULL) {
274 status = eslENOTFOUND;
275 goto ERROR;
276 }
277 strcpy(name+len, ".Xhr");
278 name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
279 if ((ncbi->fpphr = fopen(name, "rb")) == NULL) {
280 status = eslENOTFOUND;
281 goto ERROR;
282 }
283 strcpy(name+len, ".Xsq");
284 name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
285 if ((ncbi->fppsq = fopen(name, "rb")) == NULL) {
286 status = eslENOTFOUND;
287 goto ERROR;
288 }
289
290 /* parse the header make sure we are looking at a version 4 db. */
291 if ((status = sqncbi_ParseIndexFile(ncbi, dbtype)) != eslOK) goto ERROR;
292
293 if (name != NULL) free(name);
294
295 return eslOK;
296
297 ERROR:
298
299 reset_db(ncbi);
300
301 if (name != NULL) free(name);
302 return status;
303 }
304
305
306 /* sqncbi_AliasOpen()
307 *
308 * Opens an alias file parsing the DBLIST directive building
309 * a list of all the volumes makeing up this database. As each
310 * volume is successfully parsed, the name of the volume, number
311 * of sequences and offsets are kept for quick reference.
312 */
313 static int
sqncbi_AliasOpen(ESL_SQNCBI_DATA * ncbi,char * filename,int dbtype)314 sqncbi_AliasOpen(ESL_SQNCBI_DATA *ncbi, char *filename, int dbtype)
315 {
316 int status = eslOK; /* return status from an ESL call */
317 int newline = 1;
318 int seqcnt = 0;
319 int rescnt = 0;
320 int done = 0;
321 int vol;
322 int len;
323
324 char *ptr = NULL;
325 char *name = NULL;
326 char buffer[80];
327
328 char *dbptr = NULL;
329 char *dbname = NULL;
330 int dbsize = 512;
331 int dblen = 0;
332
333 FILE *fp = NULL;
334
335 len = strlen(filename);
336 ESL_ALLOC(name, sizeof(char) * (len+5));
337 strcpy(name, filename);
338
339 /* Check for alias file */
340 strcpy(name+len, ".Xal");
341 name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
342 if ((fp = fopen(name, "r")) == NULL) {
343 status = eslENOTFOUND;
344 goto ERROR;
345 }
346
347 /* copy the filename */
348 dbsize = (dbsize > len) ? dbsize : len * 2;
349 ESL_ALLOC(dbname, sizeof(char) * dbsize);
350 strcpy(dbname, filename);
351
352 /* remove the filename keeping the path */
353 while (len > 0 && dbname[len-1] != eslDIRSLASH) {
354 dbname[len-1] = '\0';
355 --len;
356 }
357
358 /* find the DBLIST directive */
359 while (!done) {
360 ptr = buffer;
361 if (fgets(buffer, sizeof(buffer), fp) == NULL) {
362 status = eslEFORMAT;
363 goto ERROR;
364 }
365
366 if (newline) {
367 if (strncmp(buffer, "DBLIST", 6) == 0 && isspace(buffer[6])) {
368 done = 1;
369 ptr = buffer + 6;
370 }
371 }
372
373 /* skip to the end of the line */
374 if (!done) {
375 newline = 0;
376 while (*ptr != '\0') {
377 if (*ptr == '\n') newline = 1;
378 ++ptr;
379 }
380 }
381 }
382
383 /* parse the DBLIST line */
384 done = 0;
385 dblen = len;
386 while (!done) {
387
388 /* check if we hit the end of the buffer */
389 if (*ptr == '\0') {
390 ptr = buffer;
391 if (fgets(buffer, sizeof(buffer), fp) == NULL) {
392 status = eslEFORMAT;
393 goto ERROR;
394 }
395 }
396
397 /* skip spaces */
398 if (isspace(*ptr)) {
399 if (dblen > len) {
400 dbname[dblen++] = '\0';
401
402 /* if the name in the DBLIST directive was ablsolute, do not
403 * use the working directory as a prefix.
404 */
405 if (dbname[len] == eslDIRSLASH) {
406 dbptr = dbname + len;
407 } else {
408 dbptr = dbname;
409 }
410
411 status = sqncbi_DbOpen(ncbi, dbptr, dbtype);
412 if (status != eslOK) goto ERROR;
413
414 /* close any open files and free up allocations */
415 reset_db(ncbi);
416
417 /* if successful, copy the db information */
418 vol = ncbi->volumes++;
419
420 /* allocate the name of the string big enought so the buffer can
421 * handle an extension, i.e. ".pin" or ".nsq" tacked on to the end
422 * of it without reallocating.
423 */
424 ncbi->vols[vol].name = NULL;
425 ESL_ALLOC(ncbi->vols[vol].name, sizeof(char) * strlen(dbptr) + 5);
426 strcpy(ncbi->vols[vol].name, dbptr);
427
428 ncbi->vols[vol].start_seq = seqcnt;
429 ncbi->vols[vol].end_seq = seqcnt + ncbi->num_seq - 1;
430
431 ncbi->vols[vol].hdr_off = ncbi->hdr_off;
432 ncbi->vols[vol].seq_off = ncbi->seq_off;
433 ncbi->vols[vol].amb_off = ncbi->amb_off;
434
435 seqcnt += ncbi->num_seq;
436 rescnt += ncbi->total_res;
437
438 dblen = len;
439 }
440
441 done = *ptr == '\n' || *ptr == '\r';
442
443 ptr++;
444 } else {
445 dbname[dblen++] = *ptr++;
446 if (dblen >= dbsize - 1) {
447 char *t;
448 dbsize += dbsize;
449 ESL_RALLOC(dbname, t, dbsize);
450 }
451 }
452 }
453
454 /* reopen the first volume for processing */
455 if ((status = volume_open(ncbi, 0)) != eslOK) goto ERROR;
456
457 ncbi->num_seq = seqcnt;
458 ncbi->total_res = rescnt;
459
460 if (name != NULL) free(name);
461 if (dbname != NULL) free(dbname);
462
463 if (fp != NULL) fclose(fp);
464
465 return eslOK;
466
467 ERROR:
468
469 reset_db(ncbi);
470
471 if (fp != NULL) fclose(fp);
472
473 if (dbname != NULL) free(dbname);
474 if (name != NULL) free(name);
475
476 return status;
477 }
478
479
480 /* Function: sqncbi_Open()
481 * Synopsis: Open an ncbi database.
482 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
483 *
484 * Purpose: Open an ncbi database making sure all the necessry
485 * files are present. Parse the index file for database
486 * information filling in the ncbi data structure.
487 *
488 * Returns: <eslOK> on success, and the ncbi data structre is filled
489 * in with the database information.
490 *
491 * Returns <eslENOTFOUND> if <filename> can't be found or
492 * opened. Returns <eslEFORMAT> if the index file is an
493 * upsupported version.
494 *
495 * Throws: <eslEMEM> on allocation failure.
496 */
497 static int
sqncbi_Open(ESL_SQNCBI_DATA * ncbi,char * filename)498 sqncbi_Open(ESL_SQNCBI_DATA *ncbi, char *filename)
499 {
500 int status = eslOK; /* return status from an ESL call */
501 char *name = NULL;
502
503 /* first try to open a single protein database */
504 status = sqncbi_DbOpen(ncbi, filename, NCBI_AMINO_DB);
505
506 /* if the database was not found, look for protein volume */
507 if (status == eslENOTFOUND) {
508 status = sqncbi_AliasOpen(ncbi, filename, NCBI_AMINO_DB);
509 }
510
511 /* if nothing so far, try a dna database */
512 if (status == eslENOTFOUND) {
513 status = sqncbi_DbOpen(ncbi, filename, NCBI_DNA_DB);
514 }
515
516 /* still nothing, look for dna volume */
517 if (status == eslENOTFOUND) {
518 status = sqncbi_AliasOpen(ncbi, filename, NCBI_DNA_DB);
519 }
520
521 if (status != eslOK) goto ERROR;
522
523 /* allocate buffers used in parsing the database files */
524 ESL_ALLOC(ncbi->hdr_indexes, sizeof(uint32_t) * INDEX_TABLE_SIZE);
525 ESL_ALLOC(ncbi->seq_indexes, sizeof(uint32_t) * INDEX_TABLE_SIZE);
526
527 /* if this is a dna database we need to allocate space for the
528 * ambiguity offsets.
529 */
530 if (ncbi->alphatype == eslDNA) {
531 ncbi->amb_off = ncbi->seq_off + sizeof(uint32_t) * (ncbi->num_seq + 1);
532 ESL_ALLOC(ncbi->amb_indexes, sizeof(uint32_t) * INDEX_TABLE_SIZE);
533 }
534
535 ncbi->hdr_alloced = INIT_HDR_BUFFER_SIZE;
536 ESL_ALLOC(ncbi->hdr_buf, sizeof(unsigned char) * INIT_HDR_BUFFER_SIZE);
537
538 /* skip the first sentinal byte in the .psq file */
539 fgetc(ncbi->fppsq);
540
541 if (name != NULL) free(name);
542 return eslOK;
543
544 ERROR:
545 if (name != NULL) free(name);
546 return status;
547 }
548
549
550 /* Function: sqncbi_Position()
551 * Synopsis: Reposition an open sequence file to an offset.
552 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
553 *
554 * Purpose: Reposition an open <sqfp> to offset <offset>.
555 * <offset> for the ncbi db format specified the sequence
556 * index, not file offset. Both the sequence and header
557 * files are repositioned.
558 *
559 * Returns: <eslOK> on success;
560 *
561 * Throws: <eslESYS> if the fseeko() or fread() call fails.
562 * On errors, the state of <sqfp> is indeterminate, and
563 * it should not be used again.
564 */
565 static int
sqncbi_Position(ESL_SQFILE * sqfp,off_t offset)566 sqncbi_Position(ESL_SQFILE *sqfp, off_t offset)
567 {
568 int status;
569
570 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
571
572 status = pos_sequence(ncbi, offset);
573 return status;
574 }
575
576 /* Function: sqncbi_Close()
577 * Synopsis: Close a sequence file.
578 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
579 *
580 * Purpose: Closes an open <sqfp>.
581 *
582 * Returns: (void).
583 */
584 static void
sqncbi_Close(ESL_SQFILE * sqfp)585 sqncbi_Close(ESL_SQFILE *sqfp)
586 {
587 int i;
588
589 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
590
591 if (ncbi->title != NULL) free(ncbi->title);
592 if (ncbi->timestamp != NULL) free(ncbi->timestamp);
593
594 if (ncbi->hdr_buf != NULL) free(ncbi->hdr_buf);
595
596 if (ncbi->hdr_indexes != NULL) free(ncbi->hdr_indexes);
597 if (ncbi->seq_indexes != NULL) free(ncbi->seq_indexes);
598 if (ncbi->amb_indexes != NULL) free(ncbi->amb_indexes);
599
600 if (ncbi->alphasym != NULL) free(ncbi->alphasym);
601
602 if (ncbi->fppin != NULL) fclose(ncbi->fppin);
603 if (ncbi->fpphr != NULL) fclose(ncbi->fpphr);
604 if (ncbi->fppsq != NULL) fclose(ncbi->fppsq);
605
606 ncbi->vol_index = -1;
607 ncbi->volumes = 0;
608
609 for (i = 0; i < MAX_DB_VOLUMES; ++i) {
610 if (ncbi->vols[i].name != NULL) free(ncbi->vols[i].name);
611
612 ncbi->vols[i].name = NULL;
613 ncbi->vols[i].start_seq = -1;
614 ncbi->vols[i].end_seq = -1;
615 }
616
617 ncbi->fppin = NULL;
618 ncbi->fpphr = NULL;
619 ncbi->fppsq = NULL;
620
621 ncbi->title = NULL;
622 ncbi->timestamp = NULL;
623
624 ncbi->index = 0;
625
626 ncbi->hdr_off = -1;
627 ncbi->seq_off = -1;
628 ncbi->amb_off = -1;
629
630 ncbi->index_start = -1;
631 ncbi->index_end = -1;
632 ncbi->hdr_indexes = NULL;
633 ncbi->seq_indexes = NULL;
634 ncbi->amb_indexes = NULL;
635
636 ncbi->hdr_buf = NULL;
637
638 ncbi->alphatype = eslUNKNOWN;
639 ncbi->alphasym = NULL;
640
641 return;
642 }
643 /*------------------- SQNCBI open/close -----------------------*/
644
645
646 /*****************************************************************
647 *# 2. An <ESL_SQFILE> object, in digital mode [with <alphabet>]
648 *****************************************************************/
649
650 /* Function: sqncbi_SetDigital()
651 * Synopsis: Set an open <ESL_SQFILE> to read in digital mode.
652 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
653 *
654 * Purpose: Given an <ESL_SQFILE> that's already been opened,
655 * configure it to expect subsequent input to conform
656 * to the digital alphabet <abc>.
657 *
658 * Calling <esl_sqfile_Open(); esl_sqfile_SetDigital()> is
659 * equivalent to <esl_sqfile_OpenDigital()>. The two-step
660 * version is useful when you need a
661 * <esl_sqfile_GuessAlphabet()> call in between, guessing
662 * the file's alphabet in text mode before you set it to
663 * digital mode.
664 *
665 * Returns: <eslOK> on success.
666 */
667 static int
sqncbi_SetDigital(ESL_SQFILE * sqfp,const ESL_ALPHABET * abc)668 sqncbi_SetDigital(ESL_SQFILE *sqfp, const ESL_ALPHABET *abc)
669 {
670 return eslOK;
671 }
672
673 /* Function: sqncbi_GuessAlphabet()
674 * Synopsis: Guess the alphabet of an open <ESL_SQFILE>.
675 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
676 *
677 * Purpose: The only ncbi db format supported is protein.
678 *
679 * Returns: <eslOK> on success, and <*ret_type> is set to <eslAMINO>.
680 */
681 static int
sqncbi_GuessAlphabet(ESL_SQFILE * sqfp,int * ret_type)682 sqncbi_GuessAlphabet(ESL_SQFILE *sqfp, int *ret_type)
683 {
684 *ret_type = sqfp->data.ncbi.alphatype;
685 return eslOK;
686 }
687
688 /*-------------- end, digital mode SQNCBI -------------------*/
689
690
691
692
693 /*****************************************************************
694 *# 3. Miscellaneous routines
695 *****************************************************************/
696
697 /* Function: sqncbi_IsRewindable()
698 * Synopsis: Return <TRUE> if <sqfp> can be rewound.
699 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
700 *
701 * Purpose: Returns <TRUE> if <sqfp> can be rewound (positioned
702 * to an offset of zero), in order to read it a second
703 * time.
704 */
705 static int
sqncbi_IsRewindable(const ESL_SQFILE * sqfp)706 sqncbi_IsRewindable(const ESL_SQFILE *sqfp)
707 {
708 return TRUE;
709 }
710
711 /* Function: sqncbi_GetError()
712 * Synopsis: Returns error buffer
713 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
714 *
715 * Purpose: Return a pointer to the error buffer.
716 */
717 static const char *
sqncbi_GetError(const ESL_SQFILE * sqfp)718 sqncbi_GetError(const ESL_SQFILE *sqfp)
719 {
720 return sqfp->data.ncbi.errbuf;
721 }
722
723
724 /*****************************************************************
725 *# 4. Sequence reading (sequential)
726 *****************************************************************/
727
728 /* Function: sqncbi_Read()
729 * Synopsis: Read the next sequence from a file.
730 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
731 *
732 * Purpose: Reads the next sequence from open sequence file <sqfp> into
733 * <sq>. Caller provides an allocated and initialized <s>, which
734 * will be internally reallocated if its space is insufficient.
735 *
736 * Returns: <eslOK> on success; the new sequence is stored in <s>.
737 *
738 * Returns <eslEOF> when there is no sequence left in the
739 * file (including first attempt to read an empty file).
740 *
741 * Returns <eslEFORMAT> if there's a problem with the format,
742 * such as an illegal character; the line number that the parse
743 * error occurs on is in <sqfp->linenumber>, and an informative
744 * error message is placed in <sqfp->errbuf>.
745 *
746 * Throws: <eslEMEM> on allocation failure;
747 * <eslEINCONCEIVABLE> on internal error.
748 */
749 static int
sqncbi_Read(ESL_SQFILE * sqfp,ESL_SQ * sq)750 sqncbi_Read(ESL_SQFILE *sqfp, ESL_SQ *sq)
751 {
752 //int index;
753 int status;
754
755 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
756
757 //index = ncbi->index + 1;
758 if (ncbi->index >= ncbi->num_seq) return eslEOF;
759
760 if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
761
762 /* Disk offset bookkeeping */
763 sq->idx = ncbi->index;
764 sq->roff = ncbi->roff;
765 sq->doff = ncbi->doff;
766 sq->hoff = ncbi->hoff;
767 sq->eoff = ncbi->eoff;
768
769 if (ncbi->alphatype == eslAMINO)
770 status = read_amino(sqfp, sq);
771 else
772 status = read_dna(sqfp, sq);
773 if (status != eslOK) return status;
774
775 /* read and parse the ncbi header */
776 if ((status = parse_header(ncbi, sq)) != eslOK) return status;
777
778 (ncbi->index)++;
779
780 return eslOK;
781 }
782
783
784 /* Function: sqncbi_ReadInfo()
785 * Synopsis: Read sequence info, but not the sequence itself.
786 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
787 *
788 * Purpose: Read the next sequence from open sequence file <sqfp>,
789 * but don't store the sequence (or secondary structure).
790 * Upon successful return, <s> holds all the available
791 * information about the sequence -- its name, accession,
792 * description, and overall length <sq->L>.
793 *
794 * This is useful for indexing sequence files, where
795 * individual sequences might be ginormous, and we'd rather
796 * avoid reading complete seqs into memory.
797 *
798 * Returns: <eslOK> on success.
799 */
800 static int
sqncbi_ReadInfo(ESL_SQFILE * sqfp,ESL_SQ * sq)801 sqncbi_ReadInfo(ESL_SQFILE *sqfp, ESL_SQ *sq)
802 {
803 //int index;
804 int status;
805
806 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
807
808 //index = ncbi->index + 1;
809 if (ncbi->index >= ncbi->num_seq) return eslEOF;
810
811 if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
812
813 /* Disk offset bookkeeping */
814 sq->idx = ncbi->index;
815 sq->roff = ncbi->roff;
816 sq->doff = ncbi->doff;
817 sq->hoff = ncbi->hoff;
818 sq->eoff = ncbi->eoff;
819
820 /* figure out the sequence length */
821 sq->L = -1;
822
823 /* read and parse the ncbi header */
824 if ((status = parse_header(ncbi, sq)) != eslOK) return status;
825
826 (ncbi->index)++;
827
828 return eslOK;
829 }
830
831
832 /* Function: sqncbi_ReadSequence()
833 * Synopsis: Read the sequence, not the sequence header.
834 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
835 *
836 * Purpose: Read the next sequence from open sequence file <sqfp>,
837 * but not the header information. Upon successful return,
838 * <s> holds all the sequence.
839 *
840 * This is useful reading binary formats and delaying the
841 * over heads of reading the sequence name until needed by
842 * the report generator.
843 *
844 * Returns: <eslOK> on success; the new sequence is stored in <s>.
845 *
846 * Returns <eslEOF> when there is no sequence left in the
847 * file (including first attempt to read an empty file).
848 *
849 * Throws: <eslEMEM> on allocation failure;
850 */
851 static int
sqncbi_ReadSequence(ESL_SQFILE * sqfp,ESL_SQ * sq)852 sqncbi_ReadSequence(ESL_SQFILE *sqfp, ESL_SQ *sq)
853 {
854 //int index;
855 int status;
856
857 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
858
859 //index = ncbi->index + 1;
860 if (ncbi->index >= ncbi->num_seq) return eslEOF;
861
862 if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
863
864 /* Disk offset bookkeeping */
865 sq->idx = ncbi->index;
866 sq->roff = ncbi->roff;
867 sq->doff = ncbi->doff;
868 sq->hoff = ncbi->hoff;
869 sq->eoff = ncbi->eoff;
870
871 reset_header_values(ncbi);
872
873 if (ncbi->alphatype == eslAMINO)
874 status = read_amino(sqfp, sq);
875 else
876 status = read_dna(sqfp, sq);
877 if (status != eslOK) return status;
878
879 (ncbi->index)++;
880 return eslOK;
881 }
882
883
884 /* Function: sqncbi_ReadWindow()
885 * Synopsis: Read next window of sequence.
886 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
887 *
888 * Purpose: Read a next window of <W> residues from open file <sqfp>,
889 * keeping <C> residues from the previous window as
890 * context, and keeping previous annotation in the <sq>
891 * as before.
892 *
893 * If this is the first window of a new sequence record,
894 * <C> is ignored (there's no previous context yet), and
895 * the annotation fields of the <sq> (name, accession, and
896 * description) are initialized by reading the sequence
897 * record's header. This is the only time the annotation
898 * fields are initialized.
899 *
900 * On return, <sq->dsq[]> contains the window and its
901 * context; residues <1..sq->C> are the previous context,
902 * and residues <sq->C+1..sq->n> are the new window. The
903 * start and end coordinates of the whole <dsq[1..n]>
904 * (including context) in the original source sequence are
905 * <sq->start..sq->end>. (Or, for text mode sequences,
906 * <sq->seq[0..sq->C-1,sq->C..sq->n-1]>, while <start> and
907 * <end> coords are still <1..L>.)
908 *
909 * When a sequence record is completed and no more data
910 * remain, <eslEOD> is returned, with an ``info'' <sq>
911 * structure (containing the annotation and the total
912 * sequence length <L>, but no sequence). (The total
913 * sequence length <L> is unknown in <sq> until this
914 * <eslEOD> return.)
915 *
916 * The caller may then do one of two things before calling
917 * <esl_sq_ReadWindow()> again; it can reset the sequence
918 * with <esl_sq_Reuse()> to continue reading the next
919 * sequence in the file, or it can set a negative <W> as a
920 * signal to read windows from the reverse complement
921 * (Crick) strand. Reverse complement reading only works
922 * for nucleic acid sequence.
923 *
924 * If you read the reverse complement strand, you must read
925 * the whole thing, calling <esl_sqio_ReadWindow()> with
926 * negative <W> windows until <eslEOD> is returned again
927 * with an empty (info-only) <sq> structure. When that
928 * <EOD> is reached, the <sqfp> is repositioned at the
929 * start of the next sequence record; the caller should now
930 * <Reuse()> the <sq>, and the next <esl_sqio_ReadWindow()>
931 * call must have a positive <W>, corresponding to starting
932 * to read the Watson strand of the next sequence.
933 *
934 * Note that the <ReadWindow()> interface is designed for
935 * an idiom of sequential reading of complete sequences in
936 * overlapping windows, possibly on both strands; if you
937 * want more freedom to move around in the sequence
938 * grabbing windows in another order, you can use the
939 * <FetchSubseq()> interface.
940 *
941 * Reading the reverse complement strand requires file
942 * repositioning, so it will not work on non-repositionable
943 * streams like gzipped files or a stdin pipe. Moreover,
944 * for reverse complement input to be efficient, the
945 * sequence file should have consistent line lengths,
946 * suitable for SSI's fast subsequence indexing.
947 *
948 * Returns: <eslOK> on success; <sq> now contains next window of
949 * sequence, with at least 1 new residue. The number
950 * of new residues is <sq->W>; <sq->C> residues are
951 * saved from the previous window. Caller may now
952 * process residues <sq->dsq[sq->C+1]..sq->dsq[sq->n]>.
953 *
954 * <eslEOD> if no new residues were read for this sequence
955 * and strand, and <sq> now contains an empty info-only
956 * structure (annotation and <L> are valid). Before calling
957 * <esl_sqio_ReadWindow()> again, caller will either want
958 * to make <W> negative (to start reading the Crick strand
959 * of the current sequence), or it will want to reset the
960 * <sq> (with <esl_sq_Reuse()>) to go on the next sequence.
961 *
962 * <eslEOF> if we've already returned <eslEOD> before to
963 * signal the end of the previous seq record, and moreover,
964 * there's no more sequence records in the file.
965 *
966 * <eslEINVAL> if an invalid residue is found in the
967 * sequence, or if you attempt to take the reverse
968 * complement of a sequence that can't be reverse
969 * complemented.
970 *
971 * Throws: <eslESYNTAX> if you try to read a reverse window before
972 * you've read forward strand.
973 *
974 * <eslECORRUPT> if something goes awry internally in the
975 * coordinate system.
976 *
977 * <eslEMEM> on allocation error.
978 */
979 static int
sqncbi_ReadWindow(ESL_SQFILE * sqfp,int C,int W,ESL_SQ * sq)980 sqncbi_ReadWindow(ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq)
981 {
982 uint64_t nres;
983 int status;
984
985 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
986
987 /* Negative W indicates reverse complement direction */
988 if (W < 0)
989 {
990 if (sq->L == -1) ESL_EXCEPTION(eslESYNTAX, "Can't read reverse complement until you've read forward strand");
991
992 /* update the sequence index */
993 if ((status = sqncbi_Position(sqfp, sq->idx)) != eslOK)
994 ESL_FAIL(eslEINVAL, ncbi->errbuf, "Unexpected error positioning database to sequence %" PRId64, sq->idx);
995
996 if (sq->end == 1)
997 { /* last end == 1 means last window was the final one on reverse strand,
998 * so we're EOD; jump back to last forward position.
999 */
1000 sq->start = 0;
1001 sq->end = 0;
1002 sq->C = 0;
1003 sq->W = 0;
1004 sq->n = 0;
1005 /* sq->L stays as it is */
1006 if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL;
1007 else sq->seq[0] = '\0';
1008
1009 return eslEOD;
1010 }
1011
1012 /* If s == 0, we haven't read any reverse windows yet;
1013 * init reading from sq->L
1014 */
1015 W = -W;
1016 if (sq->start == 0)
1017 {
1018 sq->start = ESL_MAX(1, (sq->L - W + 1));
1019 sq->end = sq->start;
1020 sq->C = 0;
1021 }
1022 else
1023 { /* Else, we're continuing to next window; prv was <end>..<start> */
1024 sq->C = ESL_MIN(C, sq->L - sq->end + 1); /* based on prev window's end */
1025 sq->start = ESL_MAX(1, (sq->end - W));
1026 W = sq->end - sq->start + sq->C;
1027 sq->end = sq->start;
1028 }
1029
1030 /* grab the subseq and rev comp it */
1031 if ((status = esl_sq_GrowTo(sq, W)) != eslOK) return status;
1032 sq->n = 0;
1033 if (ncbi->alphatype == eslAMINO) status = read_nres_amino(sqfp, sq, W, &nres);
1034 else status = read_nres_dna(sqfp, sq, W, &nres);
1035
1036 if (status != eslOK || nres != W) {
1037 ESL_EXCEPTION(eslECORRUPT, "Failed to extract %d..%d", sq->start, sq->end);
1038 } else {
1039 sq->end = sq->start + nres - 1;
1040 sq->W = nres - sq->C;
1041 }
1042
1043 status = esl_sq_ReverseComplement(sq);
1044 if (status == eslEINVAL) ESL_FAIL(eslEINVAL, ncbi->errbuf, "can't reverse complement that seq - it's not DNA/RNA");
1045 else if (status != eslOK) return status;
1046
1047 return eslOK;
1048 }
1049
1050 /* Else, we're reading the forward strand */
1051 else
1052 { /* sq->start == 0 means we haven't read any windows on this sequence yet...
1053 * it's a new record, and we need to initialize with the header and
1054 * the first window. This is the only case that we're allowed to return
1055 * EOF from.
1056 */
1057 if (sq->start == 0)
1058 {
1059 if (ncbi->index >= ncbi->num_seq) return eslEOF;
1060
1061 /* get the sequence and header offsets */
1062 if ((status = pos_sequence(ncbi, ncbi->index)) != eslOK) return status;
1063
1064 /* Disk offset bookkeeping */
1065 sq->idx = ncbi->index;
1066 sq->roff = ncbi->roff;
1067 sq->doff = ncbi->doff;
1068 sq->hoff = ncbi->hoff;
1069 sq->eoff = ncbi->eoff;
1070
1071 ncbi->seq_cpos = -1;
1072 ncbi->seq_L = -1;
1073
1074 /* read and parse the ncbi header */
1075 if ((status = parse_header(ncbi, sq)) != eslOK) return status;
1076
1077 sq->start = 1;
1078 sq->C = 0; /* no context in first window */
1079 sq->L = -1; /* won't be known 'til EOD. */
1080 ncbi->seq_L = -1; /* init to 0, so we can count residues as we go */
1081 esl_sq_SetSource(sq, sq->name);
1082
1083 }
1084 else
1085 { /* else we're reading a window other than first; slide context over. */
1086 sq->C = ESL_MIN(C, sq->n);
1087
1088 /* if the case where the window is smaller than the context and the
1089 * context is not full, it is not necessary to move the context part
1090 * of the sequence that has been read in.
1091 */
1092 if (sq->C >= C) {
1093 if (sq->seq != NULL) memmove(sq->seq, sq->seq + sq->n - sq->C, sq->C);
1094 else memmove(sq->dsq+1, sq->dsq + sq->n - sq->C + 1, sq->C);
1095 sq->start = sq->end - sq->C + 1;
1096 sq->n = C;
1097 }
1098 }
1099
1100 if ((status = esl_sq_GrowTo(sq, C+W)) != eslOK) return status; /* EMEM */
1101 if (ncbi->alphatype == eslAMINO) status = read_nres_amino(sqfp, sq, W, &nres);
1102 else status = read_nres_dna(sqfp, sq, W, &nres);
1103
1104 if (status == eslEOD)
1105 {
1106 (ncbi->index)++;
1107 sq->start = 0;
1108 sq->end = 0;
1109 sq->C = 0;
1110 sq->W = 0;
1111 sq->n = 0;
1112
1113 if (sq->dsq != NULL) sq->dsq[1] = eslDSQ_SENTINEL; /* erase the saved context */
1114 else sq->seq[0] = '\0';
1115
1116 return eslEOD;
1117 }
1118 else if (status == eslOK)
1119 { /* Forward strand is still in progress. <= W residues were read. Return eslOK. */
1120 sq->end = sq->start + sq->C + nres - 1;
1121 sq->W = nres;
1122 return eslOK;
1123 }
1124 else return status; /* EFORMAT,EMEM */
1125 }
1126 /*NOTREACHED*/
1127 return eslOK;
1128 }
1129
1130 /* Function: sqncbi_ReadBlock()
1131 * Synopsis: Read the next block of sequences from a file.
1132 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
1133 *
1134 * Purpose: Reads a block of sequences from open sequence file <sqfp> into
1135 * <sqBlock>.
1136 *
1137 * In the case that <long_target> is false, the sequences are
1138 * expected to be protein - individual sequences won't be long
1139 * so read them in one-whole-sequence at a time. If <max_sequences>
1140 * is set to a number > 0 read <max_sequences> sequences.
1141 *
1142 * If <long_target> is true, the sequences are expected to be DNA.
1143 * Because sequences in a DNA database can exceed MAX_RESIDUE_COUNT,
1144 * this function uses ReadWindow to read chunks of sequence no
1145 * larger than <max_residues>, and must allow for the possibility that
1146 * a request will be made to continue reading a partly-read
1147 * sequence.
1148 *
1149 * Returns: <eslOK> on success; the new sequence is stored in <sqBlock>.
1150 *
1151 * Returns <eslEOF> when there is no sequence left in the
1152 * file (including first attempt to read an empty file).
1153 *
1154 * Returns <eslEFORMAT> if there's a problem with the format,
1155 * such as an illegal character;
1156 *
1157 * Throws: <eslEMEM> on allocation failure;
1158 * <eslEINCONCEIVABLE> on internal error.
1159 */
1160 static int
sqncbi_ReadBlock(ESL_SQFILE * sqfp,ESL_SQ_BLOCK * sqBlock,int max_residues,int max_sequences,int long_target)1161 sqncbi_ReadBlock(ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int long_target)
1162 {
1163 int i = 0;
1164 int size = 0;
1165 int status = eslOK;
1166 ESL_SQ *tmpsq = NULL;
1167
1168 sqBlock->count = 0;
1169
1170 if ( !long_target )
1171 { /* in these cases, an individual sequence won't ever be really long,
1172 so just read in a sequence at a time */
1173
1174 if (max_sequences < 1 || max_sequences > sqBlock->listSize)
1175 max_sequences = sqBlock->listSize;
1176
1177 for (i = 0; i < max_sequences && size < MAX_RESIDUE_COUNT; ++i)
1178 {
1179 status = sqncbi_Read(sqfp, sqBlock->list + i);
1180 if (status != eslOK) break;
1181 size += sqBlock->list[i].n;
1182 ++sqBlock->count;
1183 }
1184 }
1185 else
1186 { /* DNA, not an alignment. Might be really long sequences */
1187
1188 /*this variable was used instead of the MAX_RESIDUE_COUNT macro because old H3 impl_dummy could've required shorter sequences to fit in memory*/
1189 if (max_residues < 0)
1190 max_residues = MAX_RESIDUE_COUNT;
1191
1192 tmpsq = esl_sq_CreateDigital(sqBlock->list->abc);
1193
1194 //if complete flag set to FALSE, then the prior block must have ended with a window that was a possibly
1195 //incomplete part of it's full sequence. Read another overlapping window.
1196 if (! sqBlock->complete )
1197 {
1198 //overloading C as indicator of how big C should be for this window reading action
1199 status = sqncbi_ReadWindow(sqfp, sqBlock->list->C, max_residues, sqBlock->list);
1200 if (status == eslOK)
1201 {
1202 sqBlock->count = i = 1;
1203 size = sqBlock->list->n;
1204 sqBlock->list[i].L = sqfp->data.ncbi.seq_L;
1205 if (sqBlock->list->n >= max_residues)
1206 { // Filled the block with a single very long window.
1207
1208 if ( sqBlock->list->n == sqfp->data.ncbi.seq_L) {
1209 sqBlock->complete = TRUE;
1210 esl_sq_Reuse(tmpsq);
1211 tmpsq->start = sqBlock->list->start ;
1212 tmpsq->C = 0;
1213 status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq); // burn off the EOD
1214 if (status == eslEOD) // otherwise, the unexpected status will be returned
1215 status = eslOK;
1216
1217 } else {
1218 sqBlock->complete = FALSE;
1219 }
1220
1221 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1222 return status;
1223
1224 }
1225 else
1226 {
1227 // Burn off EOD (see notes for similar entry ~25 lines below), then go fetch the next sequence
1228 esl_sq_Reuse(tmpsq);
1229 tmpsq->start = sqBlock->list->start ;
1230 tmpsq->C = 0;
1231 status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq);
1232 if (status != eslEOD) {
1233 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1234 return status; //surprising
1235 }
1236 sqBlock->list->L = tmpsq->L;
1237 }
1238 }
1239 else if (status == eslEOD)
1240 { // turns out there isn't any more of the sequence to read, after all
1241
1242 }
1243 else
1244 {
1245 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1246 return status;
1247 }
1248 } // otherwise, just start at the beginning
1249
1250
1251 for ( ; i < sqBlock->listSize && size < max_residues; ++i)
1252 {
1253 /* restricted request_size is used to ensure that all blocks are pretty close to the
1254 * same size. Without it, we may either naively keep asking for max_residue windows,
1255 * which can result in a window with ~2*max_residues ... or we can end up with absurdly
1256 * short fragments at the end of blocks
1257 */
1258 int request_size = ESL_MAX(max_residues-size, max_residues * .05);
1259
1260 esl_sq_Reuse(tmpsq);
1261 esl_sq_Reuse(sqBlock->list + i);
1262 status = sqncbi_ReadWindow(sqfp, 0, request_size, tmpsq);
1263 esl_sq_Copy(tmpsq, sqBlock->list +i);
1264 if (status != eslOK) break; // end of sequences
1265
1266 size += sqBlock->list[i].n - sqBlock->list[i].C;
1267 sqBlock->list[i].L = sqfp->data.ncbi.seq_L;
1268 ++(sqBlock->count);
1269 if (size >= max_residues)
1270 { // a full window worth of sequence was read
1271
1272 if ( sqBlock->list[i].n == sqfp->data.ncbi.seq_L) {
1273 sqBlock->complete = TRUE;
1274 esl_sq_Reuse(tmpsq);
1275 tmpsq->start = sqBlock->list->start ;
1276 tmpsq->C = 0;
1277 status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq); // burn off the EOD
1278 if (status == eslEOD) // otherwise, the unexpected status will be returned
1279 status = eslOK;
1280
1281 } else {
1282 sqBlock->complete = FALSE;
1283 }
1284
1285
1286 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1287 return status;
1288
1289 }
1290 else
1291 {
1292 /* Sequence was finished before filling a full window. Need to burn off the EOD value that will be
1293 returned by the next ReadWindow call. Can just use a tmp sq, after setting a couple
1294 values ReadWindow needs to see for correct processing.
1295 */
1296 esl_sq_Reuse(tmpsq);
1297 tmpsq->start = sqBlock->list[i].start ;
1298 tmpsq->end = sqBlock->list[i].end ;
1299 tmpsq->n = sqBlock->list[i].n ;
1300 //tmpsq->doff = sqBlock->list[i].doff ;
1301 tmpsq->idx = sqBlock->list[i].idx ;
1302
1303 tmpsq->C = 0;
1304 status = sqncbi_ReadWindow(sqfp, 0, max_residues, tmpsq);
1305 if (status != eslEOD) {
1306 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1307 return status; //surprising
1308 }
1309 //sqBlock->list[i].L = tmpsq->L;
1310 status = eslOK;
1311 }
1312 }
1313 }
1314
1315 /* EOF will be returned only in the case were no sequences were read */
1316 if (status == eslEOF && i > 0) status = eslOK;
1317
1318 sqBlock->complete = TRUE;
1319
1320 if(tmpsq != NULL) esl_sq_Destroy(tmpsq);
1321
1322 return status;
1323 }
1324
1325 /* Function: sqncbi_Echo()
1326 * Synopsis: Echo a sequence's record onto output stream.
1327 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
1328 *
1329 * Returns: <eslEUNIMPLEMENTED>.
1330 */
1331 static int
sqncbi_Echo(ESL_SQFILE * sqfp,const ESL_SQ * sq,FILE * ofp)1332 sqncbi_Echo(ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp)
1333 {
1334 ESL_EXCEPTION(eslEINVAL, "can't Echo() a sequence from NCBI database");
1335 return eslEUNIMPLEMENTED;
1336 }
1337 /*------------------ end, sequential sequence input -------------*/
1338
1339
1340
1341 /* Function: reset_db()
1342 * Synopsis: Resets a sequence file.
1343 * Incept: MSF, Wed Mar 17, 2010 [Janelia]
1344 *
1345 * Purpose: Closes just the currently opened db and frees
1346 * up the memory associated with that db. This is
1347 * used primarily with multi-volume databases where
1348 * the current db is closed so the next one can be
1349 * opened.
1350 *
1351 * Returns: void.
1352 */
1353 static void
reset_db(ESL_SQNCBI_DATA * ncbi)1354 reset_db(ESL_SQNCBI_DATA *ncbi)
1355 {
1356 if (ncbi->title != NULL) free(ncbi->title);
1357 if (ncbi->timestamp != NULL) free(ncbi->timestamp);
1358
1359 if (ncbi->fppin != NULL) fclose(ncbi->fppin);
1360 if (ncbi->fpphr != NULL) fclose(ncbi->fpphr);
1361 if (ncbi->fppsq != NULL) fclose(ncbi->fppsq);
1362
1363 ncbi->fppin = NULL;
1364 ncbi->fpphr = NULL;
1365 ncbi->fppsq = NULL;
1366
1367 ncbi->title = NULL;
1368 ncbi->timestamp = NULL;
1369
1370 return;
1371 }
1372
1373
1374 /* reset_header_values()
1375 *
1376 * Clear the header values so it is clear which values
1377 * have been set by the current header.
1378 */
1379 static void
reset_header_values(ESL_SQNCBI_DATA * ncbi)1380 reset_header_values(ESL_SQNCBI_DATA *ncbi)
1381 {
1382 ncbi->name_ptr = NULL;
1383 ncbi->name_size = 0;
1384 ncbi->acc_ptr = NULL;
1385 ncbi->acc_size = 0;
1386 ncbi->int_id = -1;
1387 ncbi->str_id_ptr = NULL;
1388 ncbi->str_id_size = 0;
1389 }
1390
1391 /* volume_open()
1392 *
1393 * Open up the index, head and sequence files for a particular
1394 * volume. Parse the first three fields in the index file,
1395 * one more time, just to make sure nothing funny is going on.
1396 */
1397 static int
volume_open(ESL_SQNCBI_DATA * ncbi,int volume)1398 volume_open(ESL_SQNCBI_DATA *ncbi, int volume)
1399 {
1400 int len;
1401 uint32_t info[4];
1402 int dbtype;
1403 int status = eslOK; /* return status from an ESL call */
1404
1405 char *name;
1406
1407 if (volume < 0 || volume > ncbi->volumes) return eslEINVAL;
1408
1409 /* if the db has no volumes return */
1410 if (ncbi->volumes == 0) return eslOK;
1411
1412 if (ncbi->fppin != NULL) fclose(ncbi->fppin);
1413 if (ncbi->fpphr != NULL) fclose(ncbi->fpphr);
1414 if (ncbi->fppsq != NULL) fclose(ncbi->fppsq);
1415
1416 name = ncbi->vols[volume].name;
1417 len = strlen(name);
1418
1419 dbtype = (ncbi->alphatype == eslDNA) ? NCBI_DNA_DB : NCBI_AMINO_DB;
1420
1421 /* Check for basic database first */
1422 strcpy(name+len, ".Xin");
1423 name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
1424 if ((ncbi->fppin = fopen(name, "rb")) == NULL) {
1425 status = eslFAIL;
1426 goto ERROR;
1427 }
1428 strcpy(name+len, ".Xhr");
1429 name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
1430 if ((ncbi->fpphr = fopen(name, "rb")) == NULL) {
1431 status = eslFAIL;
1432 goto ERROR;
1433 }
1434 strcpy(name+len, ".Xsq");
1435 name[len+1] = (dbtype == NCBI_DNA_DB) ? 'n' : 'p';
1436 if ((ncbi->fppsq = fopen(name, "rb")) == NULL) {
1437 status = eslFAIL;
1438 goto ERROR;
1439 }
1440
1441 /* quickly parse the header make sure we are sane. */
1442 if (fread(&info[0], sizeof(uint32_t), 3, ncbi->fppin) != 3) status = eslFAIL;
1443 if (htobe32(info[0]) != NCBI_VERSION_4) status = eslEFORMAT;
1444 if (htobe32(info[1]) != dbtype) status = eslEFORMAT;
1445
1446 if (status != eslOK) goto ERROR;
1447
1448 /* save the offsets to the index tables */
1449 ncbi->hdr_off = ncbi->vols[volume].hdr_off;
1450 ncbi->seq_off = ncbi->vols[volume].seq_off;
1451 if (dbtype == NCBI_DNA_DB) {
1452 ncbi->amb_off = ncbi->vols[volume].amb_off;
1453 }
1454
1455 ncbi->vol_index = volume;
1456 ncbi->index_start = -1;
1457 ncbi->index_end = -1;
1458
1459 /* skip the first sentinal byte in the .psq file */
1460 fgetc(ncbi->fppsq);
1461
1462 /* zero terminate the name other functions can just
1463 * tack on any extension without a lot of testing.
1464 */
1465 name[len] = '\0';
1466
1467 return eslOK;
1468
1469 ERROR:
1470
1471 reset_db(ncbi);
1472
1473 return status;
1474 }
1475
1476
1477 /* pos_sequence_offsets()
1478 *
1479 * Position the sequence and header files for reading at
1480 * the start of the indexed sequence <inx>. This routine
1481 * buffers <INDEX_TABLE_SIZE> offsets from the header and
1482 * sequence files. If the index <inx> is not in the
1483 * currently buffered table, read the the indexes. If the
1484 * index is not in the current volume find which volume
1485 * the indexed sequence is in and open up that database.
1486 */
1487 static int
pos_sequence(ESL_SQNCBI_DATA * ncbi,int inx)1488 pos_sequence(ESL_SQNCBI_DATA *ncbi, int inx)
1489 {
1490 int cnt;
1491 int status;
1492
1493 uint32_t offset;
1494 uint32_t start;
1495 uint32_t end;
1496
1497 ESL_SQNCBI_VOLUME *volume;
1498
1499 if (inx < 0 || inx > ncbi->num_seq) return eslEINVAL;
1500
1501 start = ncbi->index_start;
1502 end = ncbi->index_end;
1503
1504 /* get the offsets for the header, sequence and ambiguity table */
1505 if (ncbi->index_start == -1 || inx < start || inx > end) {
1506
1507 /* if the db is broken up into volumes, lets find the correct one to use */
1508 if (ncbi->volumes > 0) {
1509 volume = ncbi->vols + ncbi->vol_index;
1510 if (inx < volume->start_seq || inx > volume->end_seq) {
1511 volume = ncbi->vols;
1512 for (cnt = 0; cnt < ncbi->volumes; ++cnt) {
1513 if (inx < volume->end_seq) break;
1514 ++volume;
1515 }
1516
1517 /* check just to make sure we found the volume */
1518 if (cnt >= ncbi->volumes) return eslFAIL;
1519
1520 if ((status = volume_open(ncbi, cnt)) != eslOK) return status;
1521 }
1522 }
1523
1524 /* adjust where we start reading from if we are reading forwards or backwards */
1525 if (ncbi->index_start == -1 || inx > end) {
1526 start = inx;
1527 } else {
1528 start = inx + 2;
1529 start = (start > INDEX_TABLE_SIZE) ? start - INDEX_TABLE_SIZE : 0;
1530 }
1531 ncbi->index_start = start;
1532
1533 /* when calculating the count be sure to take into account the fact that the
1534 * index tables contain one index more that the number of sequences and this
1535 * last index is used to point to the end of the last header and sequences.
1536 */
1537 if (ncbi->volumes > 0) {
1538 cnt = volume->end_seq - inx + 2; // cppcheck thinks end_seq can be uninitialized here. I think it's wrong.
1539 start = start - volume->start_seq; // .. and ditto for start_seq.
1540 } else {
1541 cnt = ncbi->num_seq - inx + 1;
1542 }
1543 cnt = (cnt > INDEX_TABLE_SIZE) ? INDEX_TABLE_SIZE : cnt;
1544 ncbi->index_end = ncbi->index_start + cnt - 2;
1545
1546 offset = ncbi->hdr_off + (sizeof(uint32_t) * start);
1547 if (fseek(ncbi->fppin, offset, SEEK_SET) != 0) {
1548 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error seeking header index %d\n", offset);
1549 }
1550 if (fread(ncbi->hdr_indexes, sizeof(uint32_t), cnt, ncbi->fppin) != cnt) {
1551 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error reading header index %d at %d(%d)\n", start, offset, cnt);
1552 }
1553
1554 offset = ncbi->seq_off + (sizeof(uint32_t) * start);
1555 if (fseek(ncbi->fppin, offset, SEEK_SET) != 0) {
1556 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error seeking sequence index %d\n", offset);
1557 }
1558 if (fread(ncbi->seq_indexes, sizeof(uint32_t), cnt, ncbi->fppin) != cnt) {
1559 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error reading sequence index %d at %d(%d)\n", start, offset, cnt);
1560 }
1561
1562 if (ncbi->alphatype == eslDNA) {
1563 offset = ncbi->amb_off + (sizeof(uint32_t) * start);
1564 if (fseek(ncbi->fppin, offset, SEEK_SET) != 0) {
1565 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error seeking ambiguity index %d\n", offset);
1566 }
1567 if (fread(ncbi->amb_indexes, sizeof(uint32_t), cnt, ncbi->fppin) != cnt) {
1568 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Error reading ambiguity index %d at %d(%d)\n", start, offset, cnt);
1569 }
1570 }
1571 }
1572
1573 ncbi->index = inx;
1574
1575 inx -= ncbi->index_start;
1576 ncbi->roff = htobe32(ncbi->hdr_indexes[inx]);
1577 ncbi->doff = htobe32(ncbi->seq_indexes[inx]);
1578 ncbi->hoff = htobe32(ncbi->hdr_indexes[inx+1]);
1579 ncbi->eoff = htobe32(ncbi->seq_indexes[inx+1]);
1580
1581 if (ncbi->alphatype == eslDNA) {
1582 ncbi->seq_apos = htobe32(ncbi->amb_indexes[inx]);
1583 ncbi->seq_alen = ncbi->seq_apos + htobe32(ncbi->amb_indexes[inx+1]) + 1;
1584 } else {
1585 ncbi->seq_apos = 0;
1586 ncbi->seq_alen = 0;
1587 }
1588
1589 if (fseek(ncbi->fpphr, ncbi->roff, SEEK_SET) != 0) return eslESYS;
1590 if (fseek(ncbi->fppsq, ncbi->doff, SEEK_SET) != 0) return eslESYS;
1591
1592 return eslOK;
1593 }
1594
1595 /* Function: read_amino()
1596 * Synopsis: Read in the amino sequence
1597 * Incept: MSF, Wed Jan 27, 2010 [Janelia]
1598 *
1599 * Purpose: Read and translate the amino acid sequence.
1600 */
1601 static int
read_amino(ESL_SQFILE * sqfp,ESL_SQ * sq)1602 read_amino(ESL_SQFILE *sqfp, ESL_SQ *sq)
1603 {
1604 int inx;
1605 int size;
1606
1607 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1608
1609 if (ncbi->index >= ncbi->num_seq) return eslEOF;
1610
1611 size = sq->eoff - sq->doff;
1612
1613 /* figure out the sequence length */
1614 if (esl_sq_GrowTo(sq, size) != eslOK) return eslEMEM;
1615
1616 /* figure out if the sequence is in digital mode or not */
1617 if (sq->dsq != NULL) {
1618 ESL_DSQ *ptr = sq->dsq + 1;
1619 if (fread(ptr, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1620 for (inx = 0; inx < size - 1; ++inx) {
1621 *ptr = sqfp->inmap[(int) *ptr];
1622 ++ptr;
1623 }
1624 *ptr = eslDSQ_SENTINEL;
1625 } else {
1626 char *ptr = sq->seq;
1627 if (fread(ptr, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1628 for (inx = 0; inx < size - 1; ++inx) {
1629 *ptr = sqfp->inmap[(int) *ptr];
1630 *ptr = ncbi->alphasym[(int) *ptr];
1631 ++ptr;
1632 }
1633 *ptr = '\0';
1634 }
1635
1636 sq->start = 1;
1637 sq->end = size - 1;
1638 sq->C = 0;
1639 sq->W = size - 1;
1640 sq->L = size - 1;
1641 sq->n = size - 1;
1642
1643 return eslOK;
1644 }
1645
1646
1647 /* Function: read_dna()
1648 * Synopsis: Read in the dna sequence
1649 * Incept: MSF, Wed Jan 27, 2010 [Janelia]
1650 *
1651 * Purpose: Read and translate the dna sequence.
1652 */
1653 static int
read_dna(ESL_SQFILE * sqfp,ESL_SQ * sq)1654 read_dna(ESL_SQFILE *sqfp, ESL_SQ *sq)
1655 {
1656 int64_t inx;
1657 int64_t cnt;
1658 int64_t off;
1659 int size;
1660 int text;
1661 int status;
1662 int amb32;
1663
1664 int remainder;
1665 int length;
1666 int ssize;
1667 int n;
1668
1669 unsigned char *ptr;
1670 void *t;
1671
1672 unsigned char c;
1673
1674 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1675
1676 if (ncbi->index >= ncbi->num_seq) return eslEOF;
1677
1678 /* calculate the max size of the sequence. It is most likely
1679 * a bit smaller, but we need to read the sequence in first
1680 * before the real size can be figured out.
1681 */
1682 size = sq->eoff - sq->doff;
1683 if (ncbi->hdr_alloced < size) {
1684 while (ncbi->hdr_alloced < size) ncbi->hdr_alloced += ncbi->hdr_alloced;
1685 ESL_RALLOC(ncbi->hdr_buf, t, sizeof(char) * ncbi->hdr_alloced);
1686 }
1687 if (fread(ncbi->hdr_buf, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1688
1689 ssize = ncbi->seq_apos - sq->doff - 1;
1690 remainder = *(ncbi->hdr_buf + ssize) & 0x03;
1691 length = ssize * 4 + remainder;
1692
1693 /* figure out the sequence length */
1694 if (esl_sq_GrowTo(sq, length) != eslOK) return eslEMEM;
1695
1696 /* figure out if the sequence is in digital mode or not */
1697 if (sq->dsq != NULL) {
1698 text = FALSE;
1699 ptr = (unsigned char *) sq->dsq + 1;
1700 } else {
1701 text = TRUE;
1702 ptr = (unsigned char *) sq->seq;
1703 }
1704
1705 for (inx = 0; inx < ssize; ++inx) {
1706 c = ncbi->hdr_buf[inx];
1707 n = 1 << ((c >> 6) & 0x03);
1708 *ptr = sqfp->inmap[n];
1709 if (text) *ptr = ncbi->alphasym[(int) *ptr];
1710 ++ptr;
1711 n = 1 << ((c >> 4) & 0x03);
1712 *ptr = sqfp->inmap[n];
1713 if (text) *ptr = ncbi->alphasym[(int) *ptr];
1714 ++ptr;
1715 n = 1 << ((c >> 2) & 0x03);
1716 *ptr = sqfp->inmap[n];
1717 if (text) *ptr = ncbi->alphasym[(int) *ptr];
1718 ++ptr;
1719 n = 1 << ((c >> 0) & 0x03);
1720 *ptr = sqfp->inmap[n];
1721 if (text) *ptr = ncbi->alphasym[(int) *ptr];
1722 ++ptr;
1723 }
1724
1725 /* handle the remainder */
1726 c = ncbi->hdr_buf[inx];
1727 for (inx = 0; inx < remainder; ++inx) {
1728 n = 1 << ((c >> (6 - inx * 2)) & 0x03);
1729 *ptr = sqfp->inmap[n];
1730 if (text) *ptr = ncbi->alphasym[(int) *ptr];
1731 ++ptr;
1732 }
1733
1734 *ptr = (text) ? '\0' : eslDSQ_SENTINEL;
1735
1736 /* we need to look that the first by the the ambiguity table
1737 * to see if the entries are 32 or 64 bit entries.
1738 */
1739 amb32 = 0;
1740 if (ncbi->seq_apos - sq->doff < size) {
1741 amb32 = ((ncbi->hdr_buf[ncbi->seq_apos - sq->doff] & 0x80) == 0);
1742 }
1743
1744 /* skip past the count and start processing the abmiguity table */
1745 ssize = ncbi->seq_apos - sq->doff + 4;
1746 ptr = (text) ? (unsigned char *)sq->seq : (unsigned char *)sq->dsq + 1;
1747
1748 while (ssize < size) {
1749 /* get the ambiguity character */
1750 n = ((ncbi->hdr_buf[ssize] >> 4) & 0x0f);
1751 c = sqfp->inmap[n];
1752 if (text) c = ncbi->alphasym[(int) c];
1753
1754 if (amb32) {
1755 /* get the repeat count 4 bits */
1756 cnt = (ncbi->hdr_buf[ssize] & 0x0f);
1757 cnt += 1;
1758
1759 /* get the offset 24 bits */
1760 off = ncbi->hdr_buf[ssize+1];
1761 off = (off << 8) | ncbi->hdr_buf[ssize+2];
1762 off = (off << 8) | ncbi->hdr_buf[ssize+3];
1763
1764 for (inx = 0; inx < cnt; ++inx) ptr[off+inx] = c;
1765
1766 ssize += 4;
1767 } else {
1768 /* get the repeat count 12 bits */
1769 cnt = (ncbi->hdr_buf[ssize] & 0x0f);
1770 cnt = (cnt << 8) | ncbi->hdr_buf[ssize+1];
1771 cnt += 1;
1772
1773 /* get the offset 48 bits*/
1774 off = ncbi->hdr_buf[ssize+2];
1775 off = (off << 8) | ncbi->hdr_buf[ssize+3];
1776 off = (off << 8) | ncbi->hdr_buf[ssize+4];
1777 off = (off << 8) | ncbi->hdr_buf[ssize+5];
1778 off = (off << 8) | ncbi->hdr_buf[ssize+6];
1779 off = (off << 8) | ncbi->hdr_buf[ssize+7];
1780
1781 for (inx = 0; inx < cnt; ++inx) ptr[off+inx] = c;
1782
1783 ssize += 8;
1784 }
1785 }
1786
1787 sq->start = 1;
1788 sq->end = length;
1789 sq->C = 0;
1790 sq->W = length;
1791 sq->L = length;
1792 sq->n = length;
1793
1794 return eslOK;
1795
1796 ERROR:
1797 return eslEMEM;
1798 }
1799
1800
1801 /* Function: read_nres_amino()
1802 * Synopsis: Read in the amino sequence
1803 * Incept: MSF, Wed Jan 27, 2010 [Janelia]
1804 *
1805 * Purpose: Read and translate the dna sequence.
1806 */
1807 static int
read_nres_amino(ESL_SQFILE * sqfp,ESL_SQ * sq,int len,uint64_t * nres)1808 read_nres_amino(ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres)
1809 {
1810 int inx;
1811 int off;
1812 int size;
1813
1814 unsigned char *ptr;
1815 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1816
1817 if (ncbi->index >= ncbi->num_seq) return eslEOF;
1818
1819 /* if we don't know the sequence length, figure it out */
1820 if (ncbi->seq_L == -1) ncbi->seq_L = sq->eoff - sq->doff - 1;
1821
1822 /* check if we are at the end */
1823 if (sq->start + sq->n > ncbi->seq_L) {
1824 if (nres != NULL) *nres = 0;
1825 sq->L = ncbi->seq_L;
1826 return eslEOD;
1827 }
1828
1829 /* figure out if the sequence is in digital mode or not */
1830 ptr = (sq->dsq != NULL) ? (unsigned char *)sq->dsq + 1 : (unsigned char *) sq->seq;
1831 ptr += sq->n;
1832
1833 /* calculate where to start reading from */
1834 off = sq->doff + sq->start + sq->n - 1;
1835
1836 /* calculate the size to read */
1837 size = ncbi->seq_L - (sq->start + sq->n - 1);
1838 size = (size > len) ? len : size;
1839
1840 /* seek to the windows location and read into the buffer */
1841 if (fseek(ncbi->fppsq, off, SEEK_SET) != 0) return eslESYS;
1842 if (fread(ptr, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1843
1844 /* figure out if the sequence is in digital mode or not */
1845 for (inx = 0; inx < size; ++inx) {
1846 *ptr = sqfp->inmap[(int) *ptr];
1847 if (sq->dsq == NULL) *ptr = ncbi->alphasym[(int) *ptr];
1848 ++ptr;
1849 }
1850
1851 *ptr = (sq->dsq == NULL) ? '\0' : eslDSQ_SENTINEL;
1852
1853 sq->n = sq->n + size;
1854
1855 if (nres != NULL) *nres = size;
1856
1857 return eslOK;
1858 }
1859
1860 /* Function: correct_ambiguity()
1861 * Synopsis: Read in the dna sequence
1862 * Incept: MSF, Thu Feb 4, 2010 [Janelia]
1863 *
1864 * Purpose: Correct any ambiguity characters.
1865 */
1866 static int
correct_ambiguity(ESL_SQFILE * sqfp,ESL_SQ * sq,int len)1867 correct_ambiguity(ESL_SQFILE *sqfp, ESL_SQ *sq, int len)
1868 {
1869 int64_t alen; /* ambiguity length */
1870 int64_t soff; /* starting offset */
1871 int64_t eoff; /* ending offset */
1872 int64_t ainx; /* ambiguity index */
1873 int64_t size; /* size of table read in */
1874 int64_t cnt; /* repeat count */
1875 int64_t off;
1876 int64_t n;
1877
1878 int amb32; /* flag for 32 or 64 bits */
1879 char *ptr;
1880
1881 unsigned char c;
1882
1883 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1884
1885 if (ncbi->seq_alen == 0) return eslOK;
1886
1887 /* go to the start of the ambiguity table and see if the table
1888 * is in 32 or 64 bit entries.
1889 */
1890 if (fseek(ncbi->fppsq, ncbi->seq_apos, SEEK_SET) != 0) return eslESYS;
1891 if (fread(ncbi->hdr_buf, sizeof(char), 4, ncbi->fppsq) != 4) return eslEFORMAT;
1892 amb32 = ((ncbi->hdr_buf[0] & 0x80) == 0);
1893
1894 ptr = (sq->dsq != NULL) ? (char *)sq->dsq + 1 : sq->seq;
1895 ptr += sq->n;
1896
1897 /* calculate the starting and ending offsets */
1898 soff = sq->start + sq->n - 1;
1899 eoff = soff + len;
1900
1901 off = 0;
1902 ainx = 0;
1903 size = 0;
1904 alen = ncbi->seq_alen - 4;
1905 while (off < eoff) {
1906 /* check if we need to read in more of the abmiguity table */
1907 if (ainx == size) {
1908 size = alen;
1909 size = (size > INIT_HDR_BUFFER_SIZE) ? INIT_HDR_BUFFER_SIZE : size;
1910 if (fread(ncbi->hdr_buf, sizeof(char), size, ncbi->fppsq) != size) return eslEFORMAT;
1911 alen -= size;
1912 ainx = 0;
1913 }
1914
1915 /* get the ambiguity character */
1916 n = ((ncbi->hdr_buf[ainx] >> 4) & 0x0f);
1917 c = sqfp->inmap[n];
1918 if (sq->dsq == NULL) c = ncbi->alphasym[(int) c];
1919
1920 if (amb32) {
1921 /* get the repeat count 4 bits */
1922 cnt = (ncbi->hdr_buf[ainx] & 0x0f);
1923 cnt += 1;
1924
1925 /* get the offset 24 bits */
1926 off = ncbi->hdr_buf[ainx+1];
1927 off = (off << 8) | ncbi->hdr_buf[ainx+2];
1928 off = (off << 8) | ncbi->hdr_buf[ainx+3];
1929
1930 ainx += 4;
1931 } else {
1932 /* get the repeat count 12 bits */
1933 cnt = (ncbi->hdr_buf[ainx] & 0x0f);
1934 cnt = (cnt << 8) | ncbi->hdr_buf[ainx+1];
1935 cnt += 1;
1936
1937 /* get the offset 48 bits*/
1938 off = ncbi->hdr_buf[ainx+2];
1939 off = (off << 8) | ncbi->hdr_buf[ainx+3];
1940 off = (off << 8) | ncbi->hdr_buf[ainx+4];
1941 off = (off << 8) | ncbi->hdr_buf[ainx+5];
1942 off = (off << 8) | ncbi->hdr_buf[ainx+6];
1943 off = (off << 8) | ncbi->hdr_buf[ainx+7];
1944
1945 ainx += 8;
1946 }
1947
1948 if (off + cnt >= soff && off < eoff) {
1949 int inx;
1950 int start = (off > soff) ? off - soff : 0;
1951 int end = (off + cnt > eoff) ? eoff : off - soff + cnt;
1952 for (inx = start; inx < end; ++inx) ptr[inx] = c;
1953 }
1954
1955 off += cnt;
1956 }
1957
1958 return eslOK;
1959 }
1960
1961 /* Function: read_nres_dna()
1962 * Synopsis: Read in the dna sequence
1963 * Incept: MSF, Wed Jan 27, 2010 [Janelia]
1964 *
1965 * Purpose: Read and translate the dna sequence.
1966 */
1967 static int
read_nres_dna(ESL_SQFILE * sqfp,ESL_SQ * sq,int len,uint64_t * nres)1968 read_nres_dna(ESL_SQFILE *sqfp, ESL_SQ *sq, int len, uint64_t *nres)
1969 {
1970 int inx;
1971 int off;
1972 int cnt;
1973 int ncnt;
1974 int start;
1975 int skip;
1976 int text;
1977 int status;
1978
1979 int remainder;
1980 int length;
1981 int ssize;
1982 int n;
1983
1984 unsigned char *ptr;
1985 void *t;
1986
1987 unsigned char c;
1988
1989 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
1990
1991 if (ncbi->index >= ncbi->num_seq) return eslEOF;
1992
1993 /* if we don't know the sequence length, figure it out */
1994 if (ncbi->seq_L == -1) {
1995 if (fseek(ncbi->fppsq, ncbi->seq_apos - 1, SEEK_SET) != 0) return eslESYS;
1996 if (fread(&c, sizeof(char), 1, ncbi->fppsq) != 1) return eslEFORMAT;
1997
1998 ssize = ncbi->seq_apos - sq->doff - 1;
1999 remainder = c & 0x03;
2000 length = ssize * 4 + remainder;
2001
2002 ncbi->seq_L = length;
2003 }
2004
2005 /* check if we are at the end */
2006 if (sq->start + sq->n > ncbi->seq_L) {
2007 if (nres != NULL) *nres = 0;
2008 sq->L = ncbi->seq_L;
2009 return eslEOD;
2010 }
2011
2012 /* calculate where to start reading from */
2013 start = sq->start + sq->n - 1;
2014 off = sq->doff + start / 4;
2015
2016 /* calculate bits to skip at the beginning and end */
2017 cnt = ncbi->seq_L - (sq->start + sq->n - 1);
2018 cnt = (cnt > len) ? len : cnt;
2019
2020 skip = start & 0x03;
2021 remainder = skip + cnt;
2022 remainder = (remainder & 0x03) ? (remainder & 0x03) : 4;
2023
2024 /* calculate bytes need to read in the window */
2025 ssize = (cnt + skip + 3) / 4;
2026
2027 /* seek to the windows location and read into the buffer */
2028 if (fseek(ncbi->fppsq, off, SEEK_SET) != 0) return eslESYS;
2029 if (ncbi->hdr_alloced < ssize) {
2030 while (ncbi->hdr_alloced < ssize) ncbi->hdr_alloced += ncbi->hdr_alloced;
2031 ESL_RALLOC(ncbi->hdr_buf, t, sizeof(char) * ncbi->hdr_alloced);
2032 }
2033 if (fread(ncbi->hdr_buf, sizeof(char), ssize, ncbi->fppsq) != ssize) return eslEFORMAT;
2034
2035 /* figure out if the sequence is in digital mode or not */
2036 if (sq->dsq != NULL) {
2037 text = FALSE;
2038 ptr = (unsigned char *)sq->dsq + 1;
2039 } else {
2040 text = TRUE;
2041 ptr = (unsigned char *)sq->seq;
2042 }
2043 ptr += sq->n;
2044
2045 inx = 0;
2046 c = ncbi->hdr_buf[inx];
2047 for (inx = skip; inx < 4; ++inx) {
2048 n = 1 << ((c >> (6 - inx * 2)) & 0x03);
2049 *ptr = sqfp->inmap[n];
2050 if (text) *ptr = ncbi->alphasym[(int) *ptr];
2051 ++ptr;
2052 }
2053
2054 for (inx = 1; inx < ssize - 1; ++inx) {
2055 c = ncbi->hdr_buf[inx];
2056 n = 1 << ((c >> 6) & 0x03);
2057 *ptr = sqfp->inmap[n];
2058 if (text) *ptr = ncbi->alphasym[(int) *ptr];
2059 ++ptr;
2060 n = 1 << ((c >> 4) & 0x03);
2061 *ptr = sqfp->inmap[n];
2062 if (text) *ptr = ncbi->alphasym[(int) *ptr];
2063 ++ptr;
2064 n = 1 << ((c >> 2) & 0x03);
2065 *ptr = sqfp->inmap[n];
2066 if (text) *ptr = ncbi->alphasym[(int) *ptr];
2067 ++ptr;
2068 n = 1 << ((c >> 0) & 0x03);
2069 *ptr = sqfp->inmap[n];
2070 if (text) *ptr = ncbi->alphasym[(int) *ptr];
2071 ++ptr;
2072 }
2073
2074 /* handle the remainder */
2075 c = ncbi->hdr_buf[inx];
2076 for (inx = 0; inx < remainder; ++inx) {
2077 n = 1 << ((c >> (6 - inx * 2)) & 0x03);
2078 *ptr = sqfp->inmap[n];
2079 if (text) *ptr = ncbi->alphasym[(int) *ptr];
2080 ++ptr;
2081 }
2082
2083 *ptr = (text) ? '\0' : eslDSQ_SENTINEL;
2084
2085 /* calculate the number of residues processed */
2086 ncnt = (ssize - 1) * 4 + remainder - skip;
2087
2088 /* start processing the abmiguity table if there is one */
2089 if (ncbi->seq_alen > 0) {
2090 correct_ambiguity(sqfp, sq, ncnt);
2091 }
2092
2093 sq->n = sq->n + ncnt;
2094
2095 if (nres != NULL) *nres = ncnt;
2096
2097 return eslOK;
2098
2099 ERROR:
2100 return eslEMEM;
2101 }
2102
2103
2104 /* Function: inmap_ncbi()
2105 * Synopsis: Set up a translation map
2106 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2107 *
2108 * Purpose: Initialize the translation map used to translate a ncbi
2109 * sequences to the internal representation used in hmmer.
2110 *
2111 * Returns: <eslOK> on success;
2112 *
2113 * Throws: <eslEMEM> on allocation failure;
2114 */
2115 static int
inmap_ncbi(ESL_SQFILE * sqfp)2116 inmap_ncbi(ESL_SQFILE *sqfp)
2117 {
2118 int status = eslOK;
2119
2120 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
2121
2122 switch(ncbi->alphatype) {
2123 case eslDNA:
2124 status = inmap_ncbi_dna(sqfp);
2125 break;
2126 case eslAMINO:
2127 status = inmap_ncbi_amino(sqfp);
2128 break;
2129 default:
2130 ESL_EXCEPTION(eslEINVAL, "bad alphabet type: unrecognized");
2131 }
2132
2133 return status;
2134 }
2135
2136 /* Function: inmap_ncbi_dna()
2137 * Synopsis: Set up a translation map
2138 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2139 *
2140 * Purpose: Initialize the translation map used to translate a ncbi
2141 * protein sequence to the internal representation used in
2142 * hmmer.
2143 *
2144 * Returns: <eslOK> on success;
2145 *
2146 * Throws: <eslEMEM> on allocation failure;
2147 */
2148 static int
inmap_ncbi_dna(ESL_SQFILE * sqfp)2149 inmap_ncbi_dna(ESL_SQFILE *sqfp)
2150 {
2151 int x, y;
2152 const char *ncbisym = "-ACMGRSVTWYHKDBN";
2153
2154 ESL_ALPHABET *abc = NULL;
2155 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
2156
2157 if ((abc = esl_alphabet_Create(eslDNA)) == NULL) return eslEMEM;
2158
2159 for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2160
2161 /* for each letter in the ncbi alphabet, find that letter in the
2162 * hmmer alphabet and map the translation.
2163 */
2164 for (x = 0; x < strlen(ncbisym); ++x) {
2165 for (y = 0; y < strlen(abc->sym); ++y) {
2166 if (ncbisym[x] == abc->sym[y]) {
2167 sqfp->inmap[x] = y;
2168 break;
2169 }
2170 }
2171
2172 /* there is a problem if a translation does not exist */
2173 if (y >= strlen(abc->sym)) return eslEFORMAT;
2174 }
2175
2176 if (ncbi->alphasym == NULL) esl_strdup(abc->sym, -1, &ncbi->alphasym);
2177
2178 esl_alphabet_Destroy(abc);
2179
2180 return eslOK;
2181 }
2182
2183
2184 /* Function: inmap_ncbi_amino()
2185 * Synopsis: Set up a translation map
2186 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2187 *
2188 * Purpose: Initialize the translation map used to translate a ncbi
2189 * protein sequence to the internal representation used in
2190 * hmmer.
2191 *
2192 * Returns: <eslOK> on success;
2193 *
2194 * Throws: <eslEMEM> on allocation failure;
2195 */
2196 static int
inmap_ncbi_amino(ESL_SQFILE * sqfp)2197 inmap_ncbi_amino(ESL_SQFILE *sqfp)
2198 {
2199 int x, y;
2200 const char *ncbisym = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ";
2201
2202 ESL_ALPHABET *abc = NULL;
2203 ESL_SQNCBI_DATA *ncbi = &sqfp->data.ncbi;
2204
2205 if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) return eslEMEM;
2206
2207 for (x = 0; x < 128; x++) sqfp->inmap[x] = eslDSQ_ILLEGAL;
2208
2209 /* for each letter in the ncbi alphabet, find that letter in the
2210 * hmmer alphabet and map the translation.
2211 */
2212 for (x = 0; x < strlen(ncbisym); ++x) {
2213 for (y = 0; y < strlen(abc->sym); ++y) {
2214 if (ncbisym[x] == abc->sym[y]) {
2215 sqfp->inmap[x] = y;
2216 break;
2217 }
2218 }
2219
2220 /* there is a problem if a translation does not exist */
2221 if (y >= strlen(abc->sym)) return eslEFORMAT;
2222 }
2223
2224 if (ncbi->alphasym == NULL) esl_strdup(abc->sym, -1, &ncbi->alphasym);
2225
2226 esl_alphabet_Destroy(abc);
2227
2228 return eslOK;
2229 }
2230
2231
2232
2233 /*****************************************************************
2234 *# 5. Parsing routines
2235 *****************************************************************/
2236
2237 /* Function: parse_expect()
2238 * Synopsis: Expect the next bytes to parse match
2239 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2240 *
2241 * Purpose: Match if the next <len> bytes to parse match the bytes
2242 * in <str>. If the bytes do not match, throw <eslEFORMAT>
2243 * error. Advance the parsers pointer.
2244 *
2245 * Returns: <eslOK> on success
2246 *
2247 * Throws: <eslEFORMAT> if there are insufficient bytes remaining
2248 * in the header or if the data to parse does not match
2249 * what is expected.
2250 */
2251 static int
parse_expect(ESL_SQNCBI_DATA * ncbi,void * str,int len)2252 parse_expect(ESL_SQNCBI_DATA *ncbi, void *str, int len)
2253 {
2254 int size;
2255 unsigned char *c;
2256 unsigned char *limit;
2257
2258 size = ncbi->hoff - ncbi->roff;
2259 limit = ncbi->hdr_buf + size;
2260
2261 /* verify the buffer has atleast len bytes remaining */
2262 if (ncbi->hdr_ptr + len > limit) {
2263 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting %d bytes at %d : 0x%X(%d)\n",
2264 len, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2265 }
2266
2267 /* check the buffer matches the token string */
2268 c = (unsigned char *) str;
2269 while (len--) {
2270 if (*ncbi->hdr_ptr != *c) {
2271 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting 0x%X found 0x%X at %d : 0x%X(%d)\n",
2272 *ncbi->hdr_ptr, *c, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2273 }
2274 ncbi->hdr_ptr++;
2275 c++;
2276 }
2277
2278 return eslOK;
2279 }
2280
2281 /* Function: parse_accept()
2282 * Synopsis: Check if the next bytes to parse match
2283 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2284 *
2285 * Purpose: Check if the next <len> bytes to parse match the bytes
2286 * in <str>. If the bytes match, they are consumed and the
2287 * parsers pointer is advanced.
2288 *
2289 * Returns: <eslOK> on success
2290 * <eslEFORMAT> if the bytes to not match.
2291 */
2292 static int
parse_accept(ESL_SQNCBI_DATA * ncbi,void * str,int len)2293 parse_accept(ESL_SQNCBI_DATA *ncbi, void *str, int len)
2294 {
2295 int i;
2296 int size;
2297 unsigned char *c;
2298 unsigned char *limit;
2299
2300 size = ncbi->hoff - ncbi->roff;
2301 limit = ncbi->hdr_buf + size;
2302
2303 /* check the buffer matches the token string */
2304 if (ncbi->hdr_ptr + len > limit) return eslEFORMAT;
2305
2306 /* verify the buffer matches the token string without advancing
2307 * the buffer pointers until we have a complete match.
2308 */
2309 c = (unsigned char *) str;
2310 for (i = 0; i < len; ++i) {
2311 if (ncbi->hdr_ptr[i] != c[i]) return eslEFORMAT;
2312 }
2313 ncbi->hdr_ptr += len;
2314
2315 return eslOK;
2316 }
2317
2318 /* Function: parse_peek()
2319 * Synopsis: Peek at the next byte to parse
2320 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2321 *
2322 * Purpose: Return the next characer to be parsed without advancing the
2323 * parsers pointer.
2324 *
2325 * Returns: <eslOK> on success
2326 * <eslEFORMAT> if there are insufficient bytes remaining
2327 * in the header.
2328 */
2329 static int
parse_peek(ESL_SQNCBI_DATA * ncbi,unsigned char * c)2330 parse_peek(ESL_SQNCBI_DATA *ncbi, unsigned char *c)
2331 {
2332 int size;
2333 unsigned char *limit;
2334
2335 size = ncbi->hoff - ncbi->roff;
2336 limit = ncbi->hdr_buf + size;
2337
2338 /* verify the buffer has atleast len bytes remaining */
2339 if (ncbi->hdr_ptr + 1 > limit) return eslEFORMAT;
2340
2341 *c = *ncbi->hdr_ptr;
2342
2343 return eslOK;
2344 }
2345
2346 /* Function: parse_consume()
2347 * Synopsis: Copies bytes from the header
2348 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2349 *
2350 * Purpose: Copies <len> bytes from the header to the buffer supplied by
2351 * <str> if non-null. Adcance the parser pointer.
2352 *
2353 * Returns: <eslOK> on success
2354 *
2355 * Throws: <eslEFORMAT> if there are insufficient bytes remaining
2356 * in the header.
2357 */
2358 static int
parse_consume(ESL_SQNCBI_DATA * ncbi,void * str,int len)2359 parse_consume(ESL_SQNCBI_DATA *ncbi, void *str, int len)
2360 {
2361 int i;
2362 int size;
2363 unsigned char *c;
2364 unsigned char *limit;
2365
2366 size = ncbi->hoff - ncbi->roff;
2367 limit = ncbi->hdr_buf + size;
2368
2369 /* verify the buffer has atleast len bytes remaining */
2370 if (ncbi->hdr_ptr + len > limit) {
2371 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting %d bytes at %d : 0x%X(%d)\n",
2372 len, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2373 }
2374
2375 /* copy the characters in the buffer to <str> */
2376 c = (unsigned char *) str;
2377 for (i = 0; i < len; ++i) {
2378 if (c != NULL) *c++ = *ncbi->hdr_ptr++;
2379 }
2380
2381 return eslOK;
2382 }
2383
2384 /* Function: parse_advance()
2385 * Synopsis: Advance the parser pointer
2386 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2387 *
2388 * Purpose: Advance the parser pointer <len> bytes.
2389 *
2390 * Returns: <eslOK> on success
2391 *
2392 * Throws: <eslEFORMAT> if there are insufficient bytes remaining
2393 * in the header.
2394 */
2395 static int
parse_advance(ESL_SQNCBI_DATA * ncbi,int len)2396 parse_advance(ESL_SQNCBI_DATA *ncbi, int len)
2397 {
2398 int size;
2399 unsigned char *limit;
2400
2401 size = ncbi->hoff - ncbi->roff;
2402 limit = ncbi->hdr_buf + size;
2403
2404 /* verify the buffer has atleast len bytes remaining */
2405 if (ncbi->hdr_ptr + len > limit) {
2406 ESL_FAIL(eslEFORMAT, ncbi->errbuf, "Expecting %d bytes at %d : 0x%X(%d)\n",
2407 len, (uint32_t) (ncbi->hdr_ptr - ncbi->hdr_buf), ncbi->roff, size);
2408 }
2409
2410 ncbi->hdr_ptr += len;
2411
2412 return eslOK;
2413 }
2414
2415 /* Function: parse_header()
2416 * Synopsis: Parse the ncbi db header
2417 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2418 *
2419 * Purpose: Parse a ncbi database header. This routine implements
2420 * a recursive descent parser for the ASN.1 definition of
2421 * a blast database header filling in <sq>.
2422 *
2423 * The blast db header can have multiple definitions defined
2424 * within it. Only the information from the first usable
2425 * defition will be used.
2426 *
2427 * Returns: <eslOK> on success; the new sequence is stored in <s>.
2428 *
2429 * Returns <eslEFORMAT> if there's a problem with the format,
2430 * such as an illegal character.
2431 */
2432 static int
parse_header(ESL_SQNCBI_DATA * ncbi,ESL_SQ * sq)2433 parse_header(ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq)
2434 {
2435 int size;
2436 int status;
2437 void *tmp;
2438
2439 unsigned char c;
2440
2441 reset_header_values(ncbi);
2442 size = ncbi->hoff - ncbi->roff;
2443
2444 /* read in the header data */
2445 if (ncbi->hdr_alloced < size) {
2446 while (ncbi->hdr_alloced < size) ncbi->hdr_alloced += ncbi->hdr_alloced;
2447 ESL_RALLOC(ncbi->hdr_buf, tmp, sizeof(char) * ncbi->hdr_alloced);
2448 }
2449 if (fread(ncbi->hdr_buf, sizeof(char), size, ncbi->fpphr) != size) return eslEFORMAT;
2450 ncbi->hdr_ptr = ncbi->hdr_buf;
2451
2452 /* verify we are at the beginning of a structure */
2453 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2454
2455 if (parse_peek(ncbi, &c) != eslOK) return eslEFORMAT;
2456
2457 /* parse the different seq id structures */
2458 while (c != 0x00) {
2459 if ((status = parse_def_line(ncbi, sq)) != eslOK) return status;
2460
2461 if (parse_peek(ncbi, &c) != eslOK) return eslEFORMAT;
2462 }
2463
2464 /* verify we are at the end of the structure */
2465 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2466
2467 return eslOK;
2468 ERROR:
2469 return eslEMEM;
2470 }
2471
2472 /* Function: parse_def_line()
2473 * Synopsis: Parse the Blast-def-line definition
2474 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2475 *
2476 * Blast-def-line ::= SEQUENCE {
2477 * title VisibleString OPTIONAL, -- simple title
2478 * seqid SEQUENCE OF Seq-id, -- Regular NCBI Seq-Id
2479 * taxid INTEGER OPTIONAL, -- taxonomy id
2480 * memberships SEQUENCE OF INTEGER OPTIONAL, -- bit arrays
2481 * links SEQUENCE OF INTEGER OPTIONAL, -- bit arrays
2482 * other-info SEQUENCE OF INTEGER OPTIONAL -- for future use (probably genomic sequences)
2483 * }
2484 */
2485 static int
parse_def_line(ESL_SQNCBI_DATA * ncbi,ESL_SQ * sq)2486 parse_def_line(ESL_SQNCBI_DATA *ncbi, ESL_SQ *sq)
2487 {
2488 int status;
2489
2490 int i;
2491 int len = 0;
2492 int taxid = -1;
2493 char *title = NULL;
2494
2495 /* verify we are at the beginning of a structure */
2496 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2497
2498 /* look for an optional title */
2499 sq->desc[0] = 0;
2500 if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2501 if ((status = parse_string(ncbi, &title, &len)) != eslOK) return status;
2502 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2503 }
2504
2505 /* look for sequence id structure */
2506 if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK) return eslEFORMAT;
2507 if ((status = parse_seq_id(ncbi)) != eslOK) return status;
2508 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2509
2510 /* look for an optional taxonomy id */
2511 sq->tax_id = -1;
2512 if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
2513 if ((status = parse_integer(ncbi, &taxid)) != eslOK) return status;
2514 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2515 }
2516
2517 /* look for an optional memberships */
2518 if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
2519 if ((status = ignore_sequence_of_integer(ncbi)) != eslOK) return status;
2520 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2521 }
2522
2523 /* look for an optional links */
2524 if (parse_accept(ncbi, "\xa4\x80", 2) == eslOK) {
2525 if ((status = ignore_sequence_of_integer(ncbi)) != eslOK) return status;
2526 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2527 }
2528
2529 /* look for an optional other info */
2530 if (parse_accept(ncbi, "\xa5\x80", 2) == eslOK) {
2531 if ((status = ignore_sequence_of_integer(ncbi)) != eslOK) return status;
2532 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2533 }
2534
2535 /* verify we are at the end of the structure */
2536 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2537
2538 /* zero terminate any saved string */
2539 if (ncbi->name_ptr != NULL) ncbi->name_ptr[ncbi->name_size] = '\0';
2540 if (ncbi->acc_ptr != NULL) ncbi->acc_ptr[ncbi->acc_size] = '\0';
2541 if (ncbi->str_id_ptr != NULL) ncbi->str_id_ptr[ncbi->str_id_size] = '\0';
2542
2543 if (title != NULL) title[len] = '\0';
2544
2545 if (ncbi->name_ptr != NULL || ncbi->acc_ptr != NULL) {
2546 /* if we have both a name and accession, set both fields.
2547 * if we have only one, set the name to that one field.
2548 */
2549 if (ncbi->name_ptr != NULL) {
2550 esl_sq_SetName(sq, ncbi->name_ptr);
2551 if (ncbi->acc_ptr != NULL) {
2552 esl_sq_SetAccession(sq, ncbi->acc_ptr);
2553 }
2554 } else {
2555 esl_sq_SetName(sq, ncbi->acc_ptr);
2556 }
2557 if (title != NULL) esl_sq_SetDesc(sq, title);
2558 } else if (ncbi->str_id_ptr != NULL || ncbi->int_id != -1) {
2559 /* since we don't have a name or accession, use the id
2560 * as the name.
2561 */
2562 if (ncbi->str_id_ptr != NULL) {
2563 esl_sq_SetName(sq, ncbi->str_id_ptr);
2564 } else {
2565 char id[32];
2566 sprintf(id, "%d", ncbi->int_id);
2567 esl_sq_SetName(sq, id);
2568 }
2569 if (title != NULL) esl_sq_SetDesc(sq, title);
2570 } else if (title != NULL) {
2571 /* lastly we don't have anything, so lets just use the
2572 * title. take the first word of the title and use that
2573 * for the name. the remaining portion of the title will
2574 * be used for the description.
2575 */
2576 for (i = 0; i < len; ++i) {
2577 if (isspace(title[i])) {
2578 title[i] = '\0';
2579 break;
2580 }
2581 }
2582 esl_sq_SetName(sq, title);
2583 ++i;
2584
2585 /* skip over multiple spaces till the next word */
2586 for ( ; i < len; ++i) {
2587 if (!isspace(title[i])) break;
2588 }
2589 if (i < len) esl_sq_SetDesc(sq, title + i);
2590 }
2591
2592 if (taxid != -1) sq->tax_id = taxid;
2593
2594 return eslOK;
2595 }
2596
2597
2598 /* Function: parse_seq_id()
2599 * Synopsis: Parse the Blast-def-line definition
2600 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2601 *
2602 * Seq-id ::= CHOICE {
2603 * local Object-id , -- local use
2604 * gibbsq INTEGER , -- Geninfo backbone seqid
2605 * gibbmt INTEGER , -- Geninfo backbone moltype
2606 * giim Giimport-id , -- Geninfo import id
2607 * genbank Textseq-id ,
2608 * embl Textseq-id ,
2609 * pir Textseq-id ,
2610 * swissprot Textseq-id ,
2611 * patent Patent-seq-id ,
2612 * other Textseq-id , -- for historical reasons, 'other' = 'refseq'
2613 * general Dbtag , -- for other databases
2614 * gi INTEGER , -- GenInfo Integrated Database
2615 * ddbj Textseq-id , -- DDBJ
2616 * prf Textseq-id , -- PRF SEQDB
2617 * pdb PDB-seq-id , -- PDB sequence
2618 * tpg Textseq-id , -- Third Party Annot/Seq GenBank
2619 * tpe Textseq-id , -- Third Party Annot/Seq EMBL
2620 * tpd Textseq-id , -- Third Party Annot/Seq DDBJ
2621 * gpipe Textseq-id , -- Internal NCBI genome pipeline processing ID
2622 * named-annot-track Textseq-id -- Internal named annotation tracking ID
2623 * }
2624 */
2625 static int
parse_seq_id(ESL_SQNCBI_DATA * ncbi)2626 parse_seq_id(ESL_SQNCBI_DATA *ncbi)
2627 {
2628 int status;
2629 int *id_ptr;
2630
2631 unsigned char c;
2632
2633 /* verify we are at the beginning of a structure */
2634 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2635
2636 if (parse_consume(ncbi, &c, 1) != eslOK) return eslEFORMAT;
2637
2638 /* parse the different seq id structures */
2639 while (c != 0x00) {
2640 if (parse_expect(ncbi, "\x80", 1) != eslOK) return eslEFORMAT;
2641 switch (c) {
2642 case 0xa0: /* LOCAL */
2643 status = parse_object_id(ncbi);
2644 break;
2645 case 0xa1: /* GIBBSQ */
2646 id_ptr = (ncbi->int_id != -1) ? NULL : &ncbi->int_id;
2647 status = parse_integer(ncbi, id_ptr);
2648 break;
2649 case 0xa2: /* GIBBMT */
2650 status = parse_integer(ncbi, NULL);
2651 break;
2652 case 0xa3: /* GIIM */
2653 status = parse_giimport_id(ncbi);
2654 break;
2655 case 0xa4: /* GENBANK */
2656 case 0xa5: /* EMBL */
2657 case 0xa6: /* PIR */
2658 case 0xa7: /* SWISSPROT */
2659 status = parse_textseq_id(ncbi);
2660 break;
2661 case 0xa8: /* PATENT */
2662 status = parse_patent_seq_id(ncbi);
2663 break;
2664 case 0xa9: /* OTHER */
2665 status = parse_textseq_id(ncbi);
2666 break;
2667 case 0xaa: /* GENERAL */
2668 status = parse_dbtag(ncbi);
2669 break;
2670 case 0xab: /* GI */
2671 status = parse_integer(ncbi, NULL);
2672 break;
2673 case 0xac: /* DDBJ */
2674 case 0xad: /* PRF */
2675 status = parse_textseq_id(ncbi);
2676 break;
2677 case 0xae: /* PDB */
2678 status = parse_pdb_seq_id(ncbi);
2679 break;
2680 case 0xaf: /* TPG */
2681 case 0xb0: /* TPE */
2682 case 0xb1: /* TPD */
2683 case 0xb2: /* GPIPE */
2684 case 0xb3: /* NAMED ANNOT TRACK */
2685 status = parse_textseq_id(ncbi);
2686 break;
2687 default:
2688 status = eslEFORMAT;
2689 }
2690
2691 if (status != eslOK) return status;
2692 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2693 if (parse_consume(ncbi, &c, 1) != eslOK) return eslEFORMAT;
2694 }
2695
2696 /* verify we are at the end of the structure */
2697 if (c != 0x00 || parse_expect(ncbi, "\x00", 1) != eslOK) return eslEFORMAT;
2698
2699 return eslOK;
2700 }
2701
2702
2703 /* Function: parse_textseq_id()
2704 * Synopsis: Parse the general text header
2705 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2706 *
2707 * Textseq-id ::= SEQUENCE {
2708 * name VisibleString OPTIONAL ,
2709 * accession VisibleString OPTIONAL ,
2710 * release VisibleString OPTIONAL ,
2711 * version INTEGER OPTIONAL
2712 * }
2713 */
2714 static int
parse_textseq_id(ESL_SQNCBI_DATA * ncbi)2715 parse_textseq_id(ESL_SQNCBI_DATA *ncbi)
2716 {
2717 char *acc = NULL;
2718 int alen = 0;
2719 char *name = NULL;
2720 int nlen = 0;
2721
2722 int status;
2723
2724 /* verify we are at the beginning of a structure */
2725 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2726
2727 /* look for an optional name */
2728 if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2729 if ((status = parse_string(ncbi, &name, &nlen)) != eslOK) return status;
2730 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2731 }
2732
2733 /* look for an optional accession */
2734 if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2735 if ((status = parse_string(ncbi, &acc, &alen)) != eslOK) return status;
2736 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2737 }
2738
2739 /* look for an optional release */
2740 if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
2741 if ((status = parse_string(ncbi, NULL, NULL)) != eslOK) return status;
2742 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2743 }
2744
2745 /* look for an optional version */
2746 if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
2747 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
2748 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2749 }
2750
2751 /* verify we are at the end of the structure */
2752 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2753
2754 /* if we found both the accession and name and so far
2755 * we have only come across incomplete headers, save
2756 * this one off.
2757 */
2758 if (acc != NULL && name != NULL) {
2759 if (ncbi->name_ptr == NULL || ncbi->acc_ptr == NULL) {
2760 ncbi->name_ptr = name;
2761 ncbi->name_size = nlen;
2762 ncbi->acc_ptr = acc;
2763 ncbi->acc_size = alen;
2764 }
2765
2766 } else if (ncbi->name_ptr == NULL && ncbi->acc_ptr == NULL) {
2767 /* if neither the accession or name have been set, and the
2768 * header supplied one, save it off.
2769 */
2770 if (acc != NULL) {
2771 ncbi->acc_ptr = acc;
2772 ncbi->acc_size = alen;
2773 }
2774 if (name != NULL) {
2775 ncbi->name_ptr = name;
2776 ncbi->name_size = nlen;
2777 }
2778 }
2779
2780 return eslOK;
2781 }
2782
2783
2784 /* Function: parse_dbtag()
2785 * Synopsis: Parse the a general db tag
2786 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2787 *
2788 * Dbtag ::= SEQUENCE {
2789 * db VisibleString , -- name of database or system
2790 * tag Object-id -- appropriate tag
2791 * }
2792 */
2793 static int
parse_dbtag(ESL_SQNCBI_DATA * ncbi)2794 parse_dbtag(ESL_SQNCBI_DATA *ncbi)
2795 {
2796 int status;
2797 int temp_id;
2798
2799 /* verify we are at the beginning of a structure */
2800 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2801
2802 /* look for an db name */
2803 if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK) return eslEFORMAT;
2804 if ((status = parse_string(ncbi, 0, NULL)) != eslOK) return status;
2805
2806 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2807
2808 /* it looks like the dbtag is used when formatdb is run
2809 * without parsing sequence ids (ie -o F). if that is
2810 * the case, the id is equal to the sequence number in
2811 * the database. so for dbtag headers, nothing will be
2812 * saved. to do this lets create a bogus id value and
2813 * restore it after dbtag is parsed.
2814 */
2815 temp_id = ncbi->int_id;
2816 ncbi->int_id = 1;
2817
2818 /* look for a tag object */
2819 if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK) return eslEFORMAT;
2820 if ((status = parse_object_id(ncbi)) != eslOK) return status;
2821 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2822
2823 /* restore the id value */
2824 ncbi->int_id = temp_id;
2825
2826 /* verify we are at the end of the structure */
2827 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2828
2829 return eslOK;
2830 }
2831
2832
2833 /* Function: parse_giimport_id()
2834 * Synopsis: Parse the a giimport id
2835 * Incept: MSF, Thu Mar 25, 2010 [Janelia]
2836 *
2837 * Giimport-id ::= SEQUENCE {
2838 * id INTEGER, -- the id to use here
2839 * db VisibleString OPTIONAL, -- dbase used in
2840 * release VisibleString OPTIONAL } -- the release
2841 * }
2842 */
2843 static int
parse_giimport_id(ESL_SQNCBI_DATA * ncbi)2844 parse_giimport_id(ESL_SQNCBI_DATA *ncbi)
2845 {
2846 int status;
2847 int id;
2848
2849 /* verify we are at the beginning of a structure */
2850 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2851
2852 /* look for an id */
2853 if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK) return eslEFORMAT;
2854 if ((status = parse_integer(ncbi, &id)) != eslOK) return status;
2855
2856 /* look for an optional database */
2857 if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2858 if ((status = parse_string(ncbi, NULL, NULL)) != eslOK) return status;
2859 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2860 }
2861
2862 /* look for an optional release */
2863 if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
2864 if ((status = parse_string(ncbi, NULL, NULL)) != eslOK) return status;
2865 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2866 }
2867
2868 /* verify we are at the end of the structure */
2869 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2870
2871 /* if there is not already a saved seq id, save it */
2872 if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
2873 ncbi->int_id = id;
2874 }
2875
2876 return eslOK;
2877 }
2878
2879
2880 /* Function: parse_patent_seq_id()
2881 * Synopsis: Parse the patent header
2882 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2883 *
2884 * Patent-seq-id ::= SEQUENCE {
2885 * seqid INTEGER , -- number of sequence in patent
2886 * cit Id-pat -- patent citation
2887 * }
2888 */
2889 static int
parse_patent_seq_id(ESL_SQNCBI_DATA * ncbi)2890 parse_patent_seq_id(ESL_SQNCBI_DATA *ncbi)
2891 {
2892 int status;
2893 int id;
2894
2895 /* verify we are at the beginning of a structure */
2896 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2897
2898 /* look for a seqid */
2899 if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK) return eslEFORMAT;
2900 if ((status = parse_integer(ncbi, &id)) != eslOK) return status;
2901
2902 /* look for a patent citation object */
2903 if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK) return eslEFORMAT;
2904 if ((status = parse_id_pat(ncbi)) != eslOK) return status;
2905
2906 /* verify we are at the end of the structure */
2907 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2908
2909 /* if there is not already a saved seq id, save it */
2910 if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
2911 ncbi->int_id = id;
2912 }
2913
2914 return eslOK;
2915 }
2916
2917
2918 /* Function: parse_id_pat()
2919 * Synopsis: Parse the patent citation
2920 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2921 *
2922 * Id-pat ::= SEQUENCE { -- just to identify a patent
2923 * country VisibleString , -- Patent Document Country
2924 * id CHOICE {
2925 * number VisibleString , -- Patent Document Number
2926 * app-number VisibleString -- Patent Doc Appl Number
2927 * } ,
2928 * doc-type VisibleString OPTIONAL -- Patent Doc Type
2929 * }
2930 */
2931 static int
parse_id_pat(ESL_SQNCBI_DATA * ncbi)2932 parse_id_pat(ESL_SQNCBI_DATA *ncbi)
2933 {
2934 int status;
2935
2936 /* verify we are at the beginning of a structure */
2937 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2938
2939 /* look for a country */
2940 if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK) return eslEFORMAT;
2941 if ((status = parse_string(ncbi, NULL, NULL)) != eslOK) return status;
2942
2943 /* look for an id */
2944 if (parse_expect(ncbi, "\xa1\x80", 2) != eslOK) return eslEFORMAT;
2945
2946 /* the id is a choice of two strings */
2947
2948 /* verify we are at the beginning of a structure */
2949 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
2950
2951 /* look for an optional taxonomy id */
2952 if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2953 status = parse_string(ncbi, NULL, NULL);
2954 } else if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2955 status = parse_string(ncbi, NULL, NULL);
2956 } else {
2957 status = eslEFORMAT;
2958 }
2959 if (status != eslOK) return status;
2960
2961 /* verify we are at the end of the structure */
2962 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2963
2964 /* look for a doc type */
2965 if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
2966 if ((status = parse_string(ncbi, NULL, NULL)) != eslOK) return eslEFORMAT;
2967 }
2968
2969 /* verify we are at the end of the structure */
2970 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
2971
2972 return eslOK;
2973 }
2974
2975
2976 /* Function: parse_object_id()
2977 * Synopsis: Parse a generic sequence id
2978 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
2979 *
2980 * Object-id ::= CHOICE {
2981 * id INTEGER ,
2982 * str VisibleString
2983 * }
2984 */
2985 static int
parse_object_id(ESL_SQNCBI_DATA * ncbi)2986 parse_object_id(ESL_SQNCBI_DATA *ncbi)
2987 {
2988 int status;
2989
2990 char *id_str = NULL;
2991 int id_len = 0;
2992 int id = -1;
2993
2994 /* look for an optional taxonomy id */
2995 if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
2996 status = parse_integer(ncbi, &id);
2997 } else if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
2998 status = parse_string(ncbi, &id_str, &id_len);
2999 } else {
3000 status = eslEFORMAT;
3001 }
3002
3003 /* verify we are at the end of the structure */
3004 if (status == eslOK) {
3005 status = parse_expect(ncbi, "\x00\x00", 2);
3006
3007 /* if there is not already a saved seq id, save it */
3008 if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
3009 if (id_str != NULL) {
3010 ncbi->str_id_ptr = id_str;
3011 ncbi->str_id_size = id_len;
3012 } else if (id != -1) {
3013 ncbi->int_id = id;
3014 }
3015 }
3016 }
3017
3018 return status;
3019 }
3020
3021
3022 /* Function: parse_pdb_seq_id()
3023 * Synopsis: Parse a PDB sequence
3024 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
3025 *
3026 * PDB-seq-id ::= SEQUENCE {
3027 * mol PDB-mol-id , -- the molecule name
3028 * chain INTEGER , -- a single ASCII character, chain id
3029 * rel Date OPTIONAL } -- release date, month and year
3030 *
3031 * Date ::= CHOICE {
3032 * str VisibleString , -- for those unparsed dates
3033 * std Date-std -- use this if you can
3034 * }
3035 */
3036 static int
parse_pdb_seq_id(ESL_SQNCBI_DATA * ncbi)3037 parse_pdb_seq_id(ESL_SQNCBI_DATA *ncbi)
3038 {
3039 int status;
3040
3041 char *id;
3042 int len;
3043
3044 /* verify we are at the beginning of a structure */
3045 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
3046
3047 /* look for an pdb mol id */
3048 if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK) return eslEFORMAT;
3049 if ((status = parse_string(ncbi, &id, &len)) != eslOK) return status;
3050 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3051
3052 /* look for chain */
3053 if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
3054 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3055 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3056 }
3057
3058 /* look for an optional date */
3059 if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
3060 if (parse_accept(ncbi, "\xa0\x80", 2) == eslOK) {
3061 status = parse_string(ncbi, NULL, NULL);
3062 } else if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
3063 status = parse_date_std(ncbi);
3064 } else {
3065 status = eslEFORMAT;
3066 }
3067 if (status != eslOK) return status;
3068 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3069 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3070 }
3071
3072 /* if there is not already a saved seq id, save it */
3073 if (ncbi->int_id == -1 && ncbi->str_id_ptr == NULL) {
3074 ncbi->str_id_ptr = id;
3075 ncbi->str_id_size = len;
3076 }
3077
3078 /* verify we are at the end of the structure */
3079 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3080
3081 return eslOK;
3082 }
3083
3084
3085 /* Function: parse_date_std()
3086 * Synopsis: Parse the data structure
3087 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
3088 *
3089 * Date-std ::= SEQUENCE { -- NOTE: this is NOT a unix tm struct
3090 * year INTEGER , -- full year (including 1900)
3091 * month INTEGER OPTIONAL , -- month (1-12)
3092 * day INTEGER OPTIONAL , -- day of month (1-31)
3093 * season VisibleString OPTIONAL , -- for "spring", "may-june", etc
3094 * hour INTEGER OPTIONAL , -- hour of day (0-23)
3095 * minute INTEGER OPTIONAL , -- minute of hour (0-59)
3096 * second INTEGER OPTIONAL -- second of minute (0-59)
3097 * }
3098 */
3099 static int
parse_date_std(ESL_SQNCBI_DATA * ncbi)3100 parse_date_std(ESL_SQNCBI_DATA *ncbi)
3101 {
3102 int status;
3103
3104 /* verify we are at the beginning of a structure */
3105 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
3106
3107 /* look for a year */
3108 if (parse_expect(ncbi, "\xa0\x80", 2) != eslOK) return eslEFORMAT;
3109 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3110 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3111
3112 /* look for an optional month */
3113 if (parse_accept(ncbi, "\xa1\x80", 2) == eslOK) {
3114 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3115 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3116 }
3117
3118 /* look for an optional day */
3119 if (parse_accept(ncbi, "\xa2\x80", 2) == eslOK) {
3120 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3121 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3122 }
3123
3124 /* look for an optional season */
3125 if (parse_accept(ncbi, "\xa3\x80", 2) == eslOK) {
3126 if ((status = parse_string(ncbi, NULL, NULL)) != eslOK) return status;
3127 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3128 }
3129
3130 /* look for an optional hour */
3131 if (parse_accept(ncbi, "\xa4\x80", 2) == eslOK) {
3132 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3133 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3134 }
3135
3136 /* look for an optional minute */
3137 if (parse_accept(ncbi, "\xa5\x80", 2) == eslOK) {
3138 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3139 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3140 }
3141
3142 /* look for an optional second */
3143 if (parse_accept(ncbi, "\xa6\x80", 2) == eslOK) {
3144 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3145 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3146 }
3147
3148 /* verify we are at the end of the structure */
3149 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3150
3151 return eslOK;
3152 }
3153
3154
3155 /* Function: parse_string()
3156 * Synopsis: Parse a visible string
3157 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
3158 *
3159 * Purpose: Parses a string from the header stream.
3160 *
3161 * If <str> is non null, the location of the string in
3162 * the header will be saved. If <len> is non null, the
3163 * length of the string will be filled in. If <str> is
3164 * non null, then <len> must be non null since the strings
3165 * are not zero terminated.
3166 *
3167 * Returns: <eslOK> on success.
3168 * <eslEFORMAT> if there's a problem with the format.
3169 * <eslEINCOMPAT> if <str> is non null and <len> is null.
3170 *
3171 */
3172 static int
parse_string(ESL_SQNCBI_DATA * ncbi,char ** str,int * len)3173 parse_string(ESL_SQNCBI_DATA *ncbi, char **str, int *len)
3174 {
3175 int n;
3176
3177 unsigned char x;
3178 unsigned char c;
3179 unsigned char *ptr;
3180
3181 if (parse_expect(ncbi, "\x1a", 1) != eslOK) return eslEFORMAT;
3182
3183 /* the next byte is the length of the string. if the length is
3184 * less than 128, then this is the true length; otherwise this
3185 * length describes the number of bytes necessary to hold the
3186 * true length of the string in the lower 7 bits.
3187 */
3188 if (parse_consume(ncbi, &c, 1) != eslOK) return eslEFORMAT;
3189 if (c < 128) {
3190 n = c;
3191 } else {
3192 c = c & 0x7f;
3193 if (c > sizeof(n)) return eslEFORMAT;
3194
3195 n = 0;
3196 while (c > 0) {
3197 if (parse_consume(ncbi, &x, 1) != eslOK) return eslEFORMAT;
3198 n = (n << 8) + (unsigned int) x;
3199 --c;
3200 }
3201 }
3202
3203 /* validate the length of the string */
3204 ptr = ncbi->hdr_ptr;
3205 if (parse_advance(ncbi, n) != eslOK) return eslEFORMAT;
3206
3207 /* fill in the values */
3208 if (str != NULL && len == NULL) return eslEINCOMPAT;
3209 if (len != NULL) *len = n;
3210 if (str != NULL) *str = (char *)ptr;
3211
3212 return eslOK;
3213 }
3214
3215
3216 /* Function: parse_integer()
3217 * Synopsis: Parse an integer
3218 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
3219 *
3220 * Purpose: Reads an integer from the header stream. If the integer is
3221 * more bytes than the native int format, the most significant
3222 * bytes will be lost.
3223 *
3224 * If <value> is non null, the parsed integer will be placed
3225 * in the pointer.
3226 *
3227 * Returns: <eslOK> on success.
3228 * <eslEFORMAT> if there's a problem with the format.
3229 *
3230 */
3231 static int
parse_integer(ESL_SQNCBI_DATA * ncbi,int * value)3232 parse_integer(ESL_SQNCBI_DATA *ncbi, int *value)
3233 {
3234 int n;
3235
3236 unsigned char c;
3237 unsigned char *ptr;
3238
3239 if (parse_expect(ncbi, "\x02", 1) != eslOK) return eslEFORMAT;
3240
3241 /* get the length of the integer */
3242 if (parse_peek(ncbi, &c) != eslOK) return eslEFORMAT;
3243 ptr = ncbi->hdr_ptr + 1;
3244
3245 /* advance past the integer to make sure the buffer holds all
3246 * of the integer. the pointer <ptr> points the the start of
3247 * the integer.
3248 */
3249 if (parse_advance(ncbi, c + 1) != eslOK) return eslEFORMAT;
3250
3251 n = 0;
3252 while (c > 0) {
3253 n = (n << 8) + (unsigned int) *ptr++;
3254 --c;
3255 }
3256
3257 if (value != NULL) *value = n;
3258
3259 return eslOK;
3260 }
3261
3262
3263 /* Function: ignore_sequence_of_integer()
3264 * Synopsis: Skip over the sequence of integers
3265 * Incept: MSF, Mon Dec 10, 2009 [Janelia]
3266 *
3267 * Purpose: Skip over a sequence of integers.
3268 *
3269 * Returns: <eslOK> on success.
3270 * <eslEFORMAT> if there's a problem with the format.
3271 *
3272 */
3273 static int
ignore_sequence_of_integer(ESL_SQNCBI_DATA * ncbi)3274 ignore_sequence_of_integer(ESL_SQNCBI_DATA *ncbi)
3275 {
3276 int status;
3277 unsigned char c;
3278
3279 /* verify we are at the beginning of a structure */
3280 if (parse_expect(ncbi, "\x30\x80", 2) != eslOK) return eslEFORMAT;
3281
3282 if (parse_peek(ncbi, &c) != eslOK) return eslEFORMAT;
3283 while (c == 0x02) {
3284 if ((status = parse_integer(ncbi, NULL)) != eslOK) return status;
3285 if (parse_peek(ncbi, &c) != eslOK) return eslEFORMAT;
3286 }
3287
3288 /* verify we are at the end of the structure */
3289 if (parse_expect(ncbi, "\x00\x00", 2) != eslOK) return eslEFORMAT;
3290
3291 return eslOK;
3292 }
3293