1 /* ncbl2_lib.c functions to read ncbi-blast format files from
2 formatdb (blast2.0 format files)
3 */
4
5 /* copyright (c) 2006, 2014 by William R. Pearson and
6 The Rector and Visitors of the University of Virginia */
7
8 /* Licensed under the Apache License, Version 2.0 (the "License");
9 you may not use this file except in compliance with the License.
10 You may obtain a copy of the License at
11
12 http://www.apache.org/licenses/LICENSE-2.0
13
14 Unless required by applicable law or agreed to in writing,
15 software distributed under this License is distributed on an "AS
16 IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
17 express or implied. See the License for the specific language
18 governing permissions and limitations under the License.
19 */
20
21 /* Updated 22-Aug-2014 to include
22
23 ambiguity-decoding code from
24
25 Ralf Jost, Dipl.-Inform.
26 Director, Technical Bioinformatics
27 Biomax Informatics AG
28 ralf.jost@biomax.com
29
30 using code from NCBI Blast distribution
31 */
32
33 /* $Name: $ - $Id: ncbl2_mlib.c 1291 2014-08-28 18:32:58Z wrp $ */
34
35 /* to turn on mmap()ing for Blast2 files: */
36
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <limits.h>
43 #include <fcntl.h>
44 #ifdef UNIX
45 #include <unistd.h>
46 #endif
47 #include <errno.h>
48
49
50 /* ****************************************************************
51
52 17-May-2006
53
54 Modified to read NCBI .[np]al and .msk files. The .nal or .pal file
55 provides a way to read sequences from a list of files. The .msk file
56 provides a compact way of indicating the subset of sequences in a
57 larger database (typically nr or nt) that comprise a smaller database
58 (e.g. swissprot or pdbaa). A .pal file (e.g. swissprot.00.pal) that
59 uses a .msk file has the form:
60
61 # Alias file generated by genmask
62 # Date created: Mon Apr 10 11:24:05 2006
63 #
64 TITLE Non-redundant SwissProt sequences
65 DBLIST nr.00
66 OIDLIST swissprot.00.msk
67 LENGTH 74351250
68 NSEQ 198346
69 MAXOID 2617347
70 MEMB_BIT 1
71 # end of the file
72
73 To work with this file, we must first load the nr.00 file, and then
74 read the swissprot.00.msk file, and then scan all the entries in the
75 swissprot.00.msk file (which are packed 32 mask-bit to an int) to
76 determine whether a specific libpos index entry is present in the
77 subset database.
78
79 **************************************************************** */
80
81
82 /* ****************************************************************
83 This code reads NCBI Blast2 format databases from formatdb version 3 and 4
84
85 (From NCBI) This section describes the format of the databases.
86
87 Formatdb creates three main files for proteins containing indices,
88 sequences, and headers with the extensions, respectively, of pin, psq,
89 and phr (for nucleotides these are nin, nsq, and nhr). A number of
90 other ISAM indices are created, but these are described elsewhere.
91
92 FORMAT OF THE INDEX FILE
93 ------------------------
94
95 1.) formatdb version number [4 bytes].
96
97 2.) protein dump flag (1 for a protein database, 0 for a nucleotide
98 database) [4 bytes].
99
100 3.) length of the database title in bytes [4 bytes].
101 4.) the database title [length given in 3.)].
102 5.) length of the date/time string [4 bytes].
103 6.) the date/time string [length given in 5.)].
104 7.) the number of sequences in the database [4 bytes].
105 8.) the total length of the database in residues/basepairs [4 bytes].
106 9.) the length of the longest sequence in the database [4 bytes].
107
108 10.) a list of the offsets for definitions (one for each sequence) in
109 the header file. There are num_of_seq+1 of these, where num_of_seq is
110 the number of sequences given in 7.).
111
112 11.) a list of the offsets for sequences (one for each sequence) in
113 the sequence file. There are num_of_seq+1 of these, where num_of_seq
114 is the number of sequences given in 7.).
115
116 12.) a list of the offsets for the ambiguity characters (one for each
117 sequence) in the sequence file. This list is only present for
118 nucleotide databases and, since the database is compressed 4/1 for
119 nucleotides, allows the ambiguity characters to be restored when the
120 sequence is generated. There are num_of_seq+1 of these, where
121 num_of_seq is the number of sequences given in 7.).
122
123
124 FORMAT OF THE SEQUENCE FILE
125 ---------------------------
126
127 There are different formats for the protein and nucleotide sequence files.
128
129 The protein sequence files is quite simple. The first byte in the
130 file is a NULL byte, followed by the sequence in ncbistdaa format
131 (described in the NCBI Software Development Toolkit documentation).
132 Following the sequence is another NULL byte, followed by the next
133 sequence. The file ends with a NULL byte, following the last
134 sequence.
135
136 The nucleotide sequence file contains the nucleotide sequence, with
137 four basepairs compressed into one byte. The format used is NCBI2na,
138 documented in the NCBI Software Development Toolkit manual. Any
139 ambiguity characters present in the original sequence are replaced at
140 random by A, C, G or T. The true value of ambiguity characters are
141 stored at the end of each sequence to allow true reproduction of the
142 original sequence.
143
144 FORMAT OF THE HEADER FILE (formatdb version 3)
145 -------------------------
146
147 The format of the header file depends on whether or not the identifiers in the
148 original file were parsed or not. For the case that they were not, then each
149 entry has the format:
150
151 gnl|BL_ORD_ID|entry_number my favorite yeast sequence...
152
153 Here entry_number gives the ordinal number of the sequence in the
154 database (with zero offset). The identifier
155 gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify
156 the entry, if the user has not provided another identifier. If the
157 identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by
158 the correct identifier, as described in
159 ftp://ncbi.nlm.nih.gov/blast/db/README .
160
161 There are no separators between these deflines.
162
163 For formatdb version 4, the header file contains blast ASN.1 binary
164 deflines, which can parsed with parse_fastadl_asn().
165
166 FORMAT OF THE .MSK FILE
167 -----------------------
168
169 The .msk file is simply a packed list of masks for formatdb "oids" for
170 some other file (typically nr). The first value is the last oid
171 available; the remainder are packed 32 oids/mask, so that the number
172 of masks is 1/32 the number of sequences in the file.
173
174 **************************************************************** */
175
176 #ifdef USE_MMAP
177 #include <sys/types.h>
178 #include <sys/stat.h>
179 #include <sys/mman.h>
180 #ifdef IBM_AIX
181 #include <fcntl.h>
182 #else
183 #include <sys/fcntl.h>
184 #endif
185 #endif
186
187 #ifdef USE_MMAP
188 #ifndef MAP_FILE
189 #define MAP_FILE 0
190 #endif
191 #endif
192
193 #ifdef UNIX
194 #define RBSTR "r"
195 #else
196 #define RBSTR "rb"
197 #endif
198
199 #ifdef WIN32
200 #define SLASH_CHAR '\\'
201 #define SLASH_STR "\\"
202 #else
203 #define SLASH_CHAR '/'
204 #define SLASH_STR "/"
205 #endif
206
207 #define XTERNAL
208 #include "uascii.h"
209
210 #define XTERNAL
211 #include "upam.h"
212 #include "ncbl2_head.h"
213
214 #include "defs.h"
215 #include "structs.h"
216 #include "mm_file.h"
217
218 #define MAX_FADL_ACC_LEN 64
219
220 unsigned int bl2_uint4_cvt(unsigned int);
221 unsigned int bl2_long4_cvt(long);
222 uint64_t bl2_long8_cvt(uint64_t);
223 void src_int4_read(FILE *fd, int *valp);
224 void src_uint4_read(FILE *fd, unsigned int *valp);
225 void src_long4_read(FILE *fd, long *valp);
226 void src_long8_read(FILE *fd, int64_t *val);
227 void ncbi_long8_read(FILE *fd, int64_t *valp);
228 void src_char_read(FILE *fd, char *valp);
229 unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
230 int *gi_p, int *db, char *acc, size_t acc_len,
231 char *name, size_t name_len,
232 char *title, size_t t_len, int *taxid);
233
234 /* nt_btoa maps from blast 2bit format to ascii characters */
235 static char nt_btoa[5] = {"ACGT"};
236
237 static char *aa_b2toa= "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ"; /* NCBIstdaa */
238
239 static int aa_btof[32]; /* maps to fasta alphabet */
240 static int aa_btof_null = 0;
241
242 static int dbtype, dbformat, amb_cnt;
243
244 #define NCBIBL20 12
245
246 struct lmf_str *load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype);
247
248 int ncbl2_get_mmap_chain_o(struct seqr_chain *cur_seqr_chain,
249 struct lmf_str *m_fd, struct db_str *db);
250
251 int ncbl2_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
252 struct lmf_str *m_fd, struct db_str *db);
253
254 int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
255 int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
256
257 int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
258 int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
259
260 void newname(char *, char *, char *, int);
261 void parse_pal(char *, char *, int *, int *, FILE *);
262
263 int readMFILE (void *buffer, size_t size, int nitems, struct lmf_str *m_fd);
264
265 void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);
266
267 /* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format
268 file. Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can
269 be used to read the database. */
270
271 struct lmf_str *
ncbl2_openlib(struct lib_struct * lib_p,int ldnaseq)272 ncbl2_openlib(struct lib_struct *lib_p, int ldnaseq)
273 {
274 char lname[256]; /* .pal, .nal file name */
275 char dname[256]; /* .pin, .nin file for files included from .msk files */
276 char msk_name[256]; /* .msk file name */
277 char hname[256]; /* .phr, .nhr */
278 char sname[256]; /* .psq, .nsq */
279 char tname[256]; /* .pin, .nin file */
280 char db_dir[256]; /* directory where all the files live */
281 int pref_db= -1; /* right now, only swissprot.pal, pdbaa.pal
282 are used for masked OID files */
283 char *bp;
284 int oid_seqs, max_oid, have_oid_list;
285 int oid_cnt, oid_len;
286 unsigned int *oid_list, o_max;
287 int tmp;
288 int i;
289 #ifdef USE_MMAP
290 struct stat statbuf;
291 #endif
292 FILE *ifile; /* index offsets, also DB info */
293 struct lmf_str *m_fptr;
294
295 /* this function should be reorganized
296 (1) check to see if there is a .nal/.pal file before doing things
297 with names, since the names might be wrong.
298 (2) if there is a .nal/.pal file, check to see if it has a DBLIST
299 (later), if it does, then modify the lib_p->next chain
300 if it does not, then generate the appropriate file names, and
301 read the oid list
302 (3) otherwise (no .nal/.pal file), generate the file names
303
304
305 */
306
307 if (ldnaseq==SEQT_PROT) {
308 newname(lname,lib_p->file_name,AA_LIST_EXT,(int)sizeof(lname)); /* .pal */
309 }
310 else {
311 newname(lname,lib_p->file_name,NT_LIST_EXT,(int)sizeof(lname)); /* .nal */
312 }
313
314 /* check for a .nal/.pal OID list file */
315 max_oid = oid_seqs = 0;
316 oid_list = NULL;
317
318 /* here, we check for a .pal/.nal file by trying to open it */
319 if ((ifile = fopen(lname,"r"))!=NULL) {
320
321 if ((bp = strrchr(lib_p->file_name,SLASH_CHAR))!=NULL) {
322 *bp = '\0';
323 SAFE_STRNCPY(db_dir,lib_p->file_name,sizeof(db_dir));
324 SAFE_STRNCAT(db_dir,SLASH_STR,sizeof(db_dir));
325 *bp = SLASH_CHAR;
326 }
327 else {
328 db_dir[0]='\0';
329 }
330
331 /* we have a list file, we need to parse it */
332 parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
333 fclose(ifile);
334
335 pref_db = -1;
336
337 if (oid_seqs > 0) {
338
339 have_oid_list = 1;
340 /* get the pref_db before adding the directory */
341 if (strncmp(msk_name,"swissprot",9)==0) {
342 pref_db = 7;
343 }
344 else if (strncmp(msk_name,"pdbaa",5)==0) {
345 pref_db = 14;
346 }
347
348 /* need to add directory to both dname and msk_name */
349 SAFE_STRNCPY(tname,db_dir,sizeof(tname));
350 SAFE_STRNCAT(tname,msk_name, sizeof(tname));
351 SAFE_STRNCPY(msk_name, tname, sizeof(msk_name));
352
353 SAFE_STRNCPY(tname,db_dir,sizeof(tname));
354 SAFE_STRNCAT(tname,dname, sizeof(tname));
355 SAFE_STRNCPY(dname,tname,sizeof(dname));
356
357 if (ldnaseq == SEQT_PROT) {
358 newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
359 newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
360 newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
361 }
362 else { /* reading DNA library */
363 newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
364 newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
365 newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
366 }
367 /* now load the oid file */
368 if ((ifile = fopen(msk_name,RBSTR))==NULL) {
369 fprintf(stderr,"error - cannot load %s file\n",msk_name);
370 return NULL;
371 }
372 else {
373 src_uint4_read(ifile,&o_max);
374 if (o_max != max_oid) {
375 fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max);
376 }
377 oid_len = (max_oid/32+1);
378 if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
379 fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
380 return NULL;
381 }
382 if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
383 fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
384 return NULL;
385 }
386 fclose(ifile);
387 }
388 }
389 #ifdef DEBUG
390 else { /* we had a .msk file, but there are no oid's in
391 it. */
392 fprintf(stderr," *** WARNING -- found .pal/.nal file %s with no OIDs\n",lname);
393 return NULL;
394 }
395 #endif
396 }
397 else { /* else no OID/.msk file -- generate the names */
398 have_oid_list = 0;
399
400 /* initialize file names */
401 if (ldnaseq==SEQT_PROT) { /* read a protein database */
402 newname(tname,lib_p->file_name,AA_INDEX_EXT,(int)sizeof(tname)); /* .pin */
403 newname(hname,lib_p->file_name,AA_HEADER_EXT,(int)sizeof(hname)); /* .phr */
404 newname(sname,lib_p->file_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); /* .psq */
405
406 }
407 else { /* reading DNA library */
408 newname(tname,lib_p->file_name,NT_INDEX_EXT,(int)sizeof(tname)); /* .nin */
409 newname(hname,lib_p->file_name,NT_HEADER_EXT,(int)sizeof(hname)); /* .nhr */
410 newname(sname,lib_p->file_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname)); /* .nsq */
411 }
412 }
413
414
415 if (ldnaseq == SEQT_PROT) {
416 /* initialize map of BLAST2 amino acids to FASTA amino acids */
417 for (i=0; aa_b2toa[i]; i++) {
418 if ((tmp=aascii[aa_b2toa[i]])<NA) {
419 aa_btof[i]=tmp;
420 if (aa_b2toa[i] == 'O' || aa_b2toa[i] == 'o') aa_btof[i] = aascii['K'];
421 else if (aa_b2toa[i] == 'U' || aa_b2toa[i] == 'u') aa_btof[i] = aascii['C'];
422 }
423 else if (aa_b2toa[i]=='*') aa_btof[i]=aascii['X'];
424 else aa_btof[i]=0;
425 /* else aa_btof[i]=aascii['X']; */
426 }
427
428 /* check to see if aa_btof[] actually does anything interesting */
429 aa_btof_null = 1;
430 for (i=0; i<sizeof(aa_b2toa); i++) {
431 if (i != aa_btof[i]) {
432 #ifdef DEBUG
433 fprintf(stderr," difference at: i: %d [%c] != aa_btof[i]: %d [%c]\n",
434 i,aa_b2toa[i],aa_btof[i],NCBIstdaa[aa_btof[i]]);
435 #endif
436 aa_btof_null = 0;
437 }
438 }
439 } /* else no OID/.msk file */
440
441
442 /* now we have all the file names, open the files and read the data */
443 /* open the index/header file, and read the sequence type info */
444 if ((ifile = fopen(tname,RBSTR))==NULL) {
445 fprintf(stderr," cannot open %s (%s) INDEX file",tname,lib_p->file_name);
446 perror("...");
447 return NULL;
448 }
449 src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
450 src_uint4_read(ifile,(unsigned *)&dbtype); /* get 1 for protein/0 DNA */
451
452 if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) {
453 fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n",
454 tname,dbformat,FORMATDBV3);
455 return NULL;
456 }
457
458 if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) ||
459 (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) {
460 fprintf(stderr,"error - %s wrong format (%d/%d)\n",
461 tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT));
462 return NULL;
463 }
464
465 /* the files are there - allocate lmf_str */
466 if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
467 fprintf(stderr," cannot allocate lmf_str\n");
468 return NULL;
469 }
470
471 m_fptr->lib_aa = (ldnaseq == 0);
472 m_fptr->tmp_buf_max = 4096;
473 if ((m_fptr->tmp_buf=
474 (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
475 fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
476 return NULL;
477 }
478
479 /* load the oid info */
480 m_fptr->have_oid_list = have_oid_list;
481 m_fptr->max_oid = max_oid;
482 m_fptr->oid_seqs = oid_seqs;
483 m_fptr->oid_list = oid_list;
484 m_fptr->pref_db= pref_db;
485 m_fptr->get_mmap_chain = NULL;
486
487 /* open the header file */
488 if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) {
489 fprintf(stderr," cannot open %s header file\n",hname);
490 goto error_r;
491 }
492
493 /* ncbl2_ranlib is used for all BLAST2.0 access */
494 m_fptr->ranlib = ncbl2_ranlib;
495 m_fptr->bl_format_ver = dbformat;
496 m_fptr->lb_type = NCBIBL20;
497 m_fptr->libf = NULL;
498
499 if (ldnaseq==SEQT_DNA) {
500 if (oid_seqs > 0) {
501 m_fptr->getlib = ncbl2_getlibn_o;
502 }
503 else {
504 m_fptr->getlib = ncbl2_getlibn;
505 }
506 m_fptr->sascii = nascii;
507 }
508 else {
509 if (oid_seqs > 0) {
510 m_fptr->getlib = ncbl2_getliba_o;
511 m_fptr->get_mmap_chain = ncbl2_get_mmap_chain_o;
512 }
513 else {
514 m_fptr->getlib = ncbl2_getliba;
515 m_fptr->get_mmap_chain = ncbl2_get_mmap_chain;
516 }
517 m_fptr->sascii = aascii;
518 }
519 m_fptr->lb_name = lib_p->file_name;
520
521 /* open the sequence file */
522
523 #if defined (USE_MMAP)
524 m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
525 if (!m_fptr->mm_flg) {
526 fprintf(stderr," cannot open %s",sname);
527 perror("...");
528 }
529 else {
530 if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
531 fprintf(stderr," cannot fstat %s",sname);
532 perror("...");
533 m_fptr->mm_flg = 0;
534 }
535 else {
536 m_fptr->st_size = statbuf.st_size;
537 if((m_fptr->mmap_base =
538 mmap(NULL, m_fptr->st_size, PROT_READ,
539 MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
540 fprintf(stderr," cannot mmap %s",sname);
541 perror("...");
542 m_fptr->mm_flg = 0;
543 }
544 else {
545 m_fptr->mmap_addr = m_fptr->mmap_base;
546 m_fptr->mm_flg = 1;
547 }
548 }
549 /* regardless, close the open()ed version */
550 close(m_fptr->mmap_fd);
551 }
552 #else
553 m_fptr->mm_flg = 0;
554 #endif
555
556 if (!m_fptr->mm_flg) {
557 if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) {
558 fprintf(stderr," cannot open %s sequence file",sname);
559 perror("...");
560 goto error_r;
561 }
562 }
563
564 /* all files should be open -- the rest of the work can be done by a
565 function common to ncbl2_openlib() and ncbl2_reopen()
566 */
567
568 return load_ncbl2(m_fptr, ifile, dbformat, dbtype);
569
570 error_r:
571 /* here if failure after m_fptr allocated */
572 free(m_fptr);
573 return NULL;
574 }
575
576
577 /* **************************************************************** */
578 /* re-open an already opened library file using information in m_fptr
579
580 a valid m_fptr guarantees that the necessary files exist, and we
581 know whether there is an OID file. m_fptr's are NEVER used for
582 file lists, only for files with sequence data
583 */
584 /* **************************************************************** */
585
ncbl2_reopen(struct lmf_str * m_fptr)586 struct lmf_str *ncbl2_reopen(struct lmf_str *m_fptr) {
587 char lname[256]; /* .pal, .nal file name */
588 char dname[256]; /* .pin, .nin file for files included from .msk files */
589 char msk_name[256]; /* .msk file name */
590 char hname[256]; /* .phr, .nhr */
591 char sname[256]; /* .psq, .nsq */
592 char tname[256]; /* .pin, .nin file */
593 char db_dir[256]; /* directory where all the files live */
594 int pref_db= -1; /* right now, only swissprot.pal, pdbaa.pal
595 are used for masked OID files */
596 char *bp;
597 int oid_seqs, max_oid, have_oid_list;
598 int oid_cnt, oid_len;
599 unsigned int *oid_list, o_max;
600 #ifdef USE_MMAP
601 struct stat statbuf;
602 #endif
603 FILE *ifile; /* index offsets, also DB info */
604
605 /* its not open, but its being re-used, so re-initialize things */
606 m_fptr->libf = NULL;
607 m_fptr->mmap_fd = -1;
608 m_fptr->mm_flg = 0;
609
610 /* if we have an oid list, open it, read it, and use it to get the file names */
611 /* check for a .nal/.pal OID list file */
612 max_oid = oid_seqs = 0;
613 oid_list = NULL;
614
615 if (m_fptr->have_oid_list) {
616 if (m_fptr->lib_aa==1) {
617 newname(lname,m_fptr->lb_name,AA_LIST_EXT,(int)sizeof(lname)); /* .pal */
618 }
619 else {
620 newname(lname,m_fptr->lb_name,NT_LIST_EXT,(int)sizeof(lname)); /* .nal */
621 }
622
623 ifile = fopen(lname,"r"); /* it has to open, it did before */
624 if ((bp = strrchr(m_fptr->lb_name,SLASH_CHAR))!=NULL) {
625 *bp = '\0';
626 SAFE_STRNCPY(db_dir,m_fptr->lb_name,sizeof(db_dir));
627 SAFE_STRNCAT(db_dir,SLASH_STR,sizeof(db_dir));
628 *bp = SLASH_CHAR;
629 }
630 else {
631 db_dir[0]='\0';
632 }
633
634 /* we have a list file, we need to parse it */
635 parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
636 fclose(ifile);
637
638 /* we have read the .pal/.nal file, now deal with the .msk file */
639
640 pref_db = -1;
641
642 /* get the pref_db before adding the directory */
643 if (strncmp(msk_name,"swissprot",9)==0) {
644 pref_db = 7;
645 }
646 else if (strncmp(msk_name,"pdbaa",5)==0) {
647 pref_db = 14;
648 }
649
650 /* need to add directory to both dname and msk_name */
651 SAFE_STRNCPY(tname,db_dir,sizeof(tname));
652 SAFE_STRNCAT(tname,msk_name, sizeof(tname));
653 SAFE_STRNCPY(msk_name, tname, sizeof(msk_name));
654
655 SAFE_STRNCPY(tname,db_dir,sizeof(tname));
656 SAFE_STRNCAT(tname,dname, sizeof(tname));
657 SAFE_STRNCPY(dname,tname,sizeof(dname));
658
659 if (m_fptr->lib_aa) {
660 newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
661 newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
662 newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
663 }
664 else { /* reading DNA library */
665 newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
666 newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
667 newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
668 }
669
670 ifile = fopen(msk_name,RBSTR);
671 src_uint4_read(ifile,&o_max);
672 oid_len = (max_oid/32+1);
673 if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
674 fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
675 return NULL;
676 }
677 if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
678 fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
679 return NULL;
680 }
681 fclose(ifile);
682 }
683 else { /* else no OID/.msk file -- generate the names */
684 /* initialize file names */
685 if (m_fptr->lib_aa) { /* read a protein database */
686 newname(tname,m_fptr->lb_name,AA_INDEX_EXT,(int)sizeof(tname)); /* .pin */
687 newname(hname,m_fptr->lb_name,AA_HEADER_EXT,(int)sizeof(hname)); /* .phr */
688 newname(sname,m_fptr->lb_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); /* .psq */
689
690 }
691 else { /* reading DNA library */
692 newname(tname,m_fptr->lb_name,NT_INDEX_EXT,(int)sizeof(tname)); /* .nin */
693 newname(hname,m_fptr->lb_name,NT_HEADER_EXT,(int)sizeof(hname)); /* .nhr */
694 newname(sname,m_fptr->lb_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname)); /* .nsq */
695 }
696 }
697
698 /* now we have all the file names, open the files and read the data */
699 /* open the index/header file, and read the sequence type info */
700 if ((ifile = fopen(tname,RBSTR))==NULL) {
701 fprintf(stderr," cannot open %s (%s) INDEX file",tname,m_fptr->lb_name);
702 perror("...");
703 return NULL;
704 }
705 src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
706 src_uint4_read(ifile,(unsigned *)&dbtype); /* get 1 for protein/0 DNA */
707
708 m_fptr->tmp_buf_max = 4096;
709 if ((m_fptr->tmp_buf=
710 (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
711 fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
712 return NULL;
713 }
714
715 /* load the oid info */
716 m_fptr->max_oid = max_oid;
717 m_fptr->oid_seqs = oid_seqs;
718 m_fptr->oid_list = oid_list;
719 m_fptr->pref_db= pref_db;
720 m_fptr->get_mmap_chain = NULL;
721
722 /* open the header file */
723 m_fptr->hfile = fopen(hname,RBSTR);
724
725 /* ncbl2_ranlib is used for all BLAST2.0 access */
726 m_fptr->ranlib = ncbl2_ranlib;
727 m_fptr->bl_format_ver = dbformat;
728
729 if (!m_fptr->lib_aa) {
730 if (oid_seqs > 0) {
731 m_fptr->getlib = ncbl2_getlibn_o;
732 }
733 else {
734 m_fptr->getlib = ncbl2_getlibn;
735 }
736 m_fptr->sascii = nascii;
737 }
738 else {
739 if (oid_seqs > 0) { m_fptr->getlib = ncbl2_getliba_o; }
740 else { m_fptr->getlib = ncbl2_getliba; }
741 m_fptr->sascii = aascii;
742 }
743
744 /* open the sequence file */
745
746 #if defined (USE_MMAP)
747 m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
748 if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
749 fprintf(stderr," cannot fstat %s",sname);
750 perror("...");
751 m_fptr->mm_flg = 0;
752 }
753 else {
754 m_fptr->st_size = statbuf.st_size;
755 if((m_fptr->mmap_base =
756 mmap(NULL, m_fptr->st_size, PROT_READ,
757 MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
758 fprintf(stderr," cannot mmap %s",sname);
759 perror("...");
760 m_fptr->mm_flg = 0;
761 }
762 else {
763 m_fptr->mmap_addr = m_fptr->mmap_base;
764 m_fptr->mm_flg = 1;
765 }
766 }
767 /* regardless, close the open()ed version */
768 close(m_fptr->mmap_fd);
769 #else
770 m_fptr->mm_flg = 0;
771 #endif
772
773 if (!m_fptr->mm_flg) {
774 m_fptr->libf = fopen(sname,RBSTR);
775 if (!m_fptr->libf) {
776 fprintf(stderr," cannot open %s\n",sname);
777 return NULL;
778 }
779 m_fptr->mm_flg = 0;
780 }
781
782 return load_ncbl2(m_fptr, ifile, dbformat, dbtype);
783 }
784
785 struct lmf_str
load_ncbl2(struct lmf_str * m_fptr,FILE * ifile,int dbformat,int dbtype)786 *load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype) {
787 int title_len;
788 char *title_str=NULL;
789 int date_len;
790 char *date_str=NULL;
791 long ltmp;
792 int64_t l8tmp;
793 int i, tmp;
794 unsigned int *f_pos_arr;
795
796 src_uint4_read(ifile,(unsigned *)&title_len);
797
798 if (title_len > 0) {
799 if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) {
800 fprintf(stderr," cannot allocate title string (%d)\n",title_len);
801 goto error_r;
802 }
803 fread(title_str,(size_t)1,(size_t)title_len,ifile);
804 }
805
806 src_uint4_read(ifile,(unsigned *)&date_len);
807
808 if (date_len > 0) {
809 if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) {
810 fprintf(stderr," cannot allocate date string (%d)\n",date_len);
811 goto error_r;
812 }
813 fread(date_str,(size_t)1,(size_t)date_len,ifile);
814 }
815
816 m_fptr->lpos = 0;
817 src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt);
818
819 if (dbformat == FORMATDBV3) {
820 src_long4_read(ifile,<mp);
821 m_fptr->tot_len = ltmp;
822 }
823 else {
824 ncbi_long8_read(ifile,&l8tmp);
825 m_fptr->tot_len = l8tmp;
826 }
827
828 src_long4_read(ifile,<mp);
829 m_fptr->max_len = ltmp;
830
831 /* currently we are not using this information, but perhaps later */
832 if (title_str!=NULL) free(title_str);
833 if (date_str!=NULL) free(date_str);
834
835 #ifdef DEBUG
836 fprintf(stderr,"%s format: BL2 (%s) max_cnt: %d, totlen: %lld, maxlen %ld\n",
837 m_fptr->lb_name,m_fptr->mm_flg ? "mmap" : "fopen",
838 m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len);
839 #endif
840
841 /* allocate and read hdr indexes */
842 if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) {
843 fprintf(stderr," cannot allocate tmp header pointers\n");
844 goto error_r;
845 }
846
847 if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
848 fprintf(stderr," cannot allocate header pointers\n");
849 goto error_r;
850 }
851
852 /* allocate and read sequence offsets */
853 if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
854 fprintf(stderr," cannot allocate sequence pointers\n");
855 goto error_r;
856 }
857
858 if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
859 fprintf(stderr," error reading hdr offsets: %s\n",m_fptr->lb_name);
860 goto error_r;
861 }
862
863 for (i=0; i<=m_fptr->max_cnt; i++)
864 #ifdef IS_BIG_ENDIAN
865 m_fptr->d_pos_arr[i] = f_pos_arr[i];
866 #else
867 m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
868 #endif
869
870 if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
871 fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
872 goto error_r;
873 }
874 for (i=0; i<=m_fptr->max_cnt; i++) {
875 #ifdef IS_BIG_ENDIAN
876 m_fptr->s_pos_arr[i] = f_pos_arr[i];
877 #else
878 m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
879 #endif
880 }
881
882 if (dbtype == NTFORMAT) {
883 /* allocate and ambiguity offsets */
884 if ((m_fptr->a_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
885 fprintf(stderr," cannot allocate sequence pointers\n");
886 goto error_r;
887 }
888
889 /*
890 for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->a_pos_arr[i]);
891 */
892
893 if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
894 fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
895 goto error_r;
896 }
897 for (i=0; i<=m_fptr->max_cnt; i++) {
898 #ifdef IS_BIG_ENDIAN
899 m_fptr->a_pos_arr[i] = f_pos_arr[i];
900 #else
901 m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
902 #endif
903 }
904 }
905
906 /*
907 for (i=0; i < min(m_fptr->max_cnt,10); i++) {
908 fprintf(stderr,"%d: %d %d %d\n",i,m_fptr->s_pos_arr[i],m_fptr->a_pos_arr[i],m_fptr->d_pos_arr[i]);
909 }
910 */
911
912 /* all done with ifile, close it */
913 fclose(ifile);
914 free(f_pos_arr);
915
916 if (!m_fptr->mm_flg) {
917 tmp = fgetc(m_fptr->libf);
918 if (tmp!=NULLB)
919 fprintf(stderr," phase error: %d:%d found\n",0,tmp);
920 }
921
922 m_fptr->bl_lib_pos = 1;
923 amb_cnt = 0;
924
925 return m_fptr;
926
927 error_r:
928 /* here if failure after m_fptr allocated */
929 free(m_fptr);
930 return NULL;
931 }
932
933 /* **************************************************************** */
934 /* close the library, free s_pos_arr, a_pos_arr, but save file info */
935 /* **************************************************************** */
936
ncbl2_closelib(struct lmf_str * m_fptr)937 void ncbl2_closelib(struct lmf_str *m_fptr)
938 {
939
940 if (m_fptr->tmp_buf != NULL) {
941 free(m_fptr->tmp_buf);
942 m_fptr->tmp_buf = NULL;
943 m_fptr->tmp_buf_max = 0;
944 }
945
946 if (m_fptr->s_pos_arr !=NULL) {
947 free(m_fptr->s_pos_arr);
948 m_fptr->s_pos_arr = NULL;
949 }
950 if (m_fptr->a_pos_arr!=NULL) {
951 free(m_fptr->a_pos_arr);
952 m_fptr->a_pos_arr = NULL;
953 }
954
955 if (m_fptr->hfile !=NULL ) {
956 fclose(m_fptr->hfile);
957 m_fptr->hfile=NULL;
958 free(m_fptr->d_pos_arr);
959 m_fptr->d_pos_arr = NULL;
960 }
961
962 if (m_fptr->oid_list != NULL) {
963 free(m_fptr->oid_list);
964 m_fptr->oid_list = NULL;
965 m_fptr->oid_seqs = m_fptr->max_oid = 0;
966 }
967
968 #ifdef use_mmap
969 if (m_fptr->mm_flg) {
970 munmap(m_fptr->mmap_base,m_fptr->st_size);
971 m_fptr->mmap_fd = -1;
972 }
973 else
974 #endif
975 if (m_fptr->libf !=NULL ) {
976 fclose(m_fptr->libf);
977 m_fptr->libf=NULL;
978 }
979
980 m_fptr->mm_flg = 0;
981 }
982
983 /* **************************************************************** */
984 /* read a protein sequence using OID offsets */
985 /* **************************************************************** */
986
987 int
ncbl2_getliba_o(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)988 ncbl2_getliba_o(unsigned char *seq,
989 int maxs,
990 char *libstr,
991 int n_libstr,
992 fseek_t *libpos,
993 int *lcont,
994 struct lmf_str *m_fd,
995 long *l_off)
996 {
997 int tpos;
998 unsigned int t_mask, t_shift, oid_mask;
999
1000 /* get to the next valid pointer */
1001
1002 for ( tpos = m_fd->lpos ;tpos <= m_fd->max_oid; tpos++) {
1003 t_mask = tpos / 32;
1004 t_shift = 31 - (tpos % 32);
1005 if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
1006
1007 if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
1008 if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
1009 m_fd->lpos = tpos; /* already bumped up */
1010 m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
1011 return ncbl2_getliba(seq, maxs, libstr, n_libstr,
1012 libpos, lcont, m_fd, l_off);
1013 }
1014 }
1015 return -1;
1016 }
1017
1018 /* **************************************************************** */
1019 /* read a protein sequence */
1020 /* **************************************************************** */
1021
1022 int
ncbl2_getliba(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)1023 ncbl2_getliba(unsigned char *seq,
1024 int maxs,
1025 char *libstr,
1026 int n_libstr,
1027 fseek_t *libpos,
1028 int *lcont,
1029 struct lmf_str *m_fd,
1030 long *l_off)
1031 {
1032 unsigned char *sptr, *dptr;
1033 int s_chunk, d_len, lib_cnt;
1034 long seqcnt;
1035 long tmp;
1036 static long seq_len;
1037 #if defined(DEBUG) || defined(PCOMPLIB)
1038 int gi, my_db, taxid;
1039 char acc[MAX_FADL_ACC_LEN], title[MAX_UID], name[MAX_FADL_ACC_LEN];
1040 #endif
1041
1042 *l_off = 1;
1043
1044 lib_cnt = m_fd->lpos;
1045 *libpos = (fseek_t)m_fd->lpos;
1046
1047 if (*lcont==0) {
1048 if (lib_cnt >= m_fd->max_cnt) return -1; /* no more sequences */
1049 seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */
1050 if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt];
1051 #if !defined(DEBUG) && !defined(PCOMPLIB)
1052 libstr[0]='\0';
1053 #else
1054 /* get the name from the header file */
1055 fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
1056
1057 if (m_fd->bl_format_ver == FORMATDBV3) {
1058 d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1059 fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
1060 libstr[d_len]='\0';
1061 }
1062 else {
1063 d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1064 fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
1065 parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
1066 &gi, &my_db, acc, sizeof(acc), name, sizeof(name), title, sizeof(title), &taxid);
1067 sprintf(m_fd->tmp_buf,"gi|%d",gi);
1068 SAFE_STRNCPY(libstr,m_fd->tmp_buf,n_libstr);
1069 }
1070 libstr[n_libstr-1]='\0';
1071 #endif
1072 }
1073 if (seq_len <= maxs) { /* sequence fits */
1074 seqcnt = seq_len;
1075 m_fd->lpos++;
1076 *lcont = 0;
1077 }
1078 else { /* doesn't fit */
1079 seqcnt = maxs-1;
1080 (*lcont)++;
1081 }
1082
1083 if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr;
1084 else {
1085 if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) {
1086 fprintf(stderr," could not read sequence record: %lld %ld != %ld\n",
1087 *libpos,tmp,seq_len);
1088 goto error;
1089 }
1090 sptr = seq;
1091 }
1092 if (seq_len <= maxs) {seqcnt = --seq_len;}
1093
1094 /* everything is ready, set up dst. pointer, seq_len */
1095 if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--;
1096 if (aa_btof_null) {
1097 memcpy(seq,sptr,seq_len);
1098 }
1099 else {
1100 dptr = seq;
1101 s_chunk = seqcnt/16;
1102 while (s_chunk-- > 0) {
1103 *dptr++ = aa_btof[*sptr++];
1104 *dptr++ = aa_btof[*sptr++];
1105 *dptr++ = aa_btof[*sptr++];
1106 *dptr++ = aa_btof[*sptr++];
1107 *dptr++ = aa_btof[*sptr++];
1108 *dptr++ = aa_btof[*sptr++];
1109 *dptr++ = aa_btof[*sptr++];
1110 *dptr++ = aa_btof[*sptr++];
1111 *dptr++ = aa_btof[*sptr++];
1112 *dptr++ = aa_btof[*sptr++];
1113 *dptr++ = aa_btof[*sptr++];
1114 *dptr++ = aa_btof[*sptr++];
1115 *dptr++ = aa_btof[*sptr++];
1116 *dptr++ = aa_btof[*sptr++];
1117 *dptr++ = aa_btof[*sptr++];
1118 *dptr++ = aa_btof[*sptr++];
1119 }
1120 while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++];
1121 }
1122
1123 if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr;
1124
1125 /* we didn't get it all, so reset for more */
1126 if (*lcont) seq_len -= seqcnt;
1127
1128 seq[seqcnt]= EOSEQ;
1129 return (seqcnt);
1130
1131 error: fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
1132 fflush(stderr);
1133 return (-1);
1134 }
1135
1136 /* ncbl2_mmap_getchain_o fills cur_seqr_chain with sequence pointers
1137 from the memory mapped file at *m_fd, based on oid coordinates (not
1138 contiguous)
1139
1140 requires NCBIstdaa core encoding
1141 */
1142 int
ncbl2_get_mmap_chain_o(struct seqr_chain * cur_seqr_chain,struct lmf_str * m_fd,struct db_str * db)1143 ncbl2_get_mmap_chain_o(struct seqr_chain *cur_seqr_chain,
1144 struct lmf_str *m_fd, struct db_str *db)
1145 {
1146 int i;
1147 struct seq_record *seq_a, *seq_p;
1148 struct mseq_record *mseq_a, *mseq_p;
1149
1150 int tpos;
1151 unsigned int t_mask, t_shift, oid_mask;
1152
1153 tpos = m_fd->lpos;
1154 if (tpos > m_fd->max_oid) return EOF;
1155 seq_a = cur_seqr_chain->seqr_base;
1156 mseq_a = cur_seqr_chain->mseqr_base;
1157
1158 for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
1159 if (tpos > m_fd->max_oid) {
1160 break;
1161 }
1162 seq_p = &seq_a[i];
1163 mseq_p = &mseq_a[i];
1164
1165 /* now get the next valid pointer */
1166 while (tpos <= m_fd->max_oid) {
1167 t_mask = tpos / 32;
1168 t_shift = 31 - (tpos % 32);
1169 if ((oid_mask = m_fd->oid_list[t_mask])==0) { tpos++; continue; }
1170
1171 /* have a valid entry */
1172 if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
1173 m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
1174 seq_p->n1 = m_fd->s_pos_arr[tpos+1] - m_fd->s_pos_arr[tpos]-1; /* value is +1 off to get the NULL */
1175 seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->s_pos_arr[tpos]);
1176 seq_p->l_offset = 0;
1177 seq_p->l_off = 1;
1178
1179 db->entries++;
1180 db->length += seq_p->n1;
1181 if (db->length > LONG_MAX) {
1182 db->length -= LONG_MAX; db->carry++;
1183 }
1184
1185 mseq_p->m_file_p = m_fd;
1186 mseq_p->n1tot_p=NULL;
1187 mseq_p->cont = 0;
1188 seq_p->index = mseq_p->index = mseq_p->lseek = tpos++;
1189 #ifndef DEBUG
1190 mseq_p->libstr[0] = '\0';
1191 #else
1192 #endif
1193 #if DEBUG
1194 seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
1195 #endif
1196 break;
1197 }
1198 else {
1199 tpos++;
1200 }
1201 }
1202 }
1203 if (i==0 && tpos > m_fd->max_oid) return EOF;
1204 m_fd->lpos = tpos;
1205 cur_seqr_chain->cur_seq_cnt = i;
1206 if (i >= m_fd->max_cnt) return EOF;
1207 else return i;
1208 }
1209
1210 /* ncbl2_mmap_getchain fills cur_seqr_chain with sequence pointers
1211 from the memory mapped file at *m_fd
1212
1213 because the database is opened read-only, this code only works with
1214 an amino acid mapping identical to that used by blastdbcmd, aa_b2toa[]
1215
1216 */
1217 int
ncbl2_get_mmap_chain(struct seqr_chain * cur_seqr_chain,struct lmf_str * m_fd,struct db_str * db)1218 ncbl2_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
1219 struct lmf_str *m_fd, struct db_str *db) {
1220 int i, lib_cnt;
1221 struct seq_record *seq_a, *seq_p;
1222 struct mseq_record *mseq_a, *mseq_p;
1223
1224 lib_cnt = m_fd->lpos;
1225 if (lib_cnt >= m_fd->max_cnt) return EOF;
1226 seq_a = cur_seqr_chain->seqr_base;
1227 mseq_a = cur_seqr_chain->mseqr_base;
1228
1229 for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
1230 if (lib_cnt >= m_fd->max_cnt) break;
1231 seq_p = &seq_a[i];
1232 mseq_p = &mseq_a[i];
1233 seq_p->n1 = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]-1; /* value is +1 off to get the NULL */
1234
1235 db->entries++;
1236 db->length += seq_p->n1;
1237 if (db->length > LONG_MAX) {
1238 db->length -= LONG_MAX; db->carry++;
1239 }
1240
1241 mseq_p->m_file_p = m_fd;
1242 mseq_p->n1tot_p=NULL;
1243 mseq_p->cont = 0;
1244 seq_p->index = mseq_p->index = mseq_p->lseek = lib_cnt;
1245 #ifndef DEBUG
1246 mseq_p->libstr[0] = '\0';
1247 #else
1248 #endif
1249 seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt++]);
1250 seq_p->l_offset = 0;
1251 seq_p->l_off = 1;
1252 #if DEBUG
1253 seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
1254 #endif
1255 }
1256 m_fd->lpos = lib_cnt;
1257 cur_seqr_chain->cur_seq_cnt = i;
1258 return i;
1259 }
1260
1261 /* **************************************************************** */
1262 /* read a DNA sequence using OID offsets */
1263 /* **************************************************************** */
1264
1265 int
ncbl2_getlibn_o(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)1266 ncbl2_getlibn_o(unsigned char *seq,
1267 int maxs,
1268 char *libstr,
1269 int n_libstr,
1270 fseek_t *libpos,
1271 int *lcont,
1272 struct lmf_str *m_fd,
1273 long *l_off)
1274 {
1275 int tpos;
1276 unsigned int t_mask, t_shift, oid_mask;
1277
1278 /* get to the next valid pointer */
1279
1280 for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) {
1281 t_mask = tpos / 32;
1282 t_shift = 31 - (tpos % 32);
1283 if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
1284
1285 if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
1286 if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
1287 m_fd->lpos = tpos; /* already bumped up */
1288 m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
1289 return ncbl2_getlibn(seq, maxs, libstr, n_libstr,
1290 libpos, lcont, m_fd, l_off);
1291 }
1292 }
1293 return -1;
1294 }
1295
1296 static char tmp_amb[4096];
1297
1298 /* **************************************************************** */
1299 /* read a DNA sequence */
1300 /* **************************************************************** */
1301
1302 int
ncbl2_getlibn(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)1303 ncbl2_getlibn(unsigned char *seq,
1304 int maxs,
1305 char *libstr,
1306 int n_libstr,
1307 fseek_t *libpos,
1308 int *lcont,
1309 struct lmf_str *m_fd,
1310 long *l_off)
1311 {
1312 unsigned char *sptr, *tptr, stmp;
1313 long seqcnt;
1314 int s_chunk, lib_cnt;
1315 size_t tmp;
1316 char ch;
1317 static long seq_len;
1318 static int c_len,c_pad;
1319 int c_len_set, d_len;
1320 #if defined(DEBUG) || defined(PCOMPLIB)
1321 int gi, my_db, taxid;
1322 char acc[MAX_FADL_ACC_LEN], title[MAX_UID], name[MAX_FADL_ACC_LEN];
1323 #endif
1324
1325 /* ambiguity code adapted from NCBI Blast sources by:
1326
1327 Ralf Jost, Dipl.-Inform.
1328 Director, Technical Bioinformatics
1329 Biomax Informatics AG
1330 ralf.jost@biomax.com
1331 */
1332
1333 int amb_lower = *lcont * maxs;
1334 /* int amb_upper = (*lcont + 1) * maxs - 1; */ /* not used */
1335
1336 unsigned long x;
1337 unsigned long soff, eoff;
1338 int index;
1339
1340 unsigned int amb_cnt = 0;
1341 unsigned int large_amb = 0;
1342
1343 unsigned int start;
1344 unsigned int end;
1345 unsigned int amb_start;
1346
1347 const char str_2bit[] = "ACGT";
1348 const char str_4bit[] = "-ACMGRSVTWYHKDBN";
1349
1350 unsigned int *amb_ptr = NULL;
1351 long filepos = 0;
1352 char *mmap_pos = NULL;
1353
1354 unsigned int i;
1355 unsigned char res;
1356 int row_len;
1357 int j, position = 0;
1358
1359 *l_off = 1;
1360
1361 lib_cnt = m_fd->lpos;
1362 *libpos = (fseek_t)lib_cnt;
1363 if (*lcont==0) { /* not a continuation of previous */
1364 if (lib_cnt >= m_fd->max_cnt) return (-1);
1365 c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt];
1366 if (!m_fd->mm_flg) {
1367 /* fseek over amb_ray */
1368 fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0);
1369 m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
1370 }
1371 else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt];
1372 #if !defined(DEBUG) && !defined(PCOMPLIB)
1373 libstr[0]='\0';
1374 #else
1375 /* get the name from the header file */
1376 fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
1377
1378 if (m_fd->bl_format_ver == FORMATDBV3) {
1379 d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1380 fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
1381 }
1382 else {
1383 d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1384 fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
1385 parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
1386 &gi, &my_db, acc, sizeof(acc), name, sizeof(name), title, sizeof(title), &taxid);
1387 sprintf(m_fd->tmp_buf,"gi|%d",gi);
1388 SAFE_STRNCPY(libstr,m_fd->tmp_buf,n_libstr);
1389 }
1390 libstr[n_libstr-1]='\0';
1391 #endif
1392 } /* end of *lcont==0 */
1393
1394 /* To avoid the situation where c_len <= 1; we must anticipate what
1395 c_len will be after this pass. If it will be <= 64, back off this
1396 time so next time it will be > 64 */
1397
1398 seq_len = c_len*4;
1399
1400 if ((seq_len+4 > maxs) && (seq_len+4 - maxs <= 256)) {
1401 /* we won't be done but we will have less than 256 to go */
1402 c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;}
1403 else c_len_set = 0;
1404
1405 /*
1406 fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs);
1407 */
1408
1409 /* does the rest of the sequence fit? */
1410 if (seq_len <= maxs-4 && !c_len_set) {
1411 seqcnt = c_len;
1412 if (!m_fd->mm_flg) {
1413 if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) {
1414 fprintf(stderr,
1415 " could not read sequence record: %s %lld %ld != %ld: %d\n",
1416 libstr,*libpos,tmp,seqcnt,*seq);
1417 goto error;
1418 }
1419 m_fd->bl_lib_pos += tmp;
1420 sptr = seq + seqcnt;
1421 }
1422 else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
1423
1424 *lcont = 0; /* this is the last chunk */
1425 // lib_cnt++; /* increment to the next sequence */
1426 /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */
1427 c_pad = *(sptr-1);
1428 c_pad &= 0x3; /* get the last (low) 2 bits */
1429 seq_len -= (4 - c_pad); /* if the last 2 bits are 0, its a NULL byte */
1430 }
1431 else { /* get the next chunk, but more to come */
1432 seqcnt = ((maxs+3)/4)-1;
1433 if (!m_fd->mm_flg) {
1434 if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) {
1435 fprintf(stderr," could not read sequence record: %lld %ld/%ld\n",
1436 *libpos,tmp,seqcnt);
1437 goto error;
1438 }
1439 m_fd->bl_lib_pos += tmp;
1440 sptr = seq + seqcnt;
1441 }
1442 else {
1443 sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
1444 m_fd->mmap_addr += seqcnt;
1445 }
1446 seq_len = 4*seqcnt;
1447 c_len -= seqcnt;
1448 /* if (c_len_set) {c_len += 64; maxs += 256;} */
1449 (*lcont)++;
1450 /* hopefully we don't need this because of c_len -= 64. */
1451 /*
1452 if (c_len == 1) {
1453 #if !defined (USE_MMAP)
1454 c_pad = fgetc(m_fd->libf);
1455 *sptr=c_pad;
1456 #else
1457 c_pad = *m_fd->mmap_addr++;
1458 sptr = m_fd->mmap_addr;
1459 #endif
1460 c_pad &= 0x3;
1461 seq_len += c_pad;
1462 seqcnt++;
1463 lib_cnt++;
1464 *lcont = 0;
1465 }
1466 */
1467 }
1468
1469 /* point to the last packed byte and to the end of the array
1470 seqcnt is the exact number of bytes read
1471 tptr points to the destination, use multiple of 4 to simplify math
1472 sptr points to the source, note that the last byte will be read 4 cycles
1473 before it is written
1474 */
1475
1476 tptr = seq + 4*seqcnt;
1477 s_chunk = seqcnt/8;
1478 while (s_chunk-- > 0) {
1479 stmp = *--sptr;
1480 *--tptr = (stmp&3) +1;
1481 *--tptr = ((stmp >>= 2)&3)+1;
1482 *--tptr = ((stmp >>= 2)&3)+1;
1483 *--tptr = ((stmp >>= 2)&3)+1;
1484 stmp = *--sptr;
1485 *--tptr = (stmp&3) +1;
1486 *--tptr = ((stmp >>= 2)&3)+1;
1487 *--tptr = ((stmp >>= 2)&3)+1;
1488 *--tptr = ((stmp >>= 2)&3)+1;
1489 stmp = *--sptr;
1490 *--tptr = (stmp&3) +1;
1491 *--tptr = ((stmp >>= 2)&3)+1;
1492 *--tptr = ((stmp >>= 2)&3)+1;
1493 *--tptr = ((stmp >>= 2)&3)+1;
1494 stmp = *--sptr;
1495 *--tptr = (stmp&3) +1;
1496 *--tptr = ((stmp >>= 2)&3)+1;
1497 *--tptr = ((stmp >>= 2)&3)+1;
1498 *--tptr = ((stmp >>= 2)&3)+1;
1499 stmp = *--sptr;
1500 *--tptr = (stmp&3) +1;
1501 *--tptr = ((stmp >>= 2)&3)+1;
1502 *--tptr = ((stmp >>= 2)&3)+1;
1503 *--tptr = ((stmp >>= 2)&3)+1;
1504 stmp = *--sptr;
1505 *--tptr = (stmp&3) +1;
1506 *--tptr = ((stmp >>= 2)&3)+1;
1507 *--tptr = ((stmp >>= 2)&3)+1;
1508 *--tptr = ((stmp >>= 2)&3)+1;
1509 stmp = *--sptr;
1510 *--tptr = (stmp&3) +1;
1511 *--tptr = ((stmp >>= 2)&3)+1;
1512 *--tptr = ((stmp >>= 2)&3)+1;
1513 *--tptr = ((stmp >>= 2)&3)+1;
1514 stmp = *--sptr;
1515 *--tptr = (stmp&3) +1;
1516 *--tptr = ((stmp >>= 2)&3)+1;
1517 *--tptr = ((stmp >>= 2)&3)+1;
1518 *--tptr = ((stmp >>= 2)&3)+1;
1519 }
1520 while (tptr>seq) {
1521 stmp = *--sptr;
1522 *--tptr = (stmp&3) +1;
1523 *--tptr = ((stmp >>= 2)&3)+1;
1524 *--tptr = ((stmp >>= 2)&3)+1;
1525 *--tptr = ((stmp >>= 2)&3)+1;
1526 }
1527
1528 /*
1529 * Ambiguity-Decoding code from
1530
1531 Ralf Jost, Dipl.-Inform.
1532 Director, Technical Bioinformatics
1533 Biomax Informatics AG
1534 ralf.jost@biomax.com
1535
1536 using code from NCBI Blast distribution
1537 */
1538
1539 amb_start = m_fd->a_pos_arr[lib_cnt];
1540 end = m_fd->s_pos_arr[lib_cnt + 1];
1541
1542 if (amb_start != end) {
1543 if (!m_fd->mm_flg) {
1544 filepos = ftell(m_fd->libf);
1545 /* find the size of the ambiguity table */
1546 if (fseek(m_fd->libf, amb_start, SEEK_SET) != 0) {
1547 fprintf(stderr, "*** error [%s:%d] *** -- Seek amb start 0x%08x error %d\n",
1548 __FILE__, __LINE__, amb_start, ferror(m_fd->libf));
1549 }
1550 if (fread(&amb_cnt, sizeof(unsigned int), 1, m_fd->libf) != 1) {
1551 fprintf(stderr, "*** error [%s:%d] *** -- Read amb count error %d\n",
1552 __FILE__, __LINE__, ferror(m_fd->libf));
1553 }
1554 } else {
1555 mmap_pos = m_fd->mmap_addr;
1556 m_fd->mmap_addr = m_fd->mmap_base + amb_start;
1557 if (readMFILE((void *)&amb_cnt, sizeof(unsigned int), 1, m_fd) != 1) {
1558 fprintf(stderr, "*** error [%s:%d] *** -- Read amb count error %d\n",
1559 __FILE__, __LINE__, ferror(m_fd->libf));
1560 }
1561 }
1562
1563 amb_cnt = bl2_uint4_cvt(amb_cnt);
1564
1565 /* if the most significant bit is set on the count, then each
1566 * correction will take two entries in the table. the layout
1567 * is described below.
1568 */
1569 large_amb = amb_cnt >> 31;
1570 amb_cnt = amb_cnt & 0x7fffffff;
1571
1572 /* allocate enough space for the ambiguity table */
1573 amb_ptr = (unsigned int *) malloc(amb_cnt * sizeof(unsigned int));
1574 if (amb_ptr == NULL) {
1575 fprintf(stderr, "*** error [%s:%d] malloc amb table error size %ld\n",
1576 __FILE__, __LINE__, amb_cnt * sizeof(unsigned int));
1577 }
1578
1579 /* read the table */
1580 if (!m_fd->mm_flg) {
1581 if (fread((unsigned char *) amb_ptr, sizeof(unsigned int), amb_cnt, m_fd->libf)
1582 != amb_cnt) {
1583 fprintf(stderr, "*** error [%s:%d] *** -- Read amb table %d error %d\n",
1584 __FILE__, __LINE__, amb_cnt, ferror(m_fd->libf));
1585 }
1586 } else {
1587 if (readMFILE((void *) amb_ptr, sizeof(unsigned int), amb_cnt, m_fd)
1588 != amb_cnt) {
1589 fprintf(stderr, "*** error [%s:%d] *** -- Read amb table %d error %d\n",
1590 __FILE__, __LINE__, amb_cnt, ferror(m_fd->libf));
1591 }
1592 }
1593
1594 for (index=0; index < amb_cnt; index++) {
1595 amb_ptr[index] = bl2_uint4_cvt(amb_ptr[index]);
1596 }
1597
1598 for (i = 0; i < amb_cnt; i++) {
1599
1600 if (large_amb) {
1601 res = (unsigned char) (amb_ptr[i] >> 28);
1602 row_len = (int) (amb_ptr[i] >> 16) & 0xFFF;
1603 position = amb_ptr[i + 1];
1604 } else {
1605 res = (unsigned char) (amb_ptr[i] >> 28);
1606 row_len = (int) ((amb_ptr[i] >> 24) & 0xF);
1607 position = amb_ptr[i] & 0xFFFFFF;
1608 }
1609 for (index = position, j = 0; j <= row_len; j++) {
1610 if ((index + j >= amb_lower) && (index + j < amb_lower + 4 * seqcnt))
1611 seq[index + j - amb_lower] = nascii[str_4bit[res]];
1612 }
1613
1614 if (large_amb)
1615 i++;
1616 }
1617
1618 if (amb_ptr != NULL)
1619 free(amb_ptr);
1620
1621 if (!m_fd->mm_flg) {
1622 fseek(m_fd->libf, filepos, SEEK_SET);
1623 } else {
1624 m_fd->mmap_addr = mmap_pos;
1625 }
1626 }
1627
1628 /*
1629 * End of ambiguity-decoding.
1630 */
1631
1632 if ( *lcont == 0)
1633 lib_cnt++;
1634
1635
1636
1637
1638
1639 /*
1640 for (sptr=seq; sptr < seq+seq_len; sptr++) {
1641 printf("%c",nt[*sptr]);
1642 if ((int)(sptr-seq) % 60 == 59) printf("\n");
1643 }
1644 printf("\n");
1645 */
1646
1647 m_fd->lpos = lib_cnt;
1648 if (seqcnt*4 >= seq_len) { /* there was enough room */
1649 seq[seq_len]= EOSEQ;
1650 /* printf("%d\n",seq_len); */
1651 return seq_len;
1652 }
1653 else { /* not enough room */
1654 seq[seqcnt*4]=EOSEQ;
1655 seq_len -= 4*seqcnt;
1656 return (4*seqcnt);
1657 }
1658
1659 error: fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
1660 fflush(stderr);
1661 return (-1);
1662 }
1663
1664 /* 0 1 2 3 4 5 6 7
1665 8 9 10 11 12 13 14 15
1666 16 17 */
1667 static char
1668 *db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp",
1669 "pat","ref","gnl","gi","dbj","prf","pdb","tpg",
1670 "tpe","tpd"};
1671
1672 /* **************************************************************** */
1673 /* read a description, position for sequence */
1674 /* **************************************************************** */
1675 void
ncbl2_ranlib(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)1676 ncbl2_ranlib(char *str,
1677 int cnt,
1678 fseek_t libpos,
1679 char *libstr,
1680 struct lmf_str *m_fd)
1681 {
1682 int llen, lib_cnt;
1683 char *bp;
1684 unsigned char *my_buff=NULL;
1685 char descr[2048];
1686 unsigned char *abp;
1687 int gi, taxid;
1688 int my_db;
1689 char db[5], acc[MAX_FADL_ACC_LEN], name[MAX_FADL_ACC_LEN];
1690 char title[2048];
1691 int have_my_buff=0;
1692 int have_descr = 0;
1693
1694 lib_cnt = (int)libpos;
1695 llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt];
1696
1697 fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0);
1698
1699 if (m_fd->bl_format_ver == FORMATDBV3) {
1700 if (llen >= cnt) llen = cnt-1;
1701 fread(str,(size_t)1,(size_t)(llen),m_fd->hfile);
1702 }
1703 else {
1704 if (llen >= m_fd->tmp_buf_max) {
1705 if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) {
1706 fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen);
1707 my_buff = (unsigned char *)m_fd->tmp_buf;
1708 llen = m_fd->tmp_buf_max;
1709 }
1710 else have_my_buff = 1;
1711 }
1712 else {
1713 my_buff = (unsigned char *)m_fd->tmp_buf;
1714 }
1715 abp = my_buff;
1716 fread(my_buff,(size_t)1,llen,m_fd->hfile);
1717
1718 do {
1719 abp = parse_fastadl_asn(abp, my_buff+llen,
1720 &gi, &my_db, acc, sizeof(acc), name, sizeof(name),
1721 title, sizeof(title), &taxid);
1722
1723 if (gi > 0) {
1724 sprintf(descr,"gi|%d|%s|%s|%s ",gi,db_type_arr[my_db],acc,name);
1725 }
1726 else {
1727 if (acc[0] != '\0') sprintf(descr,"%s ",acc);
1728 else descr[0] = '\0';
1729 if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) sprintf(descr+strlen(descr),"%s ", name);
1730 }
1731 if (my_db == 0 || m_fd->pref_db < 0) {
1732 if (!have_descr) {
1733 SAFE_STRNCPY(str,descr,cnt);
1734 have_descr = 1;
1735 }
1736 else {
1737 SAFE_STRNCAT(str,"\001",cnt);
1738 SAFE_STRNCAT(str,descr,cnt);
1739 }
1740 SAFE_STRNCAT(str,title,cnt);
1741 if (strlen(str) >= cnt-1) break;
1742 }
1743 else if (m_fd->pref_db == my_db) {
1744 have_descr = 1;
1745 SAFE_STRNCPY(str,descr,cnt);
1746 SAFE_STRNCAT(str,title,cnt);
1747 break;
1748 }
1749 } while (abp);
1750
1751 if (!have_descr) {
1752 SAFE_STRNCPY(str,descr,cnt);
1753 SAFE_STRNCAT(str,descr,cnt);
1754 }
1755
1756 if (have_my_buff) free(my_buff);
1757 }
1758
1759 str[cnt-1]='\0';
1760
1761 bp = str;
1762 while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';}
1763
1764 if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0);
1765
1766 m_fd->lpos = lib_cnt;
1767 m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
1768 }
1769
bl2_uint4_cvt(unsigned int val)1770 unsigned int bl2_uint4_cvt(unsigned int val)
1771 {
1772 unsigned int res;
1773 #ifdef IS_BIG_ENDIAN
1774 return val;
1775 #else /* it better be LITTLE_ENDIAN */
1776 res = ((val&255)*256)+ ((val>>8)&255);
1777 res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1778 return res;
1779 #endif
1780 }
1781
bl2_long4_cvt(long val)1782 unsigned int bl2_long4_cvt(long val)
1783 {
1784 int val4;
1785 unsigned int res;
1786 #ifdef IS_BIG_ENDIAN
1787 val4 = val;
1788 return val4;
1789 #else /* it better be LITTLE_ENDIAN */
1790 res = ((val&255)*256)+ ((val>>8)&255);
1791 res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1792 return res;
1793 #endif
1794 }
1795
bl2_long8_cvt(uint64_t val)1796 uint64_t bl2_long8_cvt(uint64_t val)
1797 {
1798 uint64_t res;
1799 #ifdef IS_BIG_ENDIAN
1800 return val;
1801 #else /* it better be LITTLE_ENDIAN */
1802 res = ((val&255)*256)+ ((val>>8)&255);
1803 res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1804 #ifdef BIG_LIB64
1805 res = (res<<16) + (((val>>32)&255)*256) + ((val>>40)&255);
1806 res = (res<<16) + (((val>>48)&255)*256) + ((val>>56)&255);
1807 #else
1808 fprintf(stderr,"Cannot use bl2_long8_cvt without 64-bit longs\n");
1809 exit(1);
1810 #endif
1811 return res;
1812 #endif
1813 }
1814
src_int4_read(FILE * fd,int * val)1815 void src_int4_read(FILE *fd, int *val)
1816 {
1817 #ifdef IS_BIG_ENDIAN
1818 fread((char *)val,(size_t)4,(size_t)1,fd);
1819 #else
1820 unsigned char b[4];
1821
1822 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1823 *val = 0;
1824 *val = (int)(((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1825 #endif
1826 }
1827
src_long4_read(FILE * fd,long * valp)1828 void src_long4_read(FILE *fd, long *valp)
1829 {
1830 int val4;
1831 #ifdef IS_BIG_ENDIAN
1832 fread(&val4,(size_t)4,(size_t)1,fd);
1833 *valp = val4;
1834 #else
1835 unsigned char b[4];
1836
1837 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1838 val4 = (int)(((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1839 *valp = val4;
1840 #endif
1841 }
1842
src_uint4_read(FILE * fd,unsigned int * valp)1843 void src_uint4_read(FILE *fd, unsigned int *valp)
1844 {
1845 #ifdef IS_BIG_ENDIAN
1846 fread(valp,(size_t)4,(size_t)1,fd);
1847 #else
1848 unsigned char b[4];
1849
1850 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1851 *valp = 0;
1852 *valp = (unsigned int)(((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1853 #endif
1854 }
1855
src_long8_read(FILE * fd,int64_t * val)1856 void src_long8_read(FILE *fd, int64_t *val)
1857 {
1858 #ifdef IS_BIG_ENDIAN
1859 fread((void *)val,(size_t)8,(size_t)1,fd);
1860 #else
1861 int val_h;
1862 unsigned int val_l;
1863 unsigned char b[8];
1864
1865 fread((char *)&b[0],(size_t)1,(size_t)8,fd);
1866
1867 /* modified to work with both 32-bit and 64-bit native ints */
1868 val_h = (((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1869 val_l = (((((b[4]<<8)+b[5])<<8)+b[6])<<8)+b[7];
1870 *val = val_h * 4294967296LL + val_l;
1871
1872 /*
1873 *val = 0;
1874 *val = (long)(((((((b[0]<<8)+b[1]<<8)+b[2]<<8)
1875 +b[3]<<8)+b[4]<<8)+b[5]<<8)
1876 +b[6]<<8)+b[7];
1877 */
1878 #endif
1879 }
1880
ncbi_long8_read(FILE * fd,int64_t * val)1881 void ncbi_long8_read(FILE *fd, int64_t *val)
1882 {
1883 unsigned char b[8];
1884
1885 fread((char *)&b[0],(size_t)1,(size_t)8,fd);
1886 *val = 0;
1887 *val = (long)(((((((((((((b[7]<<8)+b[6])<<8)+b[5])<<8)+b[4])<<8)+b[3])<<8)+b[2])<<8)+b[1])<<8)+b[0];
1888 }
1889
src_char_read(FILE * fd,char * val)1890 void src_char_read(FILE *fd, char *val)
1891 {
1892 fread(val,(size_t)1,(size_t)1,fd);
1893 }
1894
src_fstr_read(FILE * fd,char * val,int slen)1895 void src_fstr_read(FILE *fd, char *val, int slen)
1896 {
1897 fread(val,(size_t)slen,(size_t)1,fd);
1898 }
1899
1900 void
newname(char * nname,char * oname,char * suff,int maxn)1901 newname(char *nname, char *oname, char *suff, int maxn)
1902 {
1903 SAFE_STRNCPY(nname,oname,maxn);
1904 SAFE_STRNCAT(nname,".",maxn);
1905 SAFE_STRNCAT(nname,suff,maxn);
1906 }
1907
1908 #define ASN_SEQ 0x30
1909 #define ASN_IS_BOOL 1
1910 #define ASN_IS_INT 2
1911 #define ASN_IS_STR 26
1912 #define ASN_TYPE_MASK 31
1913
1914 unsigned char *
get_asn_int(unsigned char * abp,int * val)1915 get_asn_int(unsigned char *abp, int *val) {
1916
1917 int v_len, v;
1918
1919 v = 0;
1920 if (*abp++ != ASN_IS_INT) { /* check for int */
1921 fprintf(stderr,"*** error [%s:%d] -- int missing\n",__FILE__, __LINE__);
1922 }
1923 else {
1924 v_len = *abp++;
1925 while (v_len-- > 0) {
1926 v *= 256;
1927 v += *abp++;
1928 }
1929 abp += 2; /* skip over null's */
1930 }
1931 *val = v;
1932 return abp;
1933 }
1934
1935 unsigned char *
get_asn_text(unsigned char * abp,char * text,int t_len)1936 get_asn_text(unsigned char *abp, char *text, int t_len) {
1937 int tch, at_len;
1938
1939 text[0] = '\0';
1940 if (*abp++ != ASN_IS_STR) { /* check for str */
1941 fprintf(stderr,"*** error [%s:%d] - str missing\n",__FILE__,__LINE__);
1942 }
1943 else {
1944 if ((tch = *abp++) > 128) { /* string length is in next bytes */
1945 tch &= 0x7f; /* get number of bytes for len */
1946 at_len = 0;
1947 while (tch-- > 0) { at_len = (at_len << 8) + *abp++;}
1948 }
1949 else {
1950 at_len = tch;
1951 }
1952
1953 if ( at_len < t_len-1) {
1954 memcpy(text, abp, at_len);
1955 text[at_len] = '\0';
1956 }
1957 else {
1958 memcpy(text, abp, t_len-1);
1959 text[t_len-1] = '\0';
1960 }
1961 abp += at_len + 2;
1962 }
1963 return abp;
1964 }
1965
1966 /* something to try to skip over stuff we don't want */
1967 unsigned char *
get_asn_junk(unsigned char * abp)1968 get_asn_junk(unsigned char *abp) {
1969
1970 int seq_cnt = 0;
1971 int tmp;
1972 char string[256];
1973
1974 while (*abp) {
1975 if ( *abp == ASN_SEQ) { abp += 2; seq_cnt++;}
1976 else if ( *abp == ASN_IS_BOOL ) {abp = get_asn_int(abp, &tmp);}
1977 else if ( *abp == ASN_IS_INT ) {abp = get_asn_int(abp, &tmp);}
1978 else if ( *abp == ASN_IS_STR ) {abp = get_asn_text(abp, string, sizeof(string)-1);}
1979 else { abp += 2;}
1980 }
1981
1982 while (seq_cnt-- > 0) abp += 2;
1983 return abp;
1984 }
1985
1986 #define ASN_FADL_TITLE 0xa0 /* \240 160 */
1987 #define ASN_FADL_SEQID 0xa1 /* \241 161 */
1988 #define ASN_FADL_TAXID 0xa2 /* \242 162 */
1989 #define ASN_FADL_MEMBERS 0xa3 /* \243 163 */
1990 #define ASN_FADL_LINKS 0xa4 /* \244 164 */
1991 #define ASN_FADL_OTHER 0xa5 /* \245 165 */
1992 #define ASN_FADL_GI 171
1993
1994 #define ASN_FADL_TEXTSEQ_ID 0xa4 /* \244 164 */
1995 #define ASN_FADL_OTHERSEQ_ID 0xa5 /* \245 164 */
1996
1997 /* from seq.asn::Textseq-id/Textannot-id */
1998 #define ASN_TEXTSEQ_ID_NAME 0xa0 /* \240 160 */
1999 #define ASN_TEXTSEQ_ID_ACC 0xa1 /* \241 161 */
2000 #define ASN_TEXTSEQ_ID_REL 0xa2 /* \242 162 */
2001 #define ASN_TEXTSEQ_ID_VER 0xa3 /* \243 163 */
2002
2003 unsigned char *
get_asn_textseq_id(unsigned char * abp,char * name,size_t name_len,char * acc,size_t acc_len)2004 get_asn_textseq_id(unsigned char *abp,
2005 char *name, size_t name_len, char *acc, size_t acc_len)
2006 {
2007 char release[20], ver_str[10];
2008 int version;
2009 int seqcnt = 0;
2010
2011 ver_str[0]='\0';
2012
2013 if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
2014
2015 while (*abp) {
2016 switch (*abp) {
2017 case ASN_TEXTSEQ_ID_NAME :
2018 abp = get_asn_text(abp+2, name, name_len);
2019 break;
2020 case ASN_TEXTSEQ_ID_ACC :
2021 abp = get_asn_text(abp+2, acc, acc_len);
2022 break;
2023 case ASN_TEXTSEQ_ID_REL :
2024 abp = get_asn_text(abp+2, release, sizeof(release));
2025 break;
2026 case ASN_TEXTSEQ_ID_VER :
2027 abp = get_asn_int(abp+2, &version);
2028 sprintf(ver_str,".%d",version);
2029 break;
2030 default: abp += 2;
2031 }
2032 }
2033 while (seqcnt-- > 0) abp += 4;
2034 strncat(acc,ver_str,acc_len-strlen(acc));
2035 acc[19]='\0';
2036 return abp; /* skip 2 NULL's */
2037 }
2038
2039 unsigned char *
get_asn_local_id(unsigned char * abp,char * acc,size_t acc_len)2040 get_asn_local_id(unsigned char *abp, char *acc, size_t acc_len)
2041 {
2042 int seqcnt = 0;
2043
2044 if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
2045
2046 abp = get_asn_text(abp+2, acc, acc_len);
2047
2048 while (seqcnt-- > 0) abp += 4;
2049 acc[acc_len-1]='\0';
2050 return abp+2; /* skip 2 NULL's */
2051 }
2052
2053 unsigned char *
get_asn_dbtag(unsigned char * abp,char * name,size_t name_len,char * str,size_t str_len,int * id_p)2054 get_asn_dbtag(unsigned char *abp, char *name, size_t name_len, char *str, size_t str_len, int *id_p) {
2055
2056 if (*abp == ASN_SEQ) { abp += 2;}
2057
2058 if (*abp == 0xa0) { /* get db */
2059 abp = get_asn_text(abp+2, name, name_len);
2060 }
2061 else {
2062 fprintf(stderr,"*** error [%s:%d] - missing dbtag:db %d %d\n",__FILE__, __LINE__, abp[0],abp[1]);
2063 abp += 2;
2064 }
2065
2066 if (*abp == 0xa1) { /* get tag */
2067 abp += 2;
2068 abp += 2; /* skip over id */
2069 if (*abp == 2) abp = get_asn_int(abp,id_p);
2070 else abp = get_asn_text(abp, str, str_len);
2071 }
2072 else {
2073 fprintf(stderr,"*** error [%s:%d] - missing dbtag:tag %2x %2x\n",__FILE__, __LINE__, abp[0],abp[1]);
2074 abp += 2;
2075 }
2076 return abp+2; /* skip 2 NULL's */
2077 }
2078
2079 #define ASN_DATE_STR 0xa0
2080 #define ASN_DATE_STD 0xa1
2081 #define ASN_DATE_STD_YR 0xa0
2082 #define ASN_DATE_STD_MO 0xa1
2083 #define ASN_DATE_STD_DAY 0xa2
2084
2085 unsigned char *
get_asn_date_std(unsigned char * abp,char * date)2086 get_asn_date_std(unsigned char *abp, char *date) {
2087 int seq_cnt=0;
2088 int year, month, day;
2089
2090 year = month = day = 0;
2091
2092 while (*abp == ASN_SEQ) { abp+=2; seq_cnt++;}
2093
2094 while (*abp) {
2095 switch (*abp) {
2096 case ASN_DATE_STD_YR :
2097 abp = get_asn_int(abp+2, &year);
2098 break;
2099 case ASN_DATE_STD_MO :
2100 abp = get_asn_int(abp+2, &month);
2101 break;
2102 case ASN_DATE_STD_DAY :
2103 abp = get_asn_int(abp+2, &day);
2104 break;
2105 default:
2106 fprintf(stderr, "*** error [%s:%d] - incorrect date-std code: %0x1 %0x1\n",
2107 __FILE__, __LINE__, abp[0], abp[1]);
2108 }
2109 }
2110 sprintf(date, "%02d-%02d-%02d", year, month, day);
2111
2112 while (seq_cnt-- > 0) { abp += 4;}
2113
2114 return abp+2;
2115 }
2116
2117 unsigned char *
get_asn_date(unsigned char * abp,char * date,size_t date_len)2118 get_asn_date(unsigned char *abp, char *date, size_t date_len) {
2119
2120 if (*abp == ASN_DATE_STR) {
2121 abp = get_asn_text(abp, date, date_len);
2122 }
2123 else if (*abp == ASN_DATE_STD) {
2124 abp = get_asn_date_std(abp+2, date);
2125 }
2126 else {
2127 fprintf(stderr, "*** error [%s:%d] - incorrect date code: %0x1 %0x1\n",
2128 __FILE__, __LINE__, abp[0], abp[1]);
2129 }
2130 return abp+2;
2131 }
2132
2133 unsigned char *
get_asn_pdb_id(unsigned char * abp,char * acc,size_t acc_len,char * chain,size_t chain_len)2134 get_asn_pdb_id(unsigned char *abp, char *acc, size_t acc_len, char *chain, size_t chain_len)
2135 {
2136 int ichain, seq_cnt=0;
2137 char dummy[40];
2138
2139 if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;}
2140
2141 while (*abp) {
2142 switch (*abp) {
2143 case 0: abp += 2; break;
2144 case 0xa0: /* mol */
2145 abp = get_asn_text(abp+2, acc, 20);
2146 break;
2147 case 0xa1: /* chain */
2148 abp = get_asn_int(abp+2, &ichain);
2149 chain[0] = ichain;
2150 chain[1] = '\0';
2151 break;
2152 case 0xa2: /* release */
2153 abp = get_asn_date(abp+2, dummy, sizeof(dummy));
2154 break;
2155 default: abp+=2;
2156 }
2157 }
2158 while (seq_cnt-- > 0) {abp += 4;}
2159 return abp;
2160 }
2161
2162 unsigned char *
get_asn_seqid_ori(unsigned char * abp,int * gi_p,int * db,char * acc,size_t acc_len,char * name,size_t name_len)2163 get_asn_seqid_ori(unsigned char *abp, int *gi_p, int *db, char *acc, size_t acc_len, char *name, size_t name_len)
2164 {
2165 int db_type, itmp, seq_cnt=0;
2166
2167 *gi_p = 0;
2168
2169 if (*abp != ASN_SEQ) {
2170 fprintf(stderr, "*** error [%s:%d] - seqid - missing SEQ 1: %2x %2x\n",
2171 __FILE__, __LINE__, abp[0], abp[1]);
2172 return abp;
2173 }
2174 else { abp += 2; seq_cnt++;}
2175
2176 db_type = (*abp & ASN_TYPE_MASK);
2177
2178 if (db_type == 11) { /* gi */
2179 abp = get_asn_int(abp+2,gi_p);
2180 }
2181
2182 while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}
2183
2184 db_type = (*abp & ASN_TYPE_MASK);
2185 if (db_type > 17) {db_type = 0;}
2186 *db = db_type;
2187
2188 switch(db_type) {
2189 case 0:
2190 abp = get_asn_local_id(abp, acc, acc_len);
2191 break;
2192 case 1:
2193 case 2:
2194 abp = get_asn_int(abp+2,&itmp);
2195 abp += 2;
2196 break;
2197 case 11:
2198 abp = get_asn_int(abp+2,&itmp);
2199 break;
2200 case 4:
2201 case 5:
2202 case 6:
2203 case 7:
2204 case 9:
2205 case 12:
2206 case 13:
2207 case 15:
2208 case 16:
2209 case 17:
2210 abp = get_asn_textseq_id(abp,name,name_len, acc, acc_len);
2211 break;
2212 case 10:
2213 abp = get_asn_dbtag(abp+2,name,name_len, acc, acc_len, &itmp);
2214 case 14:
2215 abp = get_asn_pdb_id(abp,acc,acc_len,name,name_len);
2216 break;
2217 default: abp += 2;
2218 }
2219
2220 while (seq_cnt-- > 0) { abp += 4;}
2221 return abp; /* skip over 2 NULL's */
2222 }
2223
2224 unsigned char *
get_asn_db_info(unsigned char * abp,int db_type,int * gi_p,char * name,size_t name_len,char * acc,size_t acc_len)2225 get_asn_db_info(unsigned char *abp, int db_type, int *gi_p, char *name, size_t name_len, char *acc, size_t acc_len) {
2226 int seq_cnt = 0, itmp;
2227
2228 if (db_type == 11) {
2229 abp = get_asn_int(abp, gi_p);
2230 return abp;
2231 }
2232
2233 while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}
2234
2235 switch(db_type) {
2236 case 0:
2237 abp = get_asn_local_id(abp, acc, acc_len);
2238 break;
2239 case 1:
2240 case 2:
2241 abp = get_asn_int(abp,gi_p);
2242 break;
2243 case 11:
2244 abp = get_asn_int(abp+2,gi_p);
2245 break;
2246 case 4:
2247 case 5:
2248 case 6:
2249 case 7:
2250 case 9:
2251 case 12:
2252 case 13:
2253 case 15:
2254 case 16:
2255 case 17:
2256 abp = get_asn_textseq_id(abp,name,name_len, acc, acc_len);
2257 break;
2258 case 10:
2259 abp = get_asn_dbtag(abp,name,name_len, acc, acc_len, &itmp);
2260 break;
2261 case 14:
2262 abp = get_asn_pdb_id(abp,acc,acc_len,name,name_len);
2263 break;
2264 default: abp += 2;
2265 }
2266
2267 while (seq_cnt-- > 0) { abp += 4;}
2268 return abp; /* skip over 2 NULL's */
2269 }
2270
2271
2272 unsigned char *
get_asn_seqid(unsigned char * abp,int * gi_p,int * db,char * acc,size_t acc_len,char * name,size_t name_len)2273 get_asn_seqid(unsigned char *abp, int *gi_p, int *db, char *acc, size_t acc_len, char *name, size_t name_len)
2274 {
2275 int db_type, itmp, seq_cnt=0;
2276
2277 *gi_p = 0;
2278
2279 if (*abp != ASN_SEQ) {
2280 fprintf(stderr, "*** error [%s:%d] - get_asn_seqid - missing SEQ 1: %2x %2x\n",
2281 __FILE__, __LINE__, abp[0], abp[1]);
2282 return abp;
2283 }
2284 else { abp += 2; seq_cnt++;}
2285
2286 while (*abp) {
2287 if (*abp == ASN_FADL_TEXTSEQ_ID) {
2288 abp = get_asn_textseq_id(abp+2, name, name_len, acc, acc_len );
2289 }
2290 else if (*abp == ASN_FADL_OTHERSEQ_ID) {
2291 abp = get_asn_textseq_id(abp+2, name, name_len, acc, acc_len );
2292 }
2293 else if ((db_type = (*abp & ASN_TYPE_MASK)) < 17) {
2294 abp = get_asn_db_info(abp+2, db_type, gi_p, name, name_len, acc, acc_len);
2295 *db = db_type;
2296 }
2297 else {
2298 fprintf(stderr,"*** error [%s:%d] -- get_asn_seqid not TEXTSEQ/not GI: %2x %2x\n",
2299 __FILE__, __LINE__,abp[0], abp[1]);
2300 return abp;
2301 }
2302 }
2303
2304 while (seq_cnt-- > 0) { abp += 4;}
2305 return abp; /* skip over 2 NULL's */
2306 }
2307
2308 unsigned char *
get_asn_seqid_other(unsigned char * abp,int * gi_p,char * acc,size_t acc_len,char * name,size_t name_len)2309 get_asn_seqid_other(unsigned char *abp, int *gi_p, char *acc, size_t acc_len, char *name, size_t name_len)
2310 {
2311 int db_type, itmp, seq_cnt=0;
2312
2313 *gi_p = 0;
2314 name[0] = acc[0] = '\0';
2315
2316 if (*abp != ASN_SEQ) {
2317 fprintf(stderr, "*** error [%s:%d] - get_asn_seqid - missing SEQ 1: %2x %2x\n",
2318 __FILE__, __LINE__, abp[0], abp[1]);
2319 return abp;
2320 }
2321 else { abp += 2; seq_cnt++;}
2322
2323 while (*abp) {
2324 if (*abp == ASN_TEXTSEQ_ID_ACC) {
2325 abp = get_asn_text(abp+2, acc, acc_len );
2326 }
2327 if (*abp == ASN_TEXTSEQ_ID_VER) {
2328 abp = get_asn_text(abp+2, acc, acc_len );
2329 }
2330 else if (*abp == ASN_TEXTSEQ_ID_NAME) {
2331 abp = get_asn_text(abp+2, name, name_len );
2332 }
2333 else if (*abp == ASN_IS_INT) {
2334 abp = get_asn_int(abp, &itmp );
2335 }
2336 else {
2337 fprintf(stderr,"*** error [%s:%d] -- get_asn_seqid not SEQID_ACC: %2x %2x\n",
2338 __FILE__, __LINE__,abp[0], abp[1]);
2339 return abp;
2340 }
2341 }
2342
2343 if (*abp == ASN_FADL_GI) {
2344 abp = get_asn_int(abp+2, gi_p);
2345 }
2346
2347 while (seq_cnt-- > 0) { abp += 4;}
2348 return abp; /* skip over 2 NULL's */
2349 }
2350
2351
2352 unsigned char *
parse_fastadl_asn(unsigned char * asn_buff,unsigned char * asn_max,int * gi_p,int * db,char * acc,size_t acc_len,char * name,size_t name_len,char * title,size_t t_len,int * taxid_p)2353 parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
2354 int *gi_p, int *db, char *acc, size_t acc_len,
2355 char *name, size_t name_len,
2356 char *title, size_t t_len, int *taxid_p) {
2357 unsigned char *abp;
2358 int this_db, itmp;
2359 int seq_cnt = 0;
2360
2361 acc[0] = name[0] = db[0] = title[0] = '\0';
2362
2363 abp = asn_buff;
2364 while ( abp < asn_max && *abp) {
2365 if (*abp == ASN_SEQ) { abp += 2; seq_cnt++; }
2366 else if (*abp == ASN_FADL_TITLE) {
2367 abp = get_asn_text(abp+2, title, t_len);
2368 }
2369 else if (*abp == ASN_FADL_SEQID ) {
2370 abp = get_asn_seqid(abp+2, gi_p, db, acc, acc_len, name, name_len);
2371 }
2372 else if (*abp == ASN_FADL_TAXID ) {
2373 abp = get_asn_int(abp+2, taxid_p);
2374 }
2375 else if (*abp == ASN_FADL_MEMBERS) {
2376 abp = get_asn_junk(abp+2);
2377 break;
2378 }
2379 else if (*abp == ASN_FADL_LINKS ) {
2380 abp = get_asn_junk(abp+2);
2381 break;
2382 }
2383 else if (*abp == ASN_FADL_OTHER ) { /* possibly here for seqid without name */
2384 abp = get_asn_seqid_other(abp+2, gi_p, acc, acc_len, name, name_len);
2385 }
2386 else {
2387 /* fprintf(stderr, " Error - missing ASN.1 %2x:%2x:%2x:%2x\n",
2388 abp[-2],abp[-1],abp[0],abp[1]); */
2389 abp = get_asn_junk(abp);
2390 /* something is broken, give up */
2391 break;
2392 }
2393 }
2394 while (abp < asn_max && *abp == '\0' ) abp++;
2395 if (abp >= asn_max) return NULL;
2396 else return abp;
2397 }
2398
2399
2400 void
parse_pal(char * dname,char * msk_name,int * oid_seqs,int * max_oid,FILE * fd)2401 parse_pal(char *dname, char *msk_name,
2402 int *oid_seqs, int *max_oid,
2403 FILE *fd) {
2404
2405 char line[MAX_STR];
2406
2407 while (fgets(line,sizeof(line),fd)) {
2408 if (line[0] == '#') continue;
2409
2410 if (strncmp(line, "DBLIST", 6)==0) {
2411 sscanf(line+7,"%s",dname);
2412 }
2413 else if (strncmp(line, "OIDLIST", 7)==0) {
2414 sscanf(line+8,"%s",msk_name);
2415 }
2416 else if (strncmp(line, "NSEQ", 4)==0) {
2417 sscanf(line+5,"%d",oid_seqs);
2418 }
2419 else if (strncmp(line, "MAXOID", 6)==0) {
2420 sscanf(line+7,"%d",max_oid);
2421 }
2422 }
2423 }
2424
2425 /* part of new ambiguity code */
readMFILE(void * buffer,size_t size,int nitems,struct lmf_str * m_fd)2426 int readMFILE (void *buffer, size_t size, int nitems, struct lmf_str *m_fd) {
2427 register size_t diff, len;
2428
2429 if (m_fd == NULL)
2430 return 0;
2431
2432 if (m_fd->mm_flg) {
2433 len = size * nitems;
2434 memcpy((void *) buffer, (void *) m_fd->mmap_addr, len);
2435 m_fd->mmap_addr += len;
2436 return nitems;
2437 }
2438 else {
2439 return fread((unsigned char *)buffer, size, nitems, m_fd->libf);
2440 }
2441 }
2442