1 /*	ncbl2_lib.c	functions to read ncbi-blast format files from
2 			formatdb (blast2.0 format files)
3 */
4 
5 /* copyright (c) 2006, 2014 by William R. Pearson and
6    The Rector and Visitors of the University of Virginia */
7 
8 /* Licensed under the Apache License, Version 2.0 (the "License");
9    you may not use this file except in compliance with the License.
10    You may obtain a copy of the License at
11 
12    http://www.apache.org/licenses/LICENSE-2.0
13 
14    Unless required by applicable law or agreed to in writing,
15    software distributed under this License is distributed on an "AS
16    IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
17    express or implied.  See the License for the specific language
18    governing permissions and limitations under the License.
19 */
20 
21 /* Updated 22-Aug-2014 to include
22 
23    ambiguity-decoding code from
24 
25      Ralf Jost, Dipl.-Inform.
26      Director, Technical Bioinformatics
27      Biomax Informatics AG
28      ralf.jost@biomax.com
29 
30      using code from NCBI Blast distribution
31 */
32 
33 /* $Name:  $ - $Id: ncbl2_mlib.c 1291 2014-08-28 18:32:58Z wrp $ */
34 
35 /* to turn on mmap()ing for Blast2 files: */
36 
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <limits.h>
43 #include <fcntl.h>
44 #ifdef UNIX
45 #include <unistd.h>
46 #endif
47 #include <errno.h>
48 
49 
50 /* ****************************************************************
51 
52 17-May-2006
53 
54 Modified to read NCBI .[np]al and .msk files.  The .nal or .pal file
55 provides a way to read sequences from a list of files.  The .msk file
56 provides a compact way of indicating the subset of sequences in a
57 larger database (typically nr or nt) that comprise a smaller database
58 (e.g. swissprot or pdbaa).  A .pal file (e.g. swissprot.00.pal) that
59 uses a .msk file has the form:
60 
61 	# Alias file generated by genmask
62 	# Date created: Mon Apr 10 11:24:05 2006
63 	#
64 	TITLE     Non-redundant SwissProt sequences
65 	DBLIST    nr.00
66 	OIDLIST   swissprot.00.msk
67 	LENGTH    74351250
68 	NSEQ      198346
69 	MAXOID    2617347
70 	MEMB_BIT 1
71 	# end of the file
72 
73 To work with this file, we must first load the nr.00 file, and then
74 read the swissprot.00.msk file, and then scan all the entries in the
75 swissprot.00.msk file (which are packed 32 mask-bit to an int) to
76 determine whether a specific libpos index entry is present in the
77 subset database.
78 
79 **************************************************************** */
80 
81 
82 /* ****************************************************************
83 This code reads NCBI Blast2 format databases from formatdb version 3 and 4
84 
85 (From NCBI) This section describes the format of the databases.
86 
87 Formatdb creates three main files for proteins containing indices,
88 sequences, and headers with the extensions, respectively, of pin, psq,
89 and phr (for nucleotides these are nin, nsq, and nhr).  A number of
90 other ISAM indices are created, but these are described elsewhere.
91 
92 FORMAT OF THE INDEX FILE
93 ------------------------
94 
95 1.) formatdb version number 	[4 bytes].
96 
97 2.) protein dump flag (1 for a protein database, 0 for a nucleotide
98     database) [4 bytes].
99 
100 3.) length of the database title in bytes	[4 bytes].
101 4.) the database title		[length given in 3.)].
102 5.) length of the date/time string	[4 bytes].
103 6.) the date/time string	[length given in 5.)].
104 7.) the number of sequences in the database	[4 bytes].
105 8.) the total length of the database in residues/basepairs	[4 bytes].
106 9.) the length of the longest sequence in the database 		[4 bytes].
107 
108 10.) a list of the offsets for definitions (one for each sequence) in
109 the header file.  There are num_of_seq+1 of these, where num_of_seq is
110 the number of sequences given in 7.).
111 
112 11.) a list of the offsets for sequences (one for each sequence) in
113 the sequence file.  There are num_of_seq+1 of these, where num_of_seq
114 is the number of sequences given in 7.).
115 
116 12.) a list of the offsets for the ambiguity characters (one for each
117 sequence) in the sequence file.  This list is only present for
118 nucleotide databases and, since the database is compressed 4/1 for
119 nucleotides, allows the ambiguity characters to be restored when the
120 sequence is generated.  There are num_of_seq+1 of these, where
121 num_of_seq is the number of sequences given in 7.).
122 
123 
124 FORMAT OF THE SEQUENCE FILE
125 ---------------------------
126 
127 There are different formats for the protein and nucleotide sequence files.
128 
129 The protein sequence files is quite simple.  The first byte in the
130 file is a NULL byte, followed by the sequence in ncbistdaa format
131 (described in the NCBI Software Development Toolkit documentation).
132 Following the sequence is another NULL byte, followed by the next
133 sequence.  The file ends with a NULL byte, following the last
134 sequence.
135 
136 The nucleotide sequence file contains the nucleotide sequence, with
137 four basepairs compressed into one byte.  The format used is NCBI2na,
138 documented in the NCBI Software Development Toolkit manual.  Any
139 ambiguity characters present in the original sequence are replaced at
140 random by A, C, G or T.  The true value of ambiguity characters are
141 stored at the end of each sequence to allow true reproduction of the
142 original sequence.
143 
144 FORMAT OF THE HEADER FILE  (formatdb version 3)
145 -------------------------
146 
147 The format of the header file depends on whether or not the identifiers in the
148 original file were parsed or not.  For the case that they were not, then each
149 entry has the format:
150 
151 gnl|BL_ORD_ID|entry_number my favorite yeast sequence...
152 
153 Here entry_number gives the ordinal number of the sequence in the
154 database (with zero offset).  The identifier
155 gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify
156 the entry, if the user has not provided another identifier.  If the
157 identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by
158 the correct identifier, as described in
159 ftp://ncbi.nlm.nih.gov/blast/db/README .
160 
161 There are no separators between these deflines.
162 
163 For formatdb version 4, the header file contains blast ASN.1 binary
164 deflines, which can parsed with parse_fastadl_asn().
165 
166 FORMAT OF THE .MSK FILE
167 -----------------------
168 
169 The .msk file is simply a packed list of masks for formatdb "oids" for
170 some other file (typically nr).  The first value is the last oid
171 available; the remainder are packed 32 oids/mask, so that the number
172 of masks is 1/32 the number of sequences in the file.
173 
174 **************************************************************** */
175 
176 #ifdef USE_MMAP
177 #include <sys/types.h>
178 #include <sys/stat.h>
179 #include <sys/mman.h>
180 #ifdef IBM_AIX
181 #include <fcntl.h>
182 #else
183 #include <sys/fcntl.h>
184 #endif
185 #endif
186 
187 #ifdef USE_MMAP
188 #ifndef MAP_FILE
189 #define MAP_FILE 0
190 #endif
191 #endif
192 
193 #ifdef UNIX
194 #define RBSTR "r"
195 #else
196 #define RBSTR "rb"
197 #endif
198 
199 #ifdef WIN32
200 #define SLASH_CHAR '\\'
201 #define SLASH_STR "\\"
202 #else
203 #define SLASH_CHAR '/'
204 #define SLASH_STR "/"
205 #endif
206 
207 #define XTERNAL
208 #include "uascii.h"
209 
210 #define XTERNAL
211 #include "upam.h"
212 #include "ncbl2_head.h"
213 
214 #include "defs.h"
215 #include "structs.h"
216 #include "mm_file.h"
217 
218 #define MAX_FADL_ACC_LEN 64
219 
220 unsigned int bl2_uint4_cvt(unsigned int);
221 unsigned int bl2_long4_cvt(long);
222 uint64_t bl2_long8_cvt(uint64_t);
223 void src_int4_read(FILE *fd,  int *valp);
224 void src_uint4_read(FILE *fd,  unsigned int *valp);
225 void src_long4_read(FILE *fd,  long *valp);
226 void src_long8_read(FILE *fd,  int64_t *val);
227 void ncbi_long8_read(FILE *fd,  int64_t *valp);
228 void src_char_read(FILE *fd,  char *valp);
229 unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
230 				 int *gi_p, int *db, char *acc,  size_t acc_len,
231 				 char *name, size_t name_len,
232 				 char *title, size_t t_len, int *taxid);
233 
234 /* nt_btoa maps  from blast 2bit  format to ascii  characters */
235 static char nt_btoa[5] = {"ACGT"};
236 
237 static char *aa_b2toa= "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ";	/* NCBIstdaa */
238 
239 static int aa_btof[32];	/* maps to fasta alphabet */
240 static int aa_btof_null = 0;
241 
242 static int dbtype, dbformat, amb_cnt;
243 
244 #define NCBIBL20 12
245 
246 struct lmf_str *load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype);
247 
248 int ncbl2_get_mmap_chain_o(struct seqr_chain *cur_seqr_chain,
249 			   struct lmf_str *m_fd, struct db_str *db);
250 
251 int ncbl2_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
252 			 struct lmf_str *m_fd, struct db_str *db);
253 
254 int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
255 int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
256 
257 int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
258 int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
259 
260 void newname(char *, char *, char *, int);
261 void parse_pal(char *, char *, int *, int *, FILE *);
262 
263 int readMFILE (void *buffer, size_t size, int nitems, struct lmf_str *m_fd);
264 
265 void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);
266 
267 /* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format
268    file.  Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can
269    be used to read the database. */
270 
271 struct lmf_str *
ncbl2_openlib(struct lib_struct * lib_p,int ldnaseq)272 ncbl2_openlib(struct lib_struct *lib_p,  int ldnaseq)
273 {
274   char lname[256];	/* .pal, .nal file name */
275   char dname[256];	/* .pin, .nin file for files included from .msk files */
276   char msk_name[256];	/* .msk file name */
277   char hname[256];	/* .phr, .nhr */
278   char sname[256];	/* .psq, .nsq */
279   char tname[256];	/* .pin, .nin file */
280   char db_dir[256];	/*  directory where all the files live */
281   int pref_db= -1;	/*  right now, only swissprot.pal, pdbaa.pal
282 			    are used for masked OID files */
283   char *bp;
284   int oid_seqs, max_oid, have_oid_list;
285   int oid_cnt, oid_len;
286   unsigned int *oid_list, o_max;
287   int tmp;
288   int i;
289 #ifdef USE_MMAP
290   struct stat statbuf;
291 #endif
292   FILE *ifile;	/* index offsets, also DB info */
293   struct lmf_str *m_fptr;
294 
295   /* this function should be reorganized
296   (1) check to see if there is a .nal/.pal file before doing things
297       with names, since the names might be wrong.
298   (2) if there is a .nal/.pal file, check to see if it has a DBLIST
299       (later), if it does, then modify the lib_p->next chain
300       if it does not, then generate the appropriate file names, and
301       read the oid list
302   (3) otherwise (no .nal/.pal file), generate the file names
303 
304 
305   */
306 
307   if (ldnaseq==SEQT_PROT) {
308     newname(lname,lib_p->file_name,AA_LIST_EXT,(int)sizeof(lname));	/* .pal */
309   }
310   else {
311     newname(lname,lib_p->file_name,NT_LIST_EXT,(int)sizeof(lname));	/* .nal */
312   }
313 
314   /* check for a .nal/.pal OID list file */
315   max_oid = oid_seqs = 0;
316   oid_list = NULL;
317 
318   /* here, we check for a .pal/.nal file by trying to open it */
319   if ((ifile = fopen(lname,"r"))!=NULL) {
320 
321     if ((bp = strrchr(lib_p->file_name,SLASH_CHAR))!=NULL) {
322       *bp = '\0';
323       SAFE_STRNCPY(db_dir,lib_p->file_name,sizeof(db_dir));
324       SAFE_STRNCAT(db_dir,SLASH_STR,sizeof(db_dir));
325       *bp = SLASH_CHAR;
326     }
327     else {
328       db_dir[0]='\0';
329     }
330 
331     /* we have a list file, we need to parse it */
332     parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
333     fclose(ifile);
334 
335     pref_db = -1;
336 
337     if (oid_seqs > 0) {
338 
339       have_oid_list = 1;
340       /* get the pref_db before adding the directory */
341       if (strncmp(msk_name,"swissprot",9)==0) {
342 	pref_db = 7;
343       }
344       else if (strncmp(msk_name,"pdbaa",5)==0) {
345 	pref_db = 14;
346       }
347 
348       /* need to add directory to both dname and msk_name */
349       SAFE_STRNCPY(tname,db_dir,sizeof(tname));
350       SAFE_STRNCAT(tname,msk_name, sizeof(tname));
351       SAFE_STRNCPY(msk_name, tname, sizeof(msk_name));
352 
353       SAFE_STRNCPY(tname,db_dir,sizeof(tname));
354       SAFE_STRNCAT(tname,dname, sizeof(tname));
355       SAFE_STRNCPY(dname,tname,sizeof(dname));
356 
357       if (ldnaseq == SEQT_PROT) {
358 	newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
359 	newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
360 	newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
361       }
362       else {	/* reading DNA library */
363 	newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
364 	newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
365 	newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
366       }
367       /* now load the oid file */
368       if ((ifile = fopen(msk_name,RBSTR))==NULL) {
369 	fprintf(stderr,"error - cannot load %s file\n",msk_name);
370 	return NULL;
371       }
372       else {
373 	src_uint4_read(ifile,&o_max);
374 	if (o_max != max_oid) {
375 	  fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max);
376 	}
377 	oid_len = (max_oid/32+1);
378 	if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
379 	  fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
380 	  return NULL;
381 	}
382 	if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
383 	  fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
384 	  return NULL;
385 	}
386 	fclose(ifile);
387       }
388     }
389 #ifdef DEBUG
390     else {	/* we had a .msk file, but there are no oid's in
391 		   it. */
392       fprintf(stderr," *** WARNING -- found .pal/.nal file %s with no OIDs\n",lname);
393       return NULL;
394     }
395 #endif
396   }
397   else { /* else no OID/.msk file -- generate the names */
398     have_oid_list = 0;
399 
400     /* initialize file names */
401     if (ldnaseq==SEQT_PROT) {	/* read a protein database */
402       newname(tname,lib_p->file_name,AA_INDEX_EXT,(int)sizeof(tname));	/* .pin */
403       newname(hname,lib_p->file_name,AA_HEADER_EXT,(int)sizeof(hname));	/* .phr */
404       newname(sname,lib_p->file_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .psq */
405 
406     }
407     else {	/* reading DNA library */
408       newname(tname,lib_p->file_name,NT_INDEX_EXT,(int)sizeof(tname));	/* .nin */
409       newname(hname,lib_p->file_name,NT_HEADER_EXT,(int)sizeof(hname));	/* .nhr */
410       newname(sname,lib_p->file_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .nsq */
411     }
412   }
413 
414 
415   if (ldnaseq == SEQT_PROT) {
416     /* initialize map of BLAST2 amino acids to FASTA amino acids */
417     for (i=0; aa_b2toa[i]; i++) {
418       if ((tmp=aascii[aa_b2toa[i]])<NA) {
419 	aa_btof[i]=tmp;
420 	if (aa_b2toa[i] == 'O' || aa_b2toa[i] == 'o') aa_btof[i] = aascii['K'];
421 	else if (aa_b2toa[i] == 'U' || aa_b2toa[i] == 'u') aa_btof[i] = aascii['C'];
422       }
423       else if (aa_b2toa[i]=='*') aa_btof[i]=aascii['X'];
424       else aa_btof[i]=0;
425 /*    else aa_btof[i]=aascii['X']; */
426     }
427 
428     /* check to see if aa_btof[] actually does anything interesting */
429     aa_btof_null = 1;
430     for (i=0; i<sizeof(aa_b2toa); i++) {
431       if (i != aa_btof[i]) {
432 #ifdef DEBUG
433 	fprintf(stderr," difference at: i: %d [%c] != aa_btof[i]: %d [%c]\n",
434 		i,aa_b2toa[i],aa_btof[i],NCBIstdaa[aa_btof[i]]);
435 #endif
436 	aa_btof_null = 0;
437       }
438     }
439   }	/* else no OID/.msk file */
440 
441 
442   /* now we have all the file names, open the files and read the data */
443   /* open the index/header file, and read the sequence type info  */
444   if ((ifile = fopen(tname,RBSTR))==NULL) {
445     fprintf(stderr," cannot open %s (%s) INDEX file",tname,lib_p->file_name);
446     perror("...");
447     return NULL;
448   }
449   src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
450   src_uint4_read(ifile,(unsigned *)&dbtype);   /* get 1 for protein/0 DNA */
451 
452   if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) {
453     fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n",
454 	    tname,dbformat,FORMATDBV3);
455     return NULL;
456   }
457 
458   if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) ||
459       (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) {
460     fprintf(stderr,"error - %s wrong format (%d/%d)\n",
461 	    tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT));
462     return NULL;
463   }
464 
465   /* the files are there - allocate lmf_str */
466   if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
467     fprintf(stderr," cannot allocate lmf_str\n");
468     return NULL;
469   }
470 
471   m_fptr->lib_aa = (ldnaseq == 0);
472   m_fptr->tmp_buf_max = 4096;
473   if ((m_fptr->tmp_buf=
474        (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
475     fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
476     return NULL;
477   }
478 
479   /* load the oid info */
480   m_fptr->have_oid_list = have_oid_list;
481   m_fptr->max_oid = max_oid;
482   m_fptr->oid_seqs = oid_seqs;
483   m_fptr->oid_list = oid_list;
484   m_fptr->pref_db= pref_db;
485   m_fptr->get_mmap_chain = NULL;
486 
487   /* open the header file */
488   if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) {
489     fprintf(stderr," cannot open %s header file\n",hname);
490     goto error_r;
491   }
492 
493   /* ncbl2_ranlib is used for all BLAST2.0 access */
494   m_fptr->ranlib = ncbl2_ranlib;
495   m_fptr->bl_format_ver = dbformat;
496   m_fptr->lb_type = NCBIBL20;
497   m_fptr->libf = NULL;
498 
499   if (ldnaseq==SEQT_DNA) {
500     if (oid_seqs > 0) {
501       m_fptr->getlib = ncbl2_getlibn_o;
502     }
503     else {
504       m_fptr->getlib = ncbl2_getlibn;
505     }
506     m_fptr->sascii = nascii;
507   }
508   else {
509     if (oid_seqs > 0) {
510       m_fptr->getlib = ncbl2_getliba_o;
511       m_fptr->get_mmap_chain = ncbl2_get_mmap_chain_o;
512     }
513     else {
514       m_fptr->getlib = ncbl2_getliba;
515       m_fptr->get_mmap_chain = ncbl2_get_mmap_chain;
516     }
517     m_fptr->sascii = aascii;
518   }
519   m_fptr->lb_name = lib_p->file_name;
520 
521   /* open the sequence file */
522 
523 #if defined (USE_MMAP)
524   m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
525   if (!m_fptr->mm_flg) {
526     fprintf(stderr," cannot open %s",sname);
527     perror("...");
528   }
529   else {
530     if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
531       fprintf(stderr," cannot fstat %s",sname);
532       perror("...");
533       m_fptr->mm_flg = 0;
534     }
535     else {
536       m_fptr->st_size = statbuf.st_size;
537       if((m_fptr->mmap_base =
538 	  mmap(NULL, m_fptr->st_size, PROT_READ,
539 	       MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
540 	fprintf(stderr," cannot mmap %s",sname);
541 	perror("...");
542 	m_fptr->mm_flg = 0;
543       }
544       else {
545 	m_fptr->mmap_addr = m_fptr->mmap_base;
546 	m_fptr->mm_flg = 1;
547       }
548     }
549     /* regardless, close the open()ed version */
550     close(m_fptr->mmap_fd);
551   }
552 #else
553   m_fptr->mm_flg = 0;
554 #endif
555 
556   if  (!m_fptr->mm_flg) {
557     if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) {
558       fprintf(stderr," cannot open %s sequence file",sname);
559       perror("...");
560       goto error_r;
561     }
562   }
563 
564 /* all files should be open -- the rest of the work can be done by a
565    function common to ncbl2_openlib() and ncbl2_reopen()
566 */
567 
568   return load_ncbl2(m_fptr, ifile, dbformat, dbtype);
569 
570  error_r:
571   /* here if failure after m_fptr allocated */
572   free(m_fptr);
573   return NULL;
574 }
575 
576 
577 /* **************************************************************** */
578 /* re-open an already opened library file using information in m_fptr
579 
580    a valid m_fptr guarantees that the necessary files exist, and we
581    know whether there is an OID file.  m_fptr's are NEVER used for
582    file lists, only for files with sequence data
583 */
584 /* **************************************************************** */
585 
ncbl2_reopen(struct lmf_str * m_fptr)586 struct lmf_str *ncbl2_reopen(struct lmf_str *m_fptr) {
587   char lname[256];	/* .pal, .nal file name */
588   char dname[256];	/* .pin, .nin file for files included from .msk files */
589   char msk_name[256];	/* .msk file name */
590   char hname[256];	/* .phr, .nhr */
591   char sname[256];	/* .psq, .nsq */
592   char tname[256];	/* .pin, .nin file */
593   char db_dir[256];	/*  directory where all the files live */
594   int pref_db= -1;	/*  right now, only swissprot.pal, pdbaa.pal
595 			    are used for masked OID files */
596   char *bp;
597   int oid_seqs, max_oid, have_oid_list;
598   int oid_cnt, oid_len;
599   unsigned int *oid_list, o_max;
600 #ifdef USE_MMAP
601   struct stat statbuf;
602 #endif
603   FILE *ifile;	/* index offsets, also DB info */
604 
605   /* its not open, but its being re-used, so re-initialize things */
606   m_fptr->libf = NULL;
607   m_fptr->mmap_fd = -1;
608   m_fptr->mm_flg = 0;
609 
610   /* if we have an oid list, open it, read it, and use it to get the file names */
611   /* check for a .nal/.pal OID list file */
612   max_oid = oid_seqs = 0;
613   oid_list = NULL;
614 
615   if (m_fptr->have_oid_list) {
616     if (m_fptr->lib_aa==1) {
617       newname(lname,m_fptr->lb_name,AA_LIST_EXT,(int)sizeof(lname));	/* .pal */
618     }
619     else {
620       newname(lname,m_fptr->lb_name,NT_LIST_EXT,(int)sizeof(lname));	/* .nal */
621     }
622 
623     ifile = fopen(lname,"r");	/* it has to open, it did before */
624     if ((bp = strrchr(m_fptr->lb_name,SLASH_CHAR))!=NULL) {
625       *bp = '\0';
626       SAFE_STRNCPY(db_dir,m_fptr->lb_name,sizeof(db_dir));
627       SAFE_STRNCAT(db_dir,SLASH_STR,sizeof(db_dir));
628       *bp = SLASH_CHAR;
629     }
630     else {
631       db_dir[0]='\0';
632     }
633 
634     /* we have a list file, we need to parse it */
635     parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
636     fclose(ifile);
637 
638     /* we have read the .pal/.nal file, now deal with the .msk file */
639 
640     pref_db = -1;
641 
642     /* get the pref_db before adding the directory */
643     if (strncmp(msk_name,"swissprot",9)==0) {
644       pref_db = 7;
645     }
646     else if (strncmp(msk_name,"pdbaa",5)==0) {
647       pref_db = 14;
648     }
649 
650     /* need to add directory to both dname and msk_name */
651     SAFE_STRNCPY(tname,db_dir,sizeof(tname));
652     SAFE_STRNCAT(tname,msk_name, sizeof(tname));
653     SAFE_STRNCPY(msk_name, tname, sizeof(msk_name));
654 
655     SAFE_STRNCPY(tname,db_dir,sizeof(tname));
656     SAFE_STRNCAT(tname,dname, sizeof(tname));
657     SAFE_STRNCPY(dname,tname,sizeof(dname));
658 
659     if (m_fptr->lib_aa) {
660       newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
661       newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
662       newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
663     }
664     else {	/* reading DNA library */
665       newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
666       newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
667       newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
668     }
669 
670     ifile = fopen(msk_name,RBSTR);
671     src_uint4_read(ifile,&o_max);
672     oid_len = (max_oid/32+1);
673     if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
674       fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
675       return NULL;
676     }
677     if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
678       fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
679       return NULL;
680     }
681     fclose(ifile);
682   }
683   else { /* else no OID/.msk file -- generate the names */
684     /* initialize file names */
685     if (m_fptr->lib_aa) {	/* read a protein database */
686       newname(tname,m_fptr->lb_name,AA_INDEX_EXT,(int)sizeof(tname));	/* .pin */
687       newname(hname,m_fptr->lb_name,AA_HEADER_EXT,(int)sizeof(hname));	/* .phr */
688       newname(sname,m_fptr->lb_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .psq */
689 
690     }
691     else {	/* reading DNA library */
692       newname(tname,m_fptr->lb_name,NT_INDEX_EXT,(int)sizeof(tname));	/* .nin */
693       newname(hname,m_fptr->lb_name,NT_HEADER_EXT,(int)sizeof(hname));	/* .nhr */
694       newname(sname,m_fptr->lb_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .nsq */
695     }
696   }
697 
698   /* now we have all the file names, open the files and read the data */
699   /* open the index/header file, and read the sequence type info  */
700   if ((ifile = fopen(tname,RBSTR))==NULL) {
701     fprintf(stderr," cannot open %s (%s) INDEX file",tname,m_fptr->lb_name);
702     perror("...");
703     return NULL;
704   }
705   src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
706   src_uint4_read(ifile,(unsigned *)&dbtype);   /* get 1 for protein/0 DNA */
707 
708   m_fptr->tmp_buf_max = 4096;
709   if ((m_fptr->tmp_buf=
710        (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
711     fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
712     return NULL;
713   }
714 
715   /* load the oid info */
716   m_fptr->max_oid = max_oid;
717   m_fptr->oid_seqs = oid_seqs;
718   m_fptr->oid_list = oid_list;
719   m_fptr->pref_db= pref_db;
720   m_fptr->get_mmap_chain = NULL;
721 
722   /* open the header file */
723   m_fptr->hfile = fopen(hname,RBSTR);
724 
725   /* ncbl2_ranlib is used for all BLAST2.0 access */
726   m_fptr->ranlib = ncbl2_ranlib;
727   m_fptr->bl_format_ver = dbformat;
728 
729   if (!m_fptr->lib_aa) {
730     if (oid_seqs > 0) {
731       m_fptr->getlib = ncbl2_getlibn_o;
732     }
733     else {
734       m_fptr->getlib = ncbl2_getlibn;
735     }
736     m_fptr->sascii = nascii;
737   }
738   else {
739     if (oid_seqs > 0) { m_fptr->getlib = ncbl2_getliba_o; }
740     else { m_fptr->getlib = ncbl2_getliba; }
741     m_fptr->sascii = aascii;
742   }
743 
744   /* open the sequence file */
745 
746 #if defined (USE_MMAP)
747   m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
748   if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
749     fprintf(stderr," cannot fstat %s",sname);
750     perror("...");
751     m_fptr->mm_flg = 0;
752   }
753   else {
754     m_fptr->st_size = statbuf.st_size;
755     if((m_fptr->mmap_base =
756 	mmap(NULL, m_fptr->st_size, PROT_READ,
757 	     MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
758       fprintf(stderr," cannot mmap %s",sname);
759       perror("...");
760       m_fptr->mm_flg = 0;
761     }
762     else {
763       m_fptr->mmap_addr = m_fptr->mmap_base;
764       m_fptr->mm_flg = 1;
765     }
766   }
767   /* regardless, close the open()ed version */
768   close(m_fptr->mmap_fd);
769 #else
770   m_fptr->mm_flg = 0;
771 #endif
772 
773   if  (!m_fptr->mm_flg) {
774     m_fptr->libf = fopen(sname,RBSTR);
775     if (!m_fptr->libf) {
776       fprintf(stderr," cannot open %s\n",sname);
777       return NULL;
778     }
779     m_fptr->mm_flg = 0;
780   }
781 
782   return load_ncbl2(m_fptr, ifile, dbformat, dbtype);
783 }
784 
785 struct lmf_str
load_ncbl2(struct lmf_str * m_fptr,FILE * ifile,int dbformat,int dbtype)786 *load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype) {
787   int title_len;
788   char *title_str=NULL;
789   int date_len;
790   char *date_str=NULL;
791   long ltmp;
792   int64_t l8tmp;
793   int i, tmp;
794   unsigned int *f_pos_arr;
795 
796   src_uint4_read(ifile,(unsigned *)&title_len);
797 
798   if (title_len > 0) {
799     if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) {
800       fprintf(stderr," cannot allocate title string (%d)\n",title_len);
801       goto error_r;
802     }
803     fread(title_str,(size_t)1,(size_t)title_len,ifile);
804   }
805 
806   src_uint4_read(ifile,(unsigned *)&date_len);
807 
808   if (date_len > 0) {
809     if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) {
810       fprintf(stderr," cannot allocate date string (%d)\n",date_len);
811       goto error_r;
812     }
813     fread(date_str,(size_t)1,(size_t)date_len,ifile);
814   }
815 
816   m_fptr->lpos = 0;
817   src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt);
818 
819   if (dbformat == FORMATDBV3) {
820     src_long4_read(ifile,&ltmp);
821     m_fptr->tot_len = ltmp;
822   }
823   else {
824     ncbi_long8_read(ifile,&l8tmp);
825     m_fptr->tot_len = l8tmp;
826   }
827 
828   src_long4_read(ifile,&ltmp);
829   m_fptr->max_len = ltmp;
830 
831   /* currently we are not using this information, but perhaps later */
832   if (title_str!=NULL) free(title_str);
833   if (date_str!=NULL) free(date_str);
834 
835 #ifdef DEBUG
836     fprintf(stderr,"%s format: BL2 (%s)  max_cnt: %d, totlen: %lld, maxlen %ld\n",
837 	    m_fptr->lb_name,m_fptr->mm_flg ? "mmap" : "fopen",
838 	    m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len);
839 #endif
840 
841   /* allocate and read hdr indexes */
842   if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) {
843       fprintf(stderr," cannot allocate tmp header pointers\n");
844       goto error_r;
845     }
846 
847   if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
848       fprintf(stderr," cannot allocate header pointers\n");
849       goto error_r;
850     }
851 
852   /* allocate and read sequence offsets */
853   if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
854       fprintf(stderr," cannot allocate sequence pointers\n");
855       goto error_r;
856     }
857 
858   if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
859     fprintf(stderr," error reading hdr offsets: %s\n",m_fptr->lb_name);
860     goto error_r;
861   }
862 
863   for (i=0; i<=m_fptr->max_cnt; i++)
864 #ifdef IS_BIG_ENDIAN
865     m_fptr->d_pos_arr[i] = f_pos_arr[i];
866 #else
867     m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
868 #endif
869 
870   if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
871     fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
872     goto error_r;
873   }
874   for (i=0; i<=m_fptr->max_cnt; i++) {
875 #ifdef IS_BIG_ENDIAN
876     m_fptr->s_pos_arr[i] = f_pos_arr[i];
877 #else
878     m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
879 #endif
880   }
881 
882   if (dbtype == NTFORMAT) {
883     /* allocate and ambiguity  offsets */
884     if ((m_fptr->a_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
885       fprintf(stderr," cannot allocate sequence pointers\n");
886       goto error_r;
887     }
888 
889     /*
890     for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->a_pos_arr[i]);
891     */
892 
893     if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
894       fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
895       goto error_r;
896     }
897     for (i=0; i<=m_fptr->max_cnt; i++) {
898 #ifdef IS_BIG_ENDIAN
899       m_fptr->a_pos_arr[i] = f_pos_arr[i];
900 #else
901       m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
902 #endif
903     }
904   }
905 
906   /*
907   for (i=0; i < min(m_fptr->max_cnt,10); i++) {
908     fprintf(stderr,"%d: %d %d %d\n",i,m_fptr->s_pos_arr[i],m_fptr->a_pos_arr[i],m_fptr->d_pos_arr[i]);
909   }
910   */
911 
912   /* all done with ifile, close it */
913   fclose(ifile);
914   free(f_pos_arr);
915 
916   if (!m_fptr->mm_flg) {
917     tmp = fgetc(m_fptr->libf);
918     if (tmp!=NULLB)
919       fprintf(stderr," phase error: %d:%d found\n",0,tmp);
920   }
921 
922   m_fptr->bl_lib_pos = 1;
923   amb_cnt = 0;
924 
925   return m_fptr;
926 
927  error_r:
928   /* here if failure after m_fptr allocated */
929   free(m_fptr);
930   return NULL;
931 }
932 
933 /* **************************************************************** */
934 /* close the library, free s_pos_arr, a_pos_arr, but save file info */
935 /* **************************************************************** */
936 
ncbl2_closelib(struct lmf_str * m_fptr)937 void ncbl2_closelib(struct lmf_str *m_fptr)
938 {
939 
940   if (m_fptr->tmp_buf != NULL) {
941     free(m_fptr->tmp_buf);
942     m_fptr->tmp_buf = NULL;
943     m_fptr->tmp_buf_max = 0;
944   }
945 
946   if (m_fptr->s_pos_arr !=NULL) {
947     free(m_fptr->s_pos_arr);
948     m_fptr->s_pos_arr = NULL;
949   }
950   if (m_fptr->a_pos_arr!=NULL) {
951     free(m_fptr->a_pos_arr);
952     m_fptr->a_pos_arr = NULL;
953   }
954 
955   if (m_fptr->hfile !=NULL ) {
956     fclose(m_fptr->hfile);
957     m_fptr->hfile=NULL;
958     free(m_fptr->d_pos_arr);
959     m_fptr->d_pos_arr = NULL;
960   }
961 
962   if (m_fptr->oid_list != NULL) {
963     free(m_fptr->oid_list);
964     m_fptr->oid_list = NULL;
965     m_fptr->oid_seqs = m_fptr->max_oid = 0;
966   }
967 
968 #ifdef use_mmap
969   if (m_fptr->mm_flg) {
970     munmap(m_fptr->mmap_base,m_fptr->st_size);
971     m_fptr->mmap_fd = -1;
972   }
973   else
974 #endif
975     if (m_fptr->libf !=NULL ) {
976       fclose(m_fptr->libf);
977       m_fptr->libf=NULL;
978     }
979 
980   m_fptr->mm_flg = 0;
981 }
982 
983 /* **************************************************************** */
984 /* read a protein sequence using OID offsets */
985 /* **************************************************************** */
986 
987 int
ncbl2_getliba_o(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)988 ncbl2_getliba_o(unsigned char *seq,
989 		int maxs,
990 		char *libstr,
991 		int n_libstr,
992 		fseek_t *libpos,
993 		int *lcont,
994 		struct lmf_str *m_fd,
995 		long *l_off)
996 {
997   int tpos;
998   unsigned int t_mask, t_shift, oid_mask;
999 
1000   /* get to the next valid pointer */
1001 
1002   for ( tpos = m_fd->lpos ;tpos <= m_fd->max_oid; tpos++) {
1003     t_mask = tpos / 32;
1004     t_shift = 31 - (tpos % 32);
1005     if ((oid_mask = m_fd->oid_list[t_mask])==0) {  continue; }
1006 
1007     if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
1008       if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
1009       m_fd->lpos = tpos;	/* already bumped up */
1010       m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
1011       return ncbl2_getliba(seq, maxs, libstr, n_libstr,
1012 			   libpos, lcont, m_fd, l_off);
1013     }
1014   }
1015   return -1;
1016 }
1017 
1018 /* **************************************************************** */
1019 /* read a protein sequence */
1020 /* **************************************************************** */
1021 
1022 int
ncbl2_getliba(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)1023 ncbl2_getliba(unsigned char *seq,
1024 	      int maxs,
1025 	      char *libstr,
1026 	      int n_libstr,
1027 	      fseek_t *libpos,
1028 	      int *lcont,
1029 	      struct lmf_str *m_fd,
1030 	      long *l_off)
1031 {
1032   unsigned char *sptr, *dptr;
1033   int s_chunk, d_len, lib_cnt;
1034   long seqcnt;
1035   long tmp;
1036   static long seq_len;
1037 #if defined(DEBUG) || defined(PCOMPLIB)
1038   int gi, my_db, taxid;
1039   char acc[MAX_FADL_ACC_LEN], title[MAX_UID], name[MAX_FADL_ACC_LEN];
1040 #endif
1041 
1042   *l_off = 1;
1043 
1044   lib_cnt = m_fd->lpos;
1045   *libpos = (fseek_t)m_fd->lpos;
1046 
1047   if (*lcont==0) {
1048     if (lib_cnt >= m_fd->max_cnt) return -1;	/* no more sequences */
1049     seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */
1050     if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt];
1051 #if !defined(DEBUG) && !defined(PCOMPLIB)
1052     libstr[0]='\0';
1053 #else
1054     /* get the name from the header file */
1055     fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
1056 
1057     if (m_fd->bl_format_ver == FORMATDBV3) {
1058       d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1059       fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
1060       libstr[d_len]='\0';
1061     }
1062     else {
1063       d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1064       fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
1065       parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
1066 			&gi, &my_db, acc, sizeof(acc), name, sizeof(name), title, sizeof(title), &taxid);
1067       sprintf(m_fd->tmp_buf,"gi|%d",gi);
1068       SAFE_STRNCPY(libstr,m_fd->tmp_buf,n_libstr);
1069     }
1070     libstr[n_libstr-1]='\0';
1071 #endif
1072   }
1073   if (seq_len <= maxs) { /* sequence fits */
1074     seqcnt = seq_len;
1075     m_fd->lpos++;
1076     *lcont = 0;
1077   }
1078   else {		/* doesn't fit */
1079     seqcnt = maxs-1;
1080     (*lcont)++;
1081   }
1082 
1083   if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr;
1084   else {
1085     if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) {
1086       fprintf(stderr," could not read sequence record: %lld %ld != %ld\n",
1087 	      *libpos,tmp,seq_len);
1088       goto error;
1089     }
1090     sptr = seq;
1091   }
1092   if (seq_len <= maxs) {seqcnt = --seq_len;}
1093 
1094   /* everything is ready, set up dst. pointer, seq_len */
1095   if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--;
1096   if (aa_btof_null) {
1097     memcpy(seq,sptr,seq_len);
1098   }
1099   else {
1100     dptr = seq;
1101     s_chunk = seqcnt/16;
1102     while (s_chunk-- > 0) {
1103       *dptr++ = aa_btof[*sptr++];
1104       *dptr++ = aa_btof[*sptr++];
1105       *dptr++ = aa_btof[*sptr++];
1106       *dptr++ = aa_btof[*sptr++];
1107       *dptr++ = aa_btof[*sptr++];
1108       *dptr++ = aa_btof[*sptr++];
1109       *dptr++ = aa_btof[*sptr++];
1110       *dptr++ = aa_btof[*sptr++];
1111       *dptr++ = aa_btof[*sptr++];
1112       *dptr++ = aa_btof[*sptr++];
1113       *dptr++ = aa_btof[*sptr++];
1114       *dptr++ = aa_btof[*sptr++];
1115       *dptr++ = aa_btof[*sptr++];
1116       *dptr++ = aa_btof[*sptr++];
1117       *dptr++ = aa_btof[*sptr++];
1118       *dptr++ = aa_btof[*sptr++];
1119     }
1120     while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++];
1121   }
1122 
1123   if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr;
1124 
1125   /* we didn't get it all, so reset for more */
1126   if (*lcont) seq_len -= seqcnt;
1127 
1128   seq[seqcnt]= EOSEQ;
1129   return (seqcnt);
1130 
1131 error:	fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
1132   fflush(stderr);
1133   return (-1);
1134 }
1135 
1136 /* ncbl2_mmap_getchain_o fills cur_seqr_chain with sequence pointers
1137    from the memory mapped file at *m_fd, based on oid coordinates (not
1138    contiguous)
1139 
1140    requires NCBIstdaa core encoding
1141 */
1142 int
ncbl2_get_mmap_chain_o(struct seqr_chain * cur_seqr_chain,struct lmf_str * m_fd,struct db_str * db)1143 ncbl2_get_mmap_chain_o(struct seqr_chain *cur_seqr_chain,
1144 		       struct lmf_str *m_fd, struct db_str *db)
1145 {
1146   int i;
1147   struct seq_record *seq_a, *seq_p;
1148   struct mseq_record *mseq_a, *mseq_p;
1149 
1150   int tpos;
1151   unsigned int t_mask, t_shift, oid_mask;
1152 
1153   tpos = m_fd->lpos;
1154   if (tpos > m_fd->max_oid) return EOF;
1155   seq_a = cur_seqr_chain->seqr_base;
1156   mseq_a = cur_seqr_chain->mseqr_base;
1157 
1158   for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
1159     if (tpos > m_fd->max_oid) {
1160       break;
1161     }
1162     seq_p = &seq_a[i];
1163     mseq_p = &mseq_a[i];
1164 
1165     /* now get the next valid pointer */
1166     while (tpos <= m_fd->max_oid) {
1167       t_mask = tpos / 32;
1168       t_shift = 31 - (tpos % 32);
1169       if ((oid_mask = m_fd->oid_list[t_mask])==0) {  tpos++; continue; }
1170 
1171       /* have a valid entry */
1172       if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
1173 	m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
1174 	seq_p->n1 = m_fd->s_pos_arr[tpos+1] - m_fd->s_pos_arr[tpos]-1; /* value is +1 off to get the NULL */
1175 	seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->s_pos_arr[tpos]);
1176 	seq_p->l_offset = 0;
1177 	seq_p->l_off = 1;
1178 
1179 	db->entries++;
1180 	db->length += seq_p->n1;
1181 	if (db->length > LONG_MAX) {
1182 	  db->length -= LONG_MAX; db->carry++;
1183 	}
1184 
1185 	mseq_p->m_file_p = m_fd;
1186 	mseq_p->n1tot_p=NULL;
1187 	mseq_p->cont = 0;
1188 	seq_p->index = mseq_p->index = mseq_p->lseek = tpos++;
1189 #ifndef DEBUG
1190 	mseq_p->libstr[0] = '\0';
1191 #else
1192 #endif
1193 #if DEBUG
1194 	seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
1195 #endif
1196 	break;
1197       }
1198       else {
1199 	tpos++;
1200       }
1201     }
1202   }
1203   if (i==0 && tpos > m_fd->max_oid) return EOF;
1204   m_fd->lpos = tpos;
1205   cur_seqr_chain->cur_seq_cnt = i;
1206   if (i >= m_fd->max_cnt) return EOF;
1207   else return i;
1208 }
1209 
1210 /* ncbl2_mmap_getchain fills cur_seqr_chain with sequence pointers
1211    from the memory mapped file at *m_fd
1212 
1213    because the database is opened read-only, this code only works with
1214    an amino acid mapping identical to that used by blastdbcmd, aa_b2toa[]
1215 
1216 */
1217 int
ncbl2_get_mmap_chain(struct seqr_chain * cur_seqr_chain,struct lmf_str * m_fd,struct db_str * db)1218 ncbl2_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
1219 		     struct lmf_str *m_fd, struct db_str *db) {
1220   int i, lib_cnt;
1221   struct seq_record *seq_a, *seq_p;
1222   struct mseq_record *mseq_a, *mseq_p;
1223 
1224   lib_cnt = m_fd->lpos;
1225   if (lib_cnt >= m_fd->max_cnt) return EOF;
1226   seq_a = cur_seqr_chain->seqr_base;
1227   mseq_a = cur_seqr_chain->mseqr_base;
1228 
1229   for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
1230     if (lib_cnt >= m_fd->max_cnt) break;
1231     seq_p = &seq_a[i];
1232     mseq_p = &mseq_a[i];
1233     seq_p->n1 = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]-1; /* value is +1 off to get the NULL */
1234 
1235     db->entries++;
1236     db->length += seq_p->n1;
1237     if (db->length > LONG_MAX) {
1238       db->length -= LONG_MAX; db->carry++;
1239     }
1240 
1241     mseq_p->m_file_p = m_fd;
1242     mseq_p->n1tot_p=NULL;
1243     mseq_p->cont = 0;
1244     seq_p->index = mseq_p->index = mseq_p->lseek = lib_cnt;
1245 #ifndef DEBUG
1246     mseq_p->libstr[0] = '\0';
1247 #else
1248 #endif
1249     seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt++]);
1250     seq_p->l_offset = 0;
1251     seq_p->l_off = 1;
1252 #if DEBUG
1253     seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
1254 #endif
1255   }
1256   m_fd->lpos = lib_cnt;
1257   cur_seqr_chain->cur_seq_cnt = i;
1258   return i;
1259 }
1260 
1261 /* **************************************************************** */
1262 /* read a DNA sequence using OID offsets */
1263 /* **************************************************************** */
1264 
1265 int
ncbl2_getlibn_o(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)1266 ncbl2_getlibn_o(unsigned char *seq,
1267 		int maxs,
1268 		char *libstr,
1269 		int n_libstr,
1270 		fseek_t *libpos,
1271 		int *lcont,
1272 		struct lmf_str *m_fd,
1273 		long *l_off)
1274 {
1275   int tpos;
1276   unsigned int t_mask, t_shift, oid_mask;
1277 
1278   /* get to the next valid pointer */
1279 
1280   for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) {
1281     t_mask = tpos / 32;
1282     t_shift = 31 - (tpos % 32);
1283     if ((oid_mask = m_fd->oid_list[t_mask])==0) {  continue; }
1284 
1285     if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
1286       if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
1287       m_fd->lpos = tpos;	/* already bumped up */
1288       m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
1289       return ncbl2_getlibn(seq, maxs, libstr, n_libstr,
1290 			   libpos, lcont, m_fd, l_off);
1291     }
1292   }
1293   return -1;
1294 }
1295 
1296 static char tmp_amb[4096];
1297 
1298 /* **************************************************************** */
1299 /* read a DNA sequence */
1300 /* **************************************************************** */
1301 
1302 int
ncbl2_getlibn(unsigned char * seq,int maxs,char * libstr,int n_libstr,fseek_t * libpos,int * lcont,struct lmf_str * m_fd,long * l_off)1303 ncbl2_getlibn(unsigned char *seq,
1304 	      int maxs,
1305 	      char *libstr,
1306 	      int n_libstr,
1307 	      fseek_t *libpos,
1308 	      int *lcont,
1309 	      struct lmf_str *m_fd,
1310 	      long *l_off)
1311 {
1312   unsigned char *sptr, *tptr, stmp;
1313   long seqcnt;
1314   int s_chunk, lib_cnt;
1315   size_t tmp;
1316   char ch;
1317   static long seq_len;
1318   static int c_len,c_pad;
1319   int c_len_set, d_len;
1320 #if defined(DEBUG) || defined(PCOMPLIB)
1321   int gi, my_db, taxid;
1322   char acc[MAX_FADL_ACC_LEN], title[MAX_UID], name[MAX_FADL_ACC_LEN];
1323 #endif
1324 
1325   /* ambiguity code adapted from NCBI Blast sources by:
1326 
1327      Ralf Jost, Dipl.-Inform.
1328      Director, Technical Bioinformatics
1329      Biomax Informatics AG
1330      ralf.jost@biomax.com
1331   */
1332 
1333   int amb_lower = *lcont * maxs;
1334   /*   int amb_upper = (*lcont + 1) * maxs - 1; */ /* not used */
1335 
1336   unsigned long x;
1337   unsigned long soff, eoff;
1338   int index;
1339 
1340   unsigned int amb_cnt = 0;
1341   unsigned int large_amb = 0;
1342 
1343   unsigned int start;
1344   unsigned int end;
1345   unsigned int amb_start;
1346 
1347   const char str_2bit[] = "ACGT";
1348   const char str_4bit[] = "-ACMGRSVTWYHKDBN";
1349 
1350   unsigned int *amb_ptr = NULL;
1351   long filepos = 0;
1352   char *mmap_pos = NULL;
1353 
1354   unsigned int i;
1355   unsigned char res;
1356   int row_len;
1357   int j, position = 0;
1358 
1359   *l_off = 1;
1360 
1361   lib_cnt = m_fd->lpos;
1362   *libpos = (fseek_t)lib_cnt;
1363   if (*lcont==0) {	/* not a continuation of previous */
1364     if (lib_cnt >= m_fd->max_cnt) return (-1);
1365     c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt];
1366     if (!m_fd->mm_flg) {
1367       /* fseek over amb_ray */
1368 	  fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0);
1369 	  m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
1370     }
1371     else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt];
1372 #if !defined(DEBUG) && !defined(PCOMPLIB)
1373     libstr[0]='\0';
1374 #else
1375     /* get the name from the header file */
1376     fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
1377 
1378     if (m_fd->bl_format_ver == FORMATDBV3) {
1379       d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1380       fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
1381     }
1382     else {
1383       d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
1384       fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
1385       parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
1386 			&gi, &my_db, acc, sizeof(acc), name, sizeof(name), title, sizeof(title), &taxid);
1387       sprintf(m_fd->tmp_buf,"gi|%d",gi);
1388       SAFE_STRNCPY(libstr,m_fd->tmp_buf,n_libstr);
1389     }
1390     libstr[n_libstr-1]='\0';
1391 #endif
1392   }			/* end of *lcont==0 */
1393 
1394   /* To avoid the situation where c_len <= 1; we must anticipate what
1395      c_len will be after this pass.  If it will be <= 64, back off this
1396      time so next time it will be > 64 */
1397 
1398   seq_len = c_len*4;
1399 
1400   if ((seq_len+4 > maxs) && (seq_len+4 - maxs  <= 256)) {
1401     /* we won't be done but we will have less than 256 to go */
1402     c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;}
1403   else c_len_set = 0;
1404 
1405   /*
1406   fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs);
1407   */
1408 
1409   /* does the rest of the sequence fit? */
1410   if (seq_len <= maxs-4 && !c_len_set) {
1411     seqcnt = c_len;
1412     if (!m_fd->mm_flg) {
1413       if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) {
1414 	fprintf(stderr,
1415 		" could not read sequence record: %s %lld %ld != %ld: %d\n",
1416 		libstr,*libpos,tmp,seqcnt,*seq);
1417 	goto error;
1418       }
1419       m_fd->bl_lib_pos += tmp;
1420       sptr = seq + seqcnt;
1421     }
1422     else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
1423 
1424     *lcont = 0;		/* this is the last chunk */
1425     // lib_cnt++;		/* increment to the next sequence */
1426     /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */
1427     c_pad = *(sptr-1);
1428     c_pad &= 0x3;	/* get the last (low) 2 bits */
1429     seq_len -= (4 - c_pad);	/* if the last 2 bits are 0, its a NULL byte */
1430   }
1431   else {	/* get the next chunk, but more to come */
1432     seqcnt = ((maxs+3)/4)-1;
1433     if (!m_fd->mm_flg) {
1434       if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) {
1435 	fprintf(stderr," could not read sequence record: %lld %ld/%ld\n",
1436 		*libpos,tmp,seqcnt);
1437 	goto error;
1438       }
1439       m_fd->bl_lib_pos += tmp;
1440       sptr = seq + seqcnt;
1441     }
1442     else {
1443       sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
1444       m_fd->mmap_addr += seqcnt;
1445     }
1446     seq_len = 4*seqcnt;
1447     c_len -= seqcnt;
1448 /*    if (c_len_set) {c_len += 64; maxs += 256;} */
1449     (*lcont)++;
1450 /*  hopefully we don't need this because of c_len -= 64. */
1451 /*
1452     if (c_len == 1) {
1453 #if !defined (USE_MMAP)
1454       c_pad = fgetc(m_fd->libf);
1455       *sptr=c_pad;
1456 #else
1457       c_pad = *m_fd->mmap_addr++;
1458       sptr = m_fd->mmap_addr;
1459 #endif
1460       c_pad &= 0x3;
1461       seq_len += c_pad;
1462       seqcnt++;
1463       lib_cnt++;
1464       *lcont = 0;
1465     }
1466 */
1467   }
1468 
1469   /* point to the last packed byte and to the end of the array
1470      seqcnt is the exact number of bytes read
1471      tptr points to the destination, use multiple of 4 to simplify math
1472      sptr points to the source, note that the last byte will be read 4 cycles
1473      before it is written
1474      */
1475 
1476   tptr = seq + 4*seqcnt;
1477   s_chunk = seqcnt/8;
1478   while (s_chunk-- > 0) {
1479     stmp = *--sptr;
1480     *--tptr = (stmp&3) +1;
1481     *--tptr = ((stmp >>= 2)&3)+1;
1482     *--tptr = ((stmp >>= 2)&3)+1;
1483     *--tptr = ((stmp >>= 2)&3)+1;
1484     stmp = *--sptr;
1485     *--tptr = (stmp&3) +1;
1486     *--tptr = ((stmp >>= 2)&3)+1;
1487     *--tptr = ((stmp >>= 2)&3)+1;
1488     *--tptr = ((stmp >>= 2)&3)+1;
1489     stmp = *--sptr;
1490     *--tptr = (stmp&3) +1;
1491     *--tptr = ((stmp >>= 2)&3)+1;
1492     *--tptr = ((stmp >>= 2)&3)+1;
1493     *--tptr = ((stmp >>= 2)&3)+1;
1494     stmp = *--sptr;
1495     *--tptr = (stmp&3) +1;
1496     *--tptr = ((stmp >>= 2)&3)+1;
1497     *--tptr = ((stmp >>= 2)&3)+1;
1498     *--tptr = ((stmp >>= 2)&3)+1;
1499     stmp = *--sptr;
1500     *--tptr = (stmp&3) +1;
1501     *--tptr = ((stmp >>= 2)&3)+1;
1502     *--tptr = ((stmp >>= 2)&3)+1;
1503     *--tptr = ((stmp >>= 2)&3)+1;
1504     stmp = *--sptr;
1505     *--tptr = (stmp&3) +1;
1506     *--tptr = ((stmp >>= 2)&3)+1;
1507     *--tptr = ((stmp >>= 2)&3)+1;
1508     *--tptr = ((stmp >>= 2)&3)+1;
1509     stmp = *--sptr;
1510     *--tptr = (stmp&3) +1;
1511     *--tptr = ((stmp >>= 2)&3)+1;
1512     *--tptr = ((stmp >>= 2)&3)+1;
1513     *--tptr = ((stmp >>= 2)&3)+1;
1514     stmp = *--sptr;
1515     *--tptr = (stmp&3) +1;
1516     *--tptr = ((stmp >>= 2)&3)+1;
1517     *--tptr = ((stmp >>= 2)&3)+1;
1518     *--tptr = ((stmp >>= 2)&3)+1;
1519   }
1520   while (tptr>seq) {
1521     stmp = *--sptr;
1522     *--tptr = (stmp&3) +1;
1523     *--tptr = ((stmp >>= 2)&3)+1;
1524     *--tptr = ((stmp >>= 2)&3)+1;
1525     *--tptr = ((stmp >>= 2)&3)+1;
1526   }
1527 
1528   /*
1529    * Ambiguity-Decoding code from
1530 
1531      Ralf Jost, Dipl.-Inform.
1532      Director, Technical Bioinformatics
1533      Biomax Informatics AG
1534      ralf.jost@biomax.com
1535 
1536      using code from NCBI Blast distribution
1537    */
1538 
1539   amb_start = m_fd->a_pos_arr[lib_cnt];
1540   end = m_fd->s_pos_arr[lib_cnt + 1];
1541 
1542   if (amb_start != end) {
1543     if (!m_fd->mm_flg) {
1544       filepos = ftell(m_fd->libf);
1545       /* find the size of the ambiguity table */
1546       if (fseek(m_fd->libf, amb_start, SEEK_SET) != 0) {
1547 	fprintf(stderr, "*** error [%s:%d] *** -- Seek amb start 0x%08x error %d\n",
1548 		__FILE__, __LINE__, amb_start, ferror(m_fd->libf));
1549       }
1550       if (fread(&amb_cnt, sizeof(unsigned int), 1, m_fd->libf) != 1) {
1551 	fprintf(stderr, "*** error [%s:%d] *** -- Read amb count error %d\n",
1552 		__FILE__, __LINE__, ferror(m_fd->libf));
1553       }
1554     } else {
1555       mmap_pos = m_fd->mmap_addr;
1556       m_fd->mmap_addr = m_fd->mmap_base + amb_start;
1557       if (readMFILE((void *)&amb_cnt, sizeof(unsigned int), 1, m_fd) != 1) {
1558 	fprintf(stderr, "*** error [%s:%d] *** -- Read amb count error %d\n",
1559 		__FILE__, __LINE__, ferror(m_fd->libf));
1560       }
1561     }
1562 
1563     amb_cnt = bl2_uint4_cvt(amb_cnt);
1564 
1565     /* if the most significant bit is set on the count, then each
1566      * correction will take two entries in the table.  the layout
1567      * is described below.
1568      */
1569     large_amb = amb_cnt >> 31;
1570     amb_cnt = amb_cnt & 0x7fffffff;
1571 
1572     /* allocate enough space for the ambiguity table */
1573     amb_ptr = (unsigned int *) malloc(amb_cnt * sizeof(unsigned int));
1574     if (amb_ptr == NULL) {
1575       fprintf(stderr, "*** error [%s:%d] malloc amb table error size %ld\n",
1576 	      __FILE__, __LINE__, amb_cnt * sizeof(unsigned int));
1577     }
1578 
1579     /* read the table */
1580     if (!m_fd->mm_flg) {
1581       if (fread((unsigned char *) amb_ptr, sizeof(unsigned int), amb_cnt, m_fd->libf)
1582 	  != amb_cnt) {
1583 	fprintf(stderr, "*** error [%s:%d] *** -- Read amb table %d error %d\n",
1584 		__FILE__, __LINE__, amb_cnt, ferror(m_fd->libf));
1585       }
1586     } else {
1587       if (readMFILE((void *) amb_ptr, sizeof(unsigned int), amb_cnt, m_fd)
1588 	  != amb_cnt) {
1589 	fprintf(stderr, "*** error [%s:%d] *** -- Read amb table %d error %d\n",
1590 		__FILE__, __LINE__, amb_cnt, ferror(m_fd->libf));
1591       }
1592     }
1593 
1594     for (index=0; index < amb_cnt; index++) {
1595       amb_ptr[index] = bl2_uint4_cvt(amb_ptr[index]);
1596     }
1597 
1598     for (i = 0; i < amb_cnt; i++) {
1599 
1600       if (large_amb) {
1601 	res = (unsigned char) (amb_ptr[i] >> 28);
1602 	row_len = (int) (amb_ptr[i] >> 16) & 0xFFF;
1603 	position = amb_ptr[i + 1];
1604       } else {
1605 	res = (unsigned char) (amb_ptr[i] >> 28);
1606 	row_len = (int) ((amb_ptr[i] >> 24) & 0xF);
1607 	position = amb_ptr[i] & 0xFFFFFF;
1608       }
1609       for (index = position, j = 0; j <= row_len; j++) {
1610 	if ((index + j >= amb_lower) && (index + j < amb_lower + 4 * seqcnt))
1611 	  seq[index + j - amb_lower] = nascii[str_4bit[res]];
1612       }
1613 
1614       if (large_amb)
1615 	i++;
1616     }
1617 
1618     if (amb_ptr != NULL)
1619       free(amb_ptr);
1620 
1621     if (!m_fd->mm_flg) {
1622       fseek(m_fd->libf, filepos, SEEK_SET);
1623     } else {
1624       m_fd->mmap_addr = mmap_pos;
1625     }
1626   }
1627 
1628   /*
1629    * End of ambiguity-decoding.
1630    */
1631 
1632   if ( *lcont == 0)
1633     lib_cnt++;
1634 
1635 
1636 
1637 
1638 
1639   /*
1640     for (sptr=seq; sptr < seq+seq_len; sptr++) {
1641     printf("%c",nt[*sptr]);
1642     if ((int)(sptr-seq) % 60 == 59) printf("\n");
1643     }
1644     printf("\n");
1645     */
1646 
1647   m_fd->lpos = lib_cnt;
1648   if (seqcnt*4 >= seq_len) {	/* there was enough room */
1649     seq[seq_len]= EOSEQ;
1650     /* printf("%d\n",seq_len); */
1651     return seq_len;
1652   }
1653   else {				/* not enough room */
1654     seq[seqcnt*4]=EOSEQ;
1655     seq_len -= 4*seqcnt;
1656     return (4*seqcnt);
1657   }
1658 
1659 error:	fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
1660   fflush(stderr);
1661   return (-1);
1662 }
1663 
1664                 /*   0     1     2     3    4     5     6    7
1665 		     8     9    10    11   12    13    14   15
1666 		     16    17 */
1667 static char
1668 *db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp",
1669 		  "pat","ref","gnl","gi","dbj","prf","pdb","tpg",
1670 		  "tpe","tpd"};
1671 
1672 /* **************************************************************** */
1673 /* read a description, position for sequence */
1674 /* **************************************************************** */
1675 void
ncbl2_ranlib(char * str,int cnt,fseek_t libpos,char * libstr,struct lmf_str * m_fd)1676 ncbl2_ranlib(char *str,
1677 	     int cnt,
1678 	     fseek_t libpos,
1679 	     char *libstr,
1680 	     struct lmf_str *m_fd)
1681 {
1682   int llen, lib_cnt;
1683   char *bp;
1684   unsigned char *my_buff=NULL;
1685   char descr[2048];
1686   unsigned char *abp;
1687   int gi, taxid;
1688   int my_db;
1689   char db[5], acc[MAX_FADL_ACC_LEN], name[MAX_FADL_ACC_LEN];
1690   char title[2048];
1691   int have_my_buff=0;
1692   int have_descr = 0;
1693 
1694   lib_cnt = (int)libpos;
1695   llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt];
1696 
1697   fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0);
1698 
1699   if (m_fd->bl_format_ver == FORMATDBV3) {
1700     if (llen >= cnt) llen = cnt-1;
1701     fread(str,(size_t)1,(size_t)(llen),m_fd->hfile);
1702   }
1703   else {
1704     if (llen >= m_fd->tmp_buf_max) {
1705       if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) {
1706 	fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen);
1707 	my_buff = (unsigned char *)m_fd->tmp_buf;
1708 	llen = m_fd->tmp_buf_max;
1709       }
1710       else have_my_buff = 1;
1711     }
1712     else {
1713       my_buff = (unsigned char *)m_fd->tmp_buf;
1714     }
1715     abp = my_buff;
1716     fread(my_buff,(size_t)1,llen,m_fd->hfile);
1717 
1718     do {
1719       abp = parse_fastadl_asn(abp, my_buff+llen,
1720 			      &gi, &my_db, acc, sizeof(acc), name, sizeof(name),
1721 			      title, sizeof(title), &taxid);
1722 
1723       if (gi > 0) {
1724 	sprintf(descr,"gi|%d|%s|%s|%s ",gi,db_type_arr[my_db],acc,name);
1725       }
1726       else {
1727 	if (acc[0] != '\0') sprintf(descr,"%s ",acc);
1728 	else descr[0] = '\0';
1729 	if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) sprintf(descr+strlen(descr),"%s ", name);
1730       }
1731       if (my_db == 0 || m_fd->pref_db < 0) {
1732 	if (!have_descr) {
1733 	  SAFE_STRNCPY(str,descr,cnt);
1734 	  have_descr = 1;
1735 	}
1736 	else {
1737 	  SAFE_STRNCAT(str,"\001",cnt);
1738 	  SAFE_STRNCAT(str,descr,cnt);
1739 	}
1740 	SAFE_STRNCAT(str,title,cnt);
1741 	if (strlen(str) >= cnt-1) break;
1742       }
1743       else if (m_fd->pref_db == my_db) {
1744 	have_descr = 1;
1745 	SAFE_STRNCPY(str,descr,cnt);
1746 	SAFE_STRNCAT(str,title,cnt);
1747 	break;
1748       }
1749     } while (abp);
1750 
1751     if (!have_descr) {
1752       SAFE_STRNCPY(str,descr,cnt);
1753       SAFE_STRNCAT(str,descr,cnt);
1754     }
1755 
1756     if (have_my_buff) free(my_buff);
1757   }
1758 
1759   str[cnt-1]='\0';
1760 
1761   bp = str;
1762   while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';}
1763 
1764   if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0);
1765 
1766   m_fd->lpos = lib_cnt;
1767   m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
1768 }
1769 
bl2_uint4_cvt(unsigned int val)1770 unsigned int bl2_uint4_cvt(unsigned int val)
1771 {
1772   unsigned int res;
1773 #ifdef IS_BIG_ENDIAN
1774   return val;
1775 #else /* it better be LITTLE_ENDIAN */
1776   res = ((val&255)*256)+ ((val>>8)&255);
1777   res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1778   return res;
1779 #endif
1780 }
1781 
bl2_long4_cvt(long val)1782 unsigned int bl2_long4_cvt(long val)
1783 {
1784   int val4;
1785   unsigned int res;
1786 #ifdef IS_BIG_ENDIAN
1787   val4 = val;
1788   return val4;
1789 #else /* it better be LITTLE_ENDIAN */
1790   res = ((val&255)*256)+ ((val>>8)&255);
1791   res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1792   return res;
1793 #endif
1794 }
1795 
bl2_long8_cvt(uint64_t val)1796 uint64_t bl2_long8_cvt(uint64_t val)
1797 {
1798   uint64_t res;
1799 #ifdef IS_BIG_ENDIAN
1800   return val;
1801 #else /* it better be LITTLE_ENDIAN */
1802   res = ((val&255)*256)+ ((val>>8)&255);
1803   res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1804 #ifdef BIG_LIB64
1805   res = (res<<16) + (((val>>32)&255)*256) + ((val>>40)&255);
1806   res = (res<<16) + (((val>>48)&255)*256) + ((val>>56)&255);
1807 #else
1808   fprintf(stderr,"Cannot use bl2_long8_cvt without 64-bit longs\n");
1809   exit(1);
1810 #endif
1811   return res;
1812 #endif
1813 }
1814 
src_int4_read(FILE * fd,int * val)1815 void src_int4_read(FILE *fd,  int *val)
1816 {
1817 #ifdef IS_BIG_ENDIAN
1818   fread((char *)val,(size_t)4,(size_t)1,fd);
1819 #else
1820   unsigned char b[4];
1821 
1822   fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1823   *val = 0;
1824   *val = (int)(((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1825 #endif
1826 }
1827 
src_long4_read(FILE * fd,long * valp)1828 void src_long4_read(FILE *fd,  long *valp)
1829 {
1830   int val4;
1831 #ifdef IS_BIG_ENDIAN
1832   fread(&val4,(size_t)4,(size_t)1,fd);
1833   *valp = val4;
1834 #else
1835   unsigned char b[4];
1836 
1837   fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1838   val4 = (int)(((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1839   *valp = val4;
1840 #endif
1841 }
1842 
src_uint4_read(FILE * fd,unsigned int * valp)1843 void src_uint4_read(FILE *fd,  unsigned int *valp)
1844 {
1845 #ifdef IS_BIG_ENDIAN
1846   fread(valp,(size_t)4,(size_t)1,fd);
1847 #else
1848   unsigned char b[4];
1849 
1850   fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1851   *valp = 0;
1852   *valp = (unsigned int)(((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1853 #endif
1854 }
1855 
src_long8_read(FILE * fd,int64_t * val)1856 void src_long8_read(FILE *fd,  int64_t *val)
1857 {
1858 #ifdef IS_BIG_ENDIAN
1859   fread((void *)val,(size_t)8,(size_t)1,fd);
1860 #else
1861   int val_h;
1862   unsigned int val_l;
1863   unsigned char b[8];
1864 
1865   fread((char *)&b[0],(size_t)1,(size_t)8,fd);
1866 
1867   /* modified to work with both 32-bit and 64-bit native ints */
1868   val_h = (((((b[0]<<8)+b[1])<<8)+b[2])<<8)+b[3];
1869   val_l = (((((b[4]<<8)+b[5])<<8)+b[6])<<8)+b[7];
1870   *val = val_h * 4294967296LL + val_l;
1871 
1872   /*
1873   *val = 0;
1874   *val = (long)(((((((b[0]<<8)+b[1]<<8)+b[2]<<8)
1875 		  +b[3]<<8)+b[4]<<8)+b[5]<<8)
1876 		+b[6]<<8)+b[7];
1877   */
1878 #endif
1879 }
1880 
ncbi_long8_read(FILE * fd,int64_t * val)1881 void ncbi_long8_read(FILE *fd,  int64_t *val)
1882 {
1883   unsigned char b[8];
1884 
1885   fread((char *)&b[0],(size_t)1,(size_t)8,fd);
1886   *val = 0;
1887   *val = (long)(((((((((((((b[7]<<8)+b[6])<<8)+b[5])<<8)+b[4])<<8)+b[3])<<8)+b[2])<<8)+b[1])<<8)+b[0];
1888 }
1889 
src_char_read(FILE * fd,char * val)1890 void src_char_read(FILE *fd, char *val)
1891 {
1892   fread(val,(size_t)1,(size_t)1,fd);
1893 }
1894 
src_fstr_read(FILE * fd,char * val,int slen)1895 void src_fstr_read(FILE *fd, char *val,  int slen)
1896 {
1897   fread(val,(size_t)slen,(size_t)1,fd);
1898 }
1899 
1900 void
newname(char * nname,char * oname,char * suff,int maxn)1901 newname(char *nname, char *oname, char *suff, int maxn)
1902 {
1903   SAFE_STRNCPY(nname,oname,maxn);
1904   SAFE_STRNCAT(nname,".",maxn);
1905   SAFE_STRNCAT(nname,suff,maxn);
1906 }
1907 
1908 #define ASN_SEQ 0x30
1909 #define ASN_IS_BOOL 1
1910 #define ASN_IS_INT 2
1911 #define ASN_IS_STR 26
1912 #define ASN_TYPE_MASK 31
1913 
1914 unsigned char *
get_asn_int(unsigned char * abp,int * val)1915 get_asn_int(unsigned char *abp, int *val) {
1916 
1917   int v_len, v;
1918 
1919   v = 0;
1920   if (*abp++ != ASN_IS_INT) { /* check for int */
1921     fprintf(stderr,"*** error [%s:%d] -- int missing\n",__FILE__, __LINE__);
1922   }
1923   else {
1924     v_len = *abp++;
1925     while (v_len-- > 0) {
1926       v *= 256;
1927       v += *abp++;
1928     }
1929     abp += 2;	/* skip over null's */
1930   }
1931   *val = v;
1932   return abp;
1933 }
1934 
1935 unsigned char *
get_asn_text(unsigned char * abp,char * text,int t_len)1936 get_asn_text(unsigned char *abp, char *text, int t_len) {
1937   int tch, at_len;
1938 
1939   text[0] = '\0';
1940   if (*abp++ != ASN_IS_STR) { /* check for str */
1941     fprintf(stderr,"*** error [%s:%d] - str missing\n",__FILE__,__LINE__);
1942   }
1943   else {
1944     if ((tch = *abp++) > 128) {	/* string length is in next bytes */
1945       tch &= 0x7f;	/* get number of bytes for len */
1946       at_len = 0;
1947       while (tch-- > 0) { at_len = (at_len << 8) + *abp++;}
1948     }
1949     else {
1950       at_len = tch;
1951     }
1952 
1953     if ( at_len < t_len-1) {
1954       memcpy(text, abp, at_len);
1955       text[at_len] = '\0';
1956     }
1957     else {
1958       memcpy(text, abp, t_len-1);
1959       text[t_len-1] = '\0';
1960     }
1961     abp += at_len + 2;
1962   }
1963   return abp;
1964 }
1965 
1966 /* something to try to skip over stuff we don't want */
1967 unsigned char *
get_asn_junk(unsigned char * abp)1968 get_asn_junk(unsigned char *abp) {
1969 
1970   int seq_cnt = 0;
1971   int tmp;
1972   char string[256];
1973 
1974   while (*abp) {
1975     if ( *abp  == ASN_SEQ) { abp += 2; seq_cnt++;}
1976     else if ( *abp == ASN_IS_BOOL ) {abp = get_asn_int(abp, &tmp);}
1977     else if ( *abp == ASN_IS_INT ) {abp = get_asn_int(abp, &tmp);}
1978     else if ( *abp == ASN_IS_STR ) {abp = get_asn_text(abp, string, sizeof(string)-1);}
1979     else { abp += 2;}
1980   }
1981 
1982   while (seq_cnt-- > 0) abp += 2;
1983   return abp;
1984 }
1985 
1986 #define ASN_FADL_TITLE 0xa0	/* \240 160 */
1987 #define ASN_FADL_SEQID 0xa1	/* \241 161 */
1988 #define ASN_FADL_TAXID 0xa2	/* \242 162 */
1989 #define ASN_FADL_MEMBERS 0xa3	/* \243 163 */
1990 #define ASN_FADL_LINKS 0xa4	/* \244 164 */
1991 #define ASN_FADL_OTHER 0xa5	/* \245 165 */
1992 #define ASN_FADL_GI    171
1993 
1994 #define ASN_FADL_TEXTSEQ_ID 0xa4  /* \244 164 */
1995 #define ASN_FADL_OTHERSEQ_ID 0xa5  /* \245 164 */
1996 
1997 /* from seq.asn::Textseq-id/Textannot-id */
1998 #define ASN_TEXTSEQ_ID_NAME 0xa0	/* \240 160 */
1999 #define ASN_TEXTSEQ_ID_ACC  0xa1	/* \241 161 */
2000 #define ASN_TEXTSEQ_ID_REL  0xa2	/* \242 162 */
2001 #define ASN_TEXTSEQ_ID_VER  0xa3	/* \243 163 */
2002 
2003 unsigned char *
get_asn_textseq_id(unsigned char * abp,char * name,size_t name_len,char * acc,size_t acc_len)2004 get_asn_textseq_id(unsigned char *abp,
2005 		   char *name, size_t name_len,  char *acc, size_t acc_len)
2006 {
2007   char release[20], ver_str[10];
2008   int version;
2009   int seqcnt = 0;
2010 
2011   ver_str[0]='\0';
2012 
2013   if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
2014 
2015   while (*abp) {
2016     switch (*abp) {
2017     case ASN_TEXTSEQ_ID_NAME :
2018       abp = get_asn_text(abp+2, name, name_len);
2019       break;
2020     case ASN_TEXTSEQ_ID_ACC :
2021       abp = get_asn_text(abp+2, acc, acc_len);
2022       break;
2023     case ASN_TEXTSEQ_ID_REL :
2024       abp = get_asn_text(abp+2, release, sizeof(release));
2025       break;
2026     case ASN_TEXTSEQ_ID_VER :
2027       abp = get_asn_int(abp+2, &version);
2028       sprintf(ver_str,".%d",version);
2029       break;
2030     default: abp += 2;
2031     }
2032   }
2033   while (seqcnt-- > 0) abp += 4;
2034   strncat(acc,ver_str,acc_len-strlen(acc));
2035   acc[19]='\0';
2036   return abp;	/* skip 2 NULL's */
2037 }
2038 
2039 unsigned char *
get_asn_local_id(unsigned char * abp,char * acc,size_t acc_len)2040 get_asn_local_id(unsigned char *abp, char *acc, size_t acc_len)
2041 {
2042   int seqcnt = 0;
2043 
2044   if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
2045 
2046   abp = get_asn_text(abp+2, acc, acc_len);
2047 
2048   while (seqcnt-- > 0) abp += 4;
2049   acc[acc_len-1]='\0';
2050   return abp+2;	/* skip 2 NULL's */
2051 }
2052 
2053 unsigned char *
get_asn_dbtag(unsigned char * abp,char * name,size_t name_len,char * str,size_t str_len,int * id_p)2054 get_asn_dbtag(unsigned char *abp, char *name, size_t name_len, char *str, size_t str_len, int *id_p) {
2055 
2056   if (*abp == ASN_SEQ) { abp += 2;}
2057 
2058   if (*abp == 0xa0) {  /* get db */
2059     abp = get_asn_text(abp+2, name, name_len);
2060   }
2061   else {
2062     fprintf(stderr,"*** error [%s:%d] -  missing dbtag:db %d %d\n",__FILE__, __LINE__, abp[0],abp[1]);
2063     abp += 2;
2064   }
2065 
2066   if (*abp == 0xa1) {  /* get tag */
2067     abp += 2;
2068     abp += 2; /* skip over id */
2069     if (*abp == 2) abp = get_asn_int(abp,id_p);
2070     else abp = get_asn_text(abp, str, str_len);
2071   }
2072   else {
2073     fprintf(stderr,"*** error [%s:%d] - missing dbtag:tag %2x %2x\n",__FILE__, __LINE__, abp[0],abp[1]);
2074     abp += 2;
2075   }
2076   return abp+2;	/* skip 2 NULL's */
2077 }
2078 
2079 #define ASN_DATE_STR 0xa0
2080 #define ASN_DATE_STD 0xa1
2081 #define ASN_DATE_STD_YR 0xa0
2082 #define ASN_DATE_STD_MO 0xa1
2083 #define ASN_DATE_STD_DAY 0xa2
2084 
2085 unsigned char *
get_asn_date_std(unsigned char * abp,char * date)2086 get_asn_date_std(unsigned char *abp, char *date) {
2087   int seq_cnt=0;
2088   int year, month, day;
2089 
2090   year = month = day = 0;
2091 
2092   while (*abp == ASN_SEQ) { abp+=2; seq_cnt++;}
2093 
2094   while (*abp) {
2095     switch (*abp) {
2096     case ASN_DATE_STD_YR :
2097       abp = get_asn_int(abp+2, &year);
2098       break;
2099     case ASN_DATE_STD_MO :
2100       abp = get_asn_int(abp+2, &month);
2101       break;
2102     case ASN_DATE_STD_DAY :
2103       abp = get_asn_int(abp+2, &day);
2104       break;
2105     default:
2106       fprintf(stderr, "*** error [%s:%d] - incorrect date-std code: %0x1 %0x1\n",
2107 	      __FILE__, __LINE__, abp[0], abp[1]);
2108     }
2109   }
2110   sprintf(date, "%02d-%02d-%02d", year, month, day);
2111 
2112   while (seq_cnt-- > 0) { abp += 4;}
2113 
2114   return abp+2;
2115 }
2116 
2117 unsigned char *
get_asn_date(unsigned char * abp,char * date,size_t date_len)2118 get_asn_date(unsigned char *abp, char *date, size_t date_len) {
2119 
2120   if (*abp == ASN_DATE_STR) {
2121     abp = get_asn_text(abp, date, date_len);
2122   }
2123   else if (*abp == ASN_DATE_STD) {
2124     abp = get_asn_date_std(abp+2, date);
2125   }
2126   else {
2127     fprintf(stderr, "*** error [%s:%d] - incorrect date code: %0x1 %0x1\n",
2128 	    __FILE__, __LINE__, abp[0], abp[1]);
2129   }
2130   return abp+2;
2131 }
2132 
2133 unsigned char *
get_asn_pdb_id(unsigned char * abp,char * acc,size_t acc_len,char * chain,size_t chain_len)2134 get_asn_pdb_id(unsigned char *abp, char *acc, size_t acc_len, char *chain, size_t chain_len)
2135 {
2136   int ichain, seq_cnt=0;
2137   char dummy[40];
2138 
2139   if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;}
2140 
2141   while (*abp) {
2142     switch (*abp) {
2143     case 0: abp += 2; break;
2144     case 0xa0:	/* mol */
2145       abp = get_asn_text(abp+2, acc, 20);
2146       break;
2147     case 0xa1:	/* chain */
2148       abp = get_asn_int(abp+2, &ichain);
2149       chain[0] = ichain;
2150       chain[1] = '\0';
2151       break;
2152     case 0xa2:	/* release */
2153       abp = get_asn_date(abp+2, dummy, sizeof(dummy));
2154       break;
2155     default: abp+=2;
2156     }
2157   }
2158   while (seq_cnt-- > 0) {abp += 4;}
2159   return abp;
2160 }
2161 
2162 unsigned char *
get_asn_seqid_ori(unsigned char * abp,int * gi_p,int * db,char * acc,size_t acc_len,char * name,size_t name_len)2163 get_asn_seqid_ori(unsigned char *abp, int *gi_p, int *db, char *acc, size_t acc_len, char *name, size_t name_len)
2164 {
2165   int db_type, itmp, seq_cnt=0;
2166 
2167   *gi_p = 0;
2168 
2169   if (*abp != ASN_SEQ) {
2170     fprintf(stderr, "*** error [%s:%d] - seqid - missing SEQ 1: %2x %2x\n",
2171 	    __FILE__, __LINE__, abp[0], abp[1]);
2172     return abp;
2173   }
2174   else { abp += 2; seq_cnt++;}
2175 
2176   db_type = (*abp & ASN_TYPE_MASK);
2177 
2178   if (db_type == 11) { /* gi */
2179     abp = get_asn_int(abp+2,gi_p);
2180   }
2181 
2182   while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}
2183 
2184   db_type = (*abp & ASN_TYPE_MASK);
2185   if (db_type > 17) {db_type = 0;}
2186   *db = db_type;
2187 
2188   switch(db_type) {
2189   case 0:
2190     abp = get_asn_local_id(abp, acc, acc_len);
2191     break;
2192   case 1:
2193   case 2:
2194     abp = get_asn_int(abp+2,&itmp);
2195     abp += 2;
2196     break;
2197   case 11:
2198     abp = get_asn_int(abp+2,&itmp);
2199     break;
2200   case 4:
2201   case 5:
2202   case 6:
2203   case 7:
2204   case 9:
2205   case 12:
2206   case 13:
2207   case 15:
2208   case 16:
2209   case 17:
2210     abp = get_asn_textseq_id(abp,name,name_len, acc, acc_len);
2211     break;
2212   case 10:
2213     abp = get_asn_dbtag(abp+2,name,name_len, acc, acc_len, &itmp);
2214   case 14:
2215     abp = get_asn_pdb_id(abp,acc,acc_len,name,name_len);
2216     break;
2217   default: abp += 2;
2218   }
2219 
2220   while (seq_cnt-- > 0) { abp += 4;}
2221   return abp; /* skip over 2 NULL's */
2222 }
2223 
2224 unsigned char *
get_asn_db_info(unsigned char * abp,int db_type,int * gi_p,char * name,size_t name_len,char * acc,size_t acc_len)2225 get_asn_db_info(unsigned char *abp, int db_type, int *gi_p, char *name, size_t name_len, char *acc, size_t acc_len) {
2226   int seq_cnt = 0, itmp;
2227 
2228   if (db_type == 11) {
2229     abp = get_asn_int(abp, gi_p);
2230     return abp;
2231   }
2232 
2233   while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}
2234 
2235   switch(db_type) {
2236   case 0:
2237     abp = get_asn_local_id(abp, acc, acc_len);
2238     break;
2239   case 1:
2240   case 2:
2241     abp = get_asn_int(abp,gi_p);
2242     break;
2243   case 11:
2244     abp = get_asn_int(abp+2,gi_p);
2245     break;
2246   case 4:
2247   case 5:
2248   case 6:
2249   case 7:
2250   case 9:
2251   case 12:
2252   case 13:
2253   case 15:
2254   case 16:
2255   case 17:
2256     abp = get_asn_textseq_id(abp,name,name_len, acc, acc_len);
2257     break;
2258   case 10:
2259     abp = get_asn_dbtag(abp,name,name_len, acc, acc_len, &itmp);
2260     break;
2261   case 14:
2262     abp = get_asn_pdb_id(abp,acc,acc_len,name,name_len);
2263     break;
2264   default: abp += 2;
2265   }
2266 
2267   while (seq_cnt-- > 0) { abp += 4;}
2268   return abp; /* skip over 2 NULL's */
2269 }
2270 
2271 
2272 unsigned char *
get_asn_seqid(unsigned char * abp,int * gi_p,int * db,char * acc,size_t acc_len,char * name,size_t name_len)2273 get_asn_seqid(unsigned char *abp, int *gi_p, int *db, char *acc, size_t acc_len, char *name, size_t name_len)
2274 {
2275   int db_type, itmp, seq_cnt=0;
2276 
2277   *gi_p = 0;
2278 
2279   if (*abp != ASN_SEQ) {
2280     fprintf(stderr, "*** error [%s:%d] - get_asn_seqid - missing SEQ 1: %2x %2x\n",
2281 	    __FILE__, __LINE__, abp[0], abp[1]);
2282     return abp;
2283   }
2284   else { abp += 2; seq_cnt++;}
2285 
2286   while (*abp) {
2287     if (*abp == ASN_FADL_TEXTSEQ_ID) {
2288       abp = get_asn_textseq_id(abp+2, name, name_len, acc, acc_len );
2289     }
2290     else if (*abp == ASN_FADL_OTHERSEQ_ID) {
2291       abp = get_asn_textseq_id(abp+2, name, name_len, acc, acc_len );
2292     }
2293     else if ((db_type = (*abp & ASN_TYPE_MASK)) < 17) {
2294       abp = get_asn_db_info(abp+2, db_type, gi_p, name, name_len, acc, acc_len);
2295       *db = db_type;
2296     }
2297     else {
2298       fprintf(stderr,"*** error [%s:%d] -- get_asn_seqid not TEXTSEQ/not GI: %2x %2x\n",
2299 	      __FILE__, __LINE__,abp[0], abp[1]);
2300       return abp;
2301     }
2302   }
2303 
2304   while (seq_cnt-- > 0) { abp += 4;}
2305   return abp; /* skip over 2 NULL's */
2306 }
2307 
2308 unsigned char *
get_asn_seqid_other(unsigned char * abp,int * gi_p,char * acc,size_t acc_len,char * name,size_t name_len)2309 get_asn_seqid_other(unsigned char *abp, int *gi_p, char *acc, size_t acc_len, char *name, size_t name_len)
2310 {
2311   int db_type, itmp, seq_cnt=0;
2312 
2313   *gi_p = 0;
2314   name[0] = acc[0] = '\0';
2315 
2316   if (*abp != ASN_SEQ) {
2317     fprintf(stderr, "*** error [%s:%d] - get_asn_seqid - missing SEQ 1: %2x %2x\n",
2318 	    __FILE__, __LINE__, abp[0], abp[1]);
2319     return abp;
2320   }
2321   else { abp += 2; seq_cnt++;}
2322 
2323   while (*abp) {
2324     if (*abp == ASN_TEXTSEQ_ID_ACC) {
2325       abp = get_asn_text(abp+2, acc, acc_len );
2326     }
2327     if (*abp == ASN_TEXTSEQ_ID_VER) {
2328       abp = get_asn_text(abp+2, acc, acc_len );
2329     }
2330     else if (*abp == ASN_TEXTSEQ_ID_NAME) {
2331       abp = get_asn_text(abp+2, name, name_len );
2332     }
2333     else if (*abp == ASN_IS_INT) {
2334       abp = get_asn_int(abp, &itmp );
2335     }
2336     else {
2337       fprintf(stderr,"*** error [%s:%d] -- get_asn_seqid not SEQID_ACC: %2x %2x\n",
2338 	      __FILE__, __LINE__,abp[0], abp[1]);
2339       return abp;
2340     }
2341   }
2342 
2343   if (*abp == ASN_FADL_GI) {
2344     abp = get_asn_int(abp+2, gi_p);
2345   }
2346 
2347   while (seq_cnt-- > 0) { abp += 4;}
2348   return abp; /* skip over 2 NULL's */
2349 }
2350 
2351 
2352 unsigned char *
parse_fastadl_asn(unsigned char * asn_buff,unsigned char * asn_max,int * gi_p,int * db,char * acc,size_t acc_len,char * name,size_t name_len,char * title,size_t t_len,int * taxid_p)2353 parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
2354 		  int *gi_p, int *db,  char *acc, size_t acc_len,
2355 		  char *name, size_t name_len,
2356 		  char *title, size_t t_len, int *taxid_p) {
2357   unsigned char *abp;
2358   int this_db, itmp;
2359   int seq_cnt = 0;
2360 
2361   acc[0] = name[0] = db[0] = title[0] = '\0';
2362 
2363   abp = asn_buff;
2364   while ( abp < asn_max && *abp) {
2365     if (*abp == ASN_SEQ) { abp += 2; seq_cnt++; }
2366     else if (*abp == ASN_FADL_TITLE) {
2367       abp = get_asn_text(abp+2, title, t_len);
2368     }
2369     else if (*abp == ASN_FADL_SEQID ) {
2370       abp = get_asn_seqid(abp+2, gi_p, db, acc, acc_len, name, name_len);
2371     }
2372     else if (*abp == ASN_FADL_TAXID ) {
2373       abp = get_asn_int(abp+2, taxid_p);
2374     }
2375     else if (*abp == ASN_FADL_MEMBERS) {
2376       abp = get_asn_junk(abp+2);
2377       break;
2378     }
2379     else if (*abp == ASN_FADL_LINKS ) {
2380       abp = get_asn_junk(abp+2);
2381       break;
2382     }
2383     else if (*abp == ASN_FADL_OTHER ) { /* possibly here for seqid without name */
2384       abp = get_asn_seqid_other(abp+2, gi_p, acc, acc_len, name, name_len);
2385     }
2386     else {
2387       /*       fprintf(stderr, " Error - missing ASN.1 %2x:%2x:%2x:%2x\n",
2388 	       abp[-2],abp[-1],abp[0],abp[1]); */
2389       abp = get_asn_junk(abp);
2390       /* something is broken, give up */
2391       break;
2392     }
2393   }
2394   while (abp < asn_max && *abp == '\0'  ) abp++;
2395   if (abp >= asn_max) return NULL;
2396   else return abp;
2397 }
2398 
2399 
2400 void
parse_pal(char * dname,char * msk_name,int * oid_seqs,int * max_oid,FILE * fd)2401 parse_pal(char *dname, char *msk_name,
2402 	  int *oid_seqs, int *max_oid,
2403 	  FILE *fd) {
2404 
2405   char line[MAX_STR];
2406 
2407   while (fgets(line,sizeof(line),fd)) {
2408     if (line[0] == '#') continue;
2409 
2410     if (strncmp(line, "DBLIST", 6)==0) {
2411       sscanf(line+7,"%s",dname);
2412     }
2413     else if (strncmp(line, "OIDLIST", 7)==0) {
2414       sscanf(line+8,"%s",msk_name);
2415     }
2416     else if (strncmp(line, "NSEQ", 4)==0) {
2417       sscanf(line+5,"%d",oid_seqs);
2418     }
2419     else if (strncmp(line, "MAXOID", 6)==0) {
2420       sscanf(line+7,"%d",max_oid);
2421     }
2422   }
2423 }
2424 
2425 /* part of new ambiguity code */
readMFILE(void * buffer,size_t size,int nitems,struct lmf_str * m_fd)2426 int readMFILE (void *buffer, size_t size, int nitems, struct lmf_str *m_fd) {
2427   register size_t  diff, len;
2428 
2429   if (m_fd == NULL)
2430     return 0;
2431 
2432   if (m_fd->mm_flg) {
2433     len = size * nitems;
2434     memcpy((void *) buffer, (void *) m_fd->mmap_addr, len);
2435     m_fd->mmap_addr += len;
2436     return nitems;
2437   }
2438   else {
2439     return fread((unsigned char *)buffer, size, nitems, m_fd->libf);
2440   }
2441 }
2442